diff options
Diffstat (limited to 'pkg/sentry')
296 files changed, 5260 insertions, 2242 deletions
diff --git a/pkg/sentry/arch/fpu/BUILD b/pkg/sentry/arch/fpu/BUILD index 6cdd21b1b..1f371e513 100644 --- a/pkg/sentry/arch/fpu/BUILD +++ b/pkg/sentry/arch/fpu/BUILD @@ -9,6 +9,7 @@ go_library( "fpu_amd64.go", "fpu_amd64.s", "fpu_arm64.go", + "fpu_unsafe.go", ], visibility = ["//:sandbox"], deps = [ diff --git a/pkg/sentry/arch/fpu/fpu.go b/pkg/sentry/arch/fpu/fpu.go index 867d309a3..62bde19d3 100644 --- a/pkg/sentry/arch/fpu/fpu.go +++ b/pkg/sentry/arch/fpu/fpu.go @@ -17,7 +17,6 @@ package fpu import ( "fmt" - "reflect" ) // State represents floating point state. @@ -40,15 +39,3 @@ type ErrLoadingState struct { func (e ErrLoadingState) Error() string { return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supportedFeatures, e.savedFeatures) } - -// alignedBytes returns a slice of size bytes, aligned in memory to the given -// alignment. This is used because we require certain structures to be aligned -// in a specific way (for example, the X86 floating point data). -func alignedBytes(size, alignment uint) []byte { - data := make([]byte, size+alignment-1) - offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment)) - if offset == 0 { - return data[:size:size] - } - return data[alignment-offset:][:size:size] -} diff --git a/pkg/sentry/arch/fpu/fpu_unsafe.go b/pkg/sentry/arch/fpu/fpu_unsafe.go new file mode 100644 index 000000000..c91dc99be --- /dev/null +++ b/pkg/sentry/arch/fpu/fpu_unsafe.go @@ -0,0 +1,31 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fpu + +import ( + "unsafe" +) + +// alignedBytes returns a slice of size bytes, aligned in memory to the given +// alignment. This is used because we require certain structures to be aligned +// in a specific way (for example, the X86 floating point data). +func alignedBytes(size, alignment uint) []byte { + data := make([]byte, size+alignment-1) + offset := uint(uintptr(unsafe.Pointer(&data[0])) % uintptr(alignment)) + if offset == 0 { + return data[:size:size] + } + return data[alignment-offset:][:size:size] +} diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 7ee237c9f..cfb33a398 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,17 +1,25 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") package(licenses = ["notice"]) +proto_library( + name = "control", + srcs = ["control.proto"], + visibility = ["//visibility:public"], +) + go_library( name = "control", srcs = [ "control.go", + "events.go", "fs.go", "lifecycle.go", "logging.go", "pprof.go", "proc.go", "state.go", + "usage.go", ], visibility = [ "//:sandbox", @@ -19,6 +27,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/eventchannel", "//pkg/fd", "//pkg/log", "//pkg/sentry/fdimport", @@ -39,6 +48,7 @@ go_library( "//pkg/tcpip/link/sniffer", "//pkg/urpc", "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/control/control.proto b/pkg/sentry/control/control.proto new file mode 100644 index 000000000..72dda3fbc --- /dev/null +++ b/pkg/sentry/control/control.proto @@ -0,0 +1,40 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// ControlConfig configures the permission of controls. +message ControlConfig { + // Names for individual control URPC service objects. + // Any new service object that should be given conditional access should be + // named here and conditionally added based on presence in allowed_controls. + enum Endpoint { + UNKNOWN = 0; + EVENTS = 1; + FS = 2; + LIFECYCLE = 3; + LOGGING = 4; + PROFILE = 5; + USAGE = 6; + PROC = 7; + STATE = 8; + DEBUG = 9; + } + + // allowed_controls represents which endpoints may be registered to the + // server. + repeated Endpoint allowed_controls = 1; +} diff --git a/pkg/sentry/control/events.go b/pkg/sentry/control/events.go new file mode 100644 index 000000000..92e437ae7 --- /dev/null +++ b/pkg/sentry/control/events.go @@ -0,0 +1,65 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "errors" + "fmt" + + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/urpc" +) + +// EventsOpts are the arguments for eventchannel-related commands. +type EventsOpts struct { + urpc.FilePayload +} + +// Events is the control server state for eventchannel-related commands. +type Events struct { + emitter eventchannel.Emitter +} + +// AttachDebugEmitter receives a connected unix domain socket FD from the client +// and establishes it as a new emitter for the sentry eventchannel. Any existing +// emitters are replaced on a subsequent attach. +func (e *Events) AttachDebugEmitter(o *EventsOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errors.New("no output writer provided") + } + + sock, err := o.ReleaseFD(0) + if err != nil { + return err + } + sockFD := sock.Release() + + // SocketEmitter takes ownership of sockFD. + emitter, err := eventchannel.SocketEmitter(sockFD) + if err != nil { + return fmt.Errorf("failed to create SocketEmitter for FD %d: %v", sockFD, err) + } + + // If there is already a debug emitter, close the old one. + if e.emitter != nil { + e.emitter.Close() + } + + e.emitter = eventchannel.DebugEmitterFrom(emitter) + + // Register the new stream destination. + eventchannel.AddEmitter(e.emitter) + return nil +} diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 2f3664c57..f721b7236 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -26,6 +26,23 @@ import ( "gvisor.dev/gvisor/pkg/urpc" ) +const ( + // DefaultBlockProfileRate is the default profiling rate for block + // profiles. + // + // The default here is 10%, which will record a stacktrace 10% of the + // time when blocking occurs. Since these events should not be super + // frequent, we expect this to achieve a reasonable balance between + // collecting the data we need and imposing a high performance cost + // (e.g. skewing even the CPU profile). + DefaultBlockProfileRate = 10 + + // DefaultMutexProfileRate is the default profiling rate for mutex + // profiles. Like the block rate above, we use a default rate of 10% + // for the same reasons. + DefaultMutexProfileRate = 10 +) + // Profile includes profile-related RPC stubs. It provides a way to // control the built-in runtime profiling facilities. // @@ -175,12 +192,8 @@ func (p *Profile) Block(o *BlockProfileOpts, _ *struct{}) error { defer p.blockMu.Unlock() // Always set the rate. We then wait to collect a profile at this rate, - // and disable when we're done. Note that the default here is 10%, which - // will record a stacktrace 10% of the time when blocking occurs. Since - // these events should not be super frequent, we expect this to achieve - // a reasonable balance between collecting the data we need and imposing - // a high performance cost (e.g. skewing even the CPU profile). - rate := 10 + // and disable when we're done. + rate := DefaultBlockProfileRate if o.Rate != 0 { rate = o.Rate } @@ -220,9 +233,8 @@ func (p *Profile) Mutex(o *MutexProfileOpts, _ *struct{}) error { p.mutexMu.Lock() defer p.mutexMu.Unlock() - // Always set the fraction. Like the block rate above, we use - // a default rate of 10% for the same reasons. - fraction := 10 + // Always set the fraction. + fraction := DefaultMutexProfileRate if o.Fraction != 0 { fraction = o.Fraction } diff --git a/pkg/sentry/control/usage.go b/pkg/sentry/control/usage.go new file mode 100644 index 000000000..cc78d3f45 --- /dev/null +++ b/pkg/sentry/control/usage.go @@ -0,0 +1,183 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "fmt" + "os" + "runtime" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/urpc" +) + +// Usage includes usage-related RPC stubs. +type Usage struct { + Kernel *kernel.Kernel +} + +// MemoryUsageOpts contains usage options. +type MemoryUsageOpts struct { + // Full indicates that a full accounting should be done. If Full is not + // specified, then a partial accounting will be done, and Unknown will + // contain a majority of memory. See Collect for more information. + Full bool `json:"Full"` +} + +// MemoryUsage is a memory usage structure. +type MemoryUsage struct { + Unknown uint64 `json:"Unknown"` + System uint64 `json:"System"` + Anonymous uint64 `json:"Anonymous"` + PageCache uint64 `json:"PageCache"` + Mapped uint64 `json:"Mapped"` + Tmpfs uint64 `json:"Tmpfs"` + Ramdiskfs uint64 `json:"Ramdiskfs"` + Total uint64 `json:"Total"` +} + +// MemoryUsageFileOpts contains usage file options. +type MemoryUsageFileOpts struct { + // Version is used to ensure both sides agree on the format of the + // shared memory buffer. + Version uint64 `json:"Version"` +} + +// MemoryUsageFile contains the file handle to the usage file. +type MemoryUsageFile struct { + urpc.FilePayload +} + +// UsageFD returns the file that tracks the memory usage of the application. +func (u *Usage) UsageFD(opts *MemoryUsageFileOpts, out *MemoryUsageFile) error { + // Only support version 1 for now. + if opts.Version != 1 { + return fmt.Errorf("unsupported version requested: %d", opts.Version) + } + + mf := u.Kernel.MemoryFile() + *out = MemoryUsageFile{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{ + usage.MemoryAccounting.File, + mf.File(), + }, + }, + } + + return nil +} + +// Collect returns memory used by the sandboxed application. +func (u *Usage) Collect(opts *MemoryUsageOpts, out *MemoryUsage) error { + if opts.Full { + // Ensure everything is up to date. + if err := u.Kernel.MemoryFile().UpdateUsage(); err != nil { + return err + } + + // Copy out a snapshot. + snapshot, total := usage.MemoryAccounting.Copy() + *out = MemoryUsage{ + System: snapshot.System, + Anonymous: snapshot.Anonymous, + PageCache: snapshot.PageCache, + Mapped: snapshot.Mapped, + Tmpfs: snapshot.Tmpfs, + Ramdiskfs: snapshot.Ramdiskfs, + Total: total, + } + } else { + // Get total usage from the MemoryFile implementation. + total, err := u.Kernel.MemoryFile().TotalUsage() + if err != nil { + return err + } + + // The memory accounting is guaranteed to be accurate only when + // UpdateUsage is called. If UpdateUsage is not called, then only Mapped + // will be up-to-date. + snapshot, _ := usage.MemoryAccounting.Copy() + *out = MemoryUsage{ + Unknown: total, + Mapped: snapshot.Mapped, + Total: total + snapshot.Mapped, + } + + } + + return nil +} + +// UsageReduceOpts contains options to Usage.Reduce(). +type UsageReduceOpts struct { + // If Wait is true, Reduce blocks until all activity initiated by + // Usage.Reduce() has completed. + Wait bool `json:"wait"` +} + +// UsageReduceOutput contains output from Usage.Reduce(). +type UsageReduceOutput struct{} + +// Reduce requests that the sentry attempt to reduce its memory usage. +func (u *Usage) Reduce(opts *UsageReduceOpts, out *UsageReduceOutput) error { + mf := u.Kernel.MemoryFile() + mf.StartEvictions() + if opts.Wait { + mf.WaitForEvictions() + } + return nil +} + +// MemoryUsageRecord contains the mapping and platform memory file. +type MemoryUsageRecord struct { + mmap uintptr + stats *usage.RTMemoryStats + mf os.File +} + +// NewMemoryUsageRecord creates a new MemoryUsageRecord from usageFile and +// platformFile. +func NewMemoryUsageRecord(usageFile, platformFile os.File) (*MemoryUsageRecord, error) { + mmap, _, e := unix.RawSyscall6(unix.SYS_MMAP, 0, usage.RTMemoryStatsSize, unix.PROT_READ, unix.MAP_SHARED, usageFile.Fd(), 0) + if e != 0 { + return nil, fmt.Errorf("mmap returned %d, want 0", e) + } + + m := MemoryUsageRecord{ + mmap: mmap, + stats: usage.RTMemoryStatsPointer(mmap), + mf: platformFile, + } + + runtime.SetFinalizer(&m, finalizer) + return &m, nil +} + +func finalizer(m *MemoryUsageRecord) { + unix.RawSyscall(unix.SYS_MUNMAP, m.mmap, usage.RTMemoryStatsSize, 0) +} + +// Fetch fetches the usage info from a MemoryUsageRecord. +func (m *MemoryUsageRecord) Fetch() (mapped, unknown, total uint64, err error) { + var stat unix.Stat_t + if err := unix.Fstat(int(m.mf.Fd()), &stat); err != nil { + return 0, 0, 0, err + } + fmem := uint64(stat.Blocks) * 512 + return m.stats.RTMapped, fmem, m.stats.RTMapped + fmem, nil +} diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD index 4c8604d58..66b9ed523 100644 --- a/pkg/sentry/devices/memdev/BUILD +++ b/pkg/sentry/devices/memdev/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/rand", "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", @@ -23,7 +24,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index fece3e762..fc702c9f6 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -16,8 +16,8 @@ package memdev import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -66,12 +66,12 @@ func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.Rea // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSPC + return 0, linuxerr.ENOSPC } // Write implements vfs.FileDescriptionImpl.Write. func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSPC + return 0, linuxerr.ENOSPC } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/devices/quotedev/BUILD b/pkg/sentry/devices/quotedev/BUILD deleted file mode 100644 index d09214e3e..000000000 --- a/pkg/sentry/devices/quotedev/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -licenses(["notice"]) - -go_library( - name = "quotedev", - srcs = ["quotedev.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/sentry/fsimpl/devtmpfs", - "//pkg/sentry/vfs", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/devices/quotedev/quotedev.go b/pkg/sentry/devices/quotedev/quotedev.go deleted file mode 100644 index 6114cb724..000000000 --- a/pkg/sentry/devices/quotedev/quotedev.go +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2021 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package quotedev implements a vfs.Device for /dev/gvisor_quote. -package quotedev - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -const ( - quoteDevMinor = 0 -) - -// quoteDevice implements vfs.Device for /dev/gvisor_quote -// -// +stateify savable -type quoteDevice struct{} - -// Open implements vfs.Device.Open. -// TODO(b/157161182): Add support for attestation ioctls. -func (quoteDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.EIO -} - -// Register registers all devices implemented by this package in vfsObj. -func Register(vfsObj *vfs.VirtualFilesystem) error { - return vfsObj.RegisterDevice(vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, quoteDevice{}, &vfs.RegisterDeviceOptions{ - GroupName: "gvisor_quote", - }) -} - -// CreateDevtmpfsFiles creates device special files in dev representing all -// devices implemented by this package. -func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { - return dev.CreateDeviceFile(ctx, "gvisor_quote", vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, 0666 /* mode */) -} diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD index b4b6ca38a..ab4cd0b33 100644 --- a/pkg/sentry/devices/ttydev/BUILD +++ b/pkg/sentry/devices/ttydev/BUILD @@ -9,8 +9,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go index a287c65ca..29b79b5d6 100644 --- a/pkg/sentry/devices/ttydev/ttydev.go +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -18,9 +18,9 @@ package ttydev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -36,7 +36,7 @@ type ttyDevice struct{} // Open implements vfs.Device.Open. func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.EIO + return nil, linuxerr.EIO } // Register registers all devices implemented by this package in vfsObj. diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 58fe1e77c..4e573d249 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -68,7 +68,6 @@ go_library( "//pkg/sentry/usage", "//pkg/state", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index a8591052c..e48bd4dba 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -195,7 +194,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) if err != nil { log.Warningf("copy up failed to get lower attributes: %v", err) - return syserror.EIO + return linuxerr.EIO } var childUpperInode *Inode @@ -211,7 +210,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { childFile, err := parentUpper.Create(ctx, root, next.name, FileFlags{Read: true, Write: true}, attrs.Perms) if err != nil { log.Warningf("copy up failed to create file: %v", err) - return syserror.EIO + return linuxerr.EIO } defer childFile.DecRef(ctx) childUpperInode = childFile.Dirent.Inode @@ -219,13 +218,13 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { case Directory: if err := parentUpper.CreateDirectory(ctx, root, next.name, attrs.Perms); err != nil { log.Warningf("copy up failed to create directory: %v", err) - return syserror.EIO + return linuxerr.EIO } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { werr := fmt.Errorf("copy up failed to lookup directory: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode @@ -235,17 +234,17 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { link, err := childLower.Readlink(ctx) if err != nil { log.Warningf("copy up failed to read symlink value: %v", err) - return syserror.EIO + return linuxerr.EIO } if err := parentUpper.CreateLink(ctx, root, link, next.name); err != nil { log.Warningf("copy up failed to create symlink: %v", err) - return syserror.EIO + return linuxerr.EIO } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { werr := fmt.Errorf("copy up failed to lookup symlink: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode @@ -259,14 +258,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { werr := fmt.Errorf("copy up failed to copy up attributes: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } // Copy the entire file. if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { werr := fmt.Errorf("copy up failed to copy up contents: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } lowerMappable := next.Inode.overlay.lower.Mappable() @@ -274,7 +273,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { if lowerMappable != nil && upperMappable == nil { werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence") cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } // Propagate memory mappings to the upper Inode. diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index e28a8961b..7baf26b24 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/socket/netstack", - "//pkg/syserror", "//pkg/tcpip/link/tun", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index deb9c6ad8..6f0c1fc68 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -17,9 +17,9 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -77,5 +77,5 @@ var _ fs.FileOperations = (*fullFileOperations)(nil) // Write implements FileOperations.Write. func (*fullFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.ENOSPC + return 0, linuxerr.ENOSPC } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index ad8ff227e..d300a32e0 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) type globalDirentMap struct { @@ -963,7 +962,7 @@ func (d *Dirent) isMountPointLocked() bool { func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) { // Did we race with deletion? if atomic.LoadInt32(&d.deleted) != 0 { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // Refuse to mount a symlink. @@ -998,7 +997,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { // Did we race with deletion? if atomic.LoadInt32(&d.deleted) != 0 { - return syserror.ENOENT + return linuxerr.ENOENT } // Remount our former child in its place. diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 5c889c861..9f1fe5160 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -22,7 +22,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -46,7 +45,6 @@ go_test( "//pkg/hostarch", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/syserror", "//pkg/usermem", "@com_github_google_uuid//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index f8a29816b..4370cce33 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -142,7 +141,7 @@ func (p *pipeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{secio.FullReader{p.file}}) total := int64(bufN) + n if err != nil && isBlockError(err) { - return total, syserror.ErrWouldBlock + return total, linuxerr.ErrWouldBlock } return total, err } @@ -151,13 +150,13 @@ func (p *pipeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO func (p *pipeOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { n, err := src.CopyInTo(ctx, safemem.FromIOWriter{p.file}) if err != nil && isBlockError(err) { - return n, syserror.ErrWouldBlock + return n, linuxerr.ErrWouldBlock } return n, err } // isBlockError unwraps os errors and checks if they are caused by EAGAIN or -// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +// EWOULDBLOCK. This is so they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { if linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) { return true diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index adda19168..e91e1b5cb 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -21,9 +21,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) // NonBlockingOpener is a generic host file opener used to retry opening host @@ -40,7 +40,7 @@ func Open(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (fs p := &pipeOpenState{} canceled := false for { - if file, err := p.TryOpen(ctx, opener, flags); err != syserror.ErrWouldBlock { + if file, err := p.TryOpen(ctx, opener, flags); err != linuxerr.ErrWouldBlock { return file, err } @@ -51,7 +51,7 @@ func Open(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (fs if p.hostFile != nil { p.hostFile.Close() } - return nil, syserror.ErrInterrupted + return nil, linuxerr.ErrInterrupted } cancel := ctx.SleepStart() @@ -106,13 +106,13 @@ func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, f } return newPipeOperations(ctx, opener, flags, f, nil) - // Handle opening O_WRONLY blocking: convert ENXIO to syserror.ErrWouldBlock. + // Handle opening O_WRONLY blocking: convert ENXIO to linuxerr.ErrWouldBlock. // See TryOpenWriteOnly for more details. case flags.Write: return p.TryOpenWriteOnly(ctx, opener) default: - // Handle opening O_RDONLY blocking: convert EOF from read to syserror.ErrWouldBlock. + // Handle opening O_RDONLY blocking: convert EOF from read to linuxerr.ErrWouldBlock. // See TryOpenReadOnly for more details. return p.TryOpenReadOnly(ctx, opener) } @@ -120,7 +120,7 @@ func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, f // TryOpenReadOnly tries to open a host pipe read only but only returns a fs.File when // there is a coordinating writer. Call TryOpenReadOnly repeatedly on the same pipeOpenState -// until syserror.ErrWouldBlock is no longer returned. +// until linuxerr.ErrWouldBlock is no longer returned. // // How it works: // @@ -150,7 +150,7 @@ func (p *pipeOpenState) TryOpenReadOnly(ctx context.Context, opener NonBlockingO if n == 0 { // EOF means that we're not ready yet. if rerr == nil || rerr == io.EOF { - return nil, syserror.ErrWouldBlock + return nil, linuxerr.ErrWouldBlock } // Any error that is not EWOULDBLOCK also means we're not // ready yet, and probably never will be ready. In this @@ -175,16 +175,16 @@ func (p *pipeOpenState) TryOpenReadOnly(ctx context.Context, opener NonBlockingO // TryOpenWriteOnly tries to open a host pipe write only but only returns a fs.File when // there is a coordinating reader. Call TryOpenWriteOnly repeatedly on the same pipeOpenState -// until syserror.ErrWouldBlock is no longer returned. +// until linuxerr.ErrWouldBlock is no longer returned. // // How it works: // // Opening a pipe write only will return ENXIO until readers are available. Converts the ENXIO -// to an syserror.ErrWouldBlock, to tell callers to retry. +// to an linuxerr.ErrWouldBlock, to tell callers to retry. func (*pipeOpenState) TryOpenWriteOnly(ctx context.Context, opener NonBlockingOpener) (*pipeOperations, error) { hostFile, err := opener.NonBlockingOpen(ctx, fs.PermMask{Write: true}) if unwrapError(err) == unix.ENXIO { - return nil, syserror.ErrWouldBlock + return nil, linuxerr.ErrWouldBlock } if err != nil { return nil, err diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 89d8be741..e1587288e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -30,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -146,18 +145,18 @@ func TestTryOpen(t *testing.T) { err: unix.ENOENT, }, { - desc: "Blocking Write only returns with syserror.ErrWouldBlock", + desc: "Blocking Write only returns with linuxerr.ErrWouldBlock", makePipe: true, flags: fs.FileFlags{Write: true}, expectFile: false, - err: syserror.ErrWouldBlock, + err: linuxerr.ErrWouldBlock, }, { - desc: "Blocking Read only returns with syserror.ErrWouldBlock", + desc: "Blocking Read only returns with linuxerr.ErrWouldBlock", makePipe: true, flags: fs.FileFlags{Read: true}, expectFile: false, - err: syserror.ErrWouldBlock, + err: linuxerr.ErrWouldBlock, }, } { name := pipename() @@ -316,7 +315,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { // another writer comes along. This means we can open the same pipe write only // with no problems + write to it, given that opener.Open already tried to open // the pipe RDONLY and succeeded, which we know happened if TryOpen returns - // syserror.ErrwouldBlock. + // linuxerr.ErrwouldBlock. // // This simulates the open(RDONLY) <-> open(WRONLY)+write race we care about, but // does not cause our test to be racy (which would be terrible). @@ -328,8 +327,8 @@ func TestCopiedReadAheadBuffer(t *testing.T) { pipeOps.Release(ctx) t.Fatalf("open(%s, %o) got file, want nil", name, unix.O_RDONLY) } - if err != syserror.ErrWouldBlock { - t.Fatalf("open(%s, %o) got error %v, want %v", name, unix.O_RDONLY, err, syserror.ErrWouldBlock) + if err != linuxerr.ErrWouldBlock { + t.Fatalf("open(%s, %o) got error %v, want %v", name, unix.O_RDONLY, err, linuxerr.ErrWouldBlock) } // Then open the same pipe write only and write some bytes to it. The next diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 4c8905a7e..63900e766 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -238,7 +237,7 @@ func TestPipeRequest(t *testing.T) { context: &Readv{Dst: usermem.BytesIOSequence(make([]byte, 10))}, flags: fs.FileFlags{Read: true}, keepOpenPartner: true, - err: syserror.ErrWouldBlock, + err: linuxerr.ErrWouldBlock, }, { desc: "Writev on pipe from empty buffer returns nil", @@ -410,8 +409,8 @@ func TestPipeReadsAccumulate(t *testing.T) { n, err := p.Read(ctx, file, iov, 0) total := n iov = iov.DropFirst64(n) - if err != syserror.ErrWouldBlock { - t.Fatalf("Readv got error %v, want %v", err, syserror.ErrWouldBlock) + if err != linuxerr.ErrWouldBlock { + t.Fatalf("Readv got error %v, want %v", err, linuxerr.ErrWouldBlock) } // Write a few more bytes to allow us to read more/accumulate. @@ -479,8 +478,8 @@ func TestPipeWritesAccumulate(t *testing.T) { } iov := usermem.BytesIOSequence(writeBuffer) n, err := p.Write(ctx, file, iov, 0) - if err != syserror.ErrWouldBlock { - t.Fatalf("Writev got error %v, want %v", err, syserror.ErrWouldBlock) + if err != linuxerr.ErrWouldBlock { + t.Fatalf("Writev got error %v, want %v", err, linuxerr.ErrWouldBlock) } if n != int64(pipeSize) { t.Fatalf("Writev partial write, got: %v, want %v", n, pipeSize) diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 57f904801..df04f044d 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsmetric" @@ -27,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -195,10 +195,10 @@ func (f *File) EventUnregister(e *waiter.Entry) { // offset to the value returned by f.FileOperations.Seek if the operation // is successful. // -// Returns syserror.ErrInterrupted if seeking was interrupted. +// Returns linuxerr.ErrInterrupted if seeking was interrupted. func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64, error) { if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -217,10 +217,10 @@ func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64 // Readdir unconditionally updates the access time on the File's Inode, // see fs/readdir.c:iterate_dir. // -// Returns syserror.ErrInterrupted if reading was interrupted. +// Returns linuxerr.ErrInterrupted if reading was interrupted. func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -232,13 +232,13 @@ func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { // Readv calls f.FileOperations.Read with f as the File, advancing the file // offset if f.FileOperations.Read returns bytes read > 0. // -// Returns syserror.ErrInterrupted if reading was interrupted. +// Returns linuxerr.ErrInterrupted if reading was interrupted. func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { start := fsmetric.StartReadWait() defer fsmetric.FinishReadWait(fsmetric.ReadWait, start) if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } fsmetric.Reads.Increment() @@ -260,7 +260,7 @@ func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) defer fsmetric.FinishReadWait(fsmetric.ReadWait, start) if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } fsmetric.Reads.Increment() @@ -276,10 +276,10 @@ func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) // unavoidably racy for network file systems. Writev also truncates src // to avoid overrunning the current file size limit if necessary. // -// Returns syserror.ErrInterrupted if writing was interrupted. +// Returns linuxerr.ErrInterrupted if writing was interrupted. func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error) { if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. @@ -297,7 +297,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error case ok && limit == 0: unlockAppendMu() f.mu.Unlock() - return 0, syserror.ErrExceedsFileSizeLimit + return 0, linuxerr.ErrExceedsFileSizeLimit case ok: src = src.TakeFirst64(limit) } @@ -335,7 +335,7 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 limit, ok := f.checkLimit(ctx, offset) switch { case ok && limit == 0: - return 0, syserror.ErrExceedsFileSizeLimit + return 0, linuxerr.ErrExceedsFileSizeLimit case ok: src = src.TakeFirst64(limit) } @@ -352,7 +352,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { if err != nil { // This is an odd error, we treat it as evidence that // something is terribly wrong with the filesystem. - return syserror.EIO + return linuxerr.EIO } // Update the offset. @@ -381,10 +381,10 @@ func (f *File) checkLimit(ctx context.Context, offset int64) (int64, bool) { // Fsync calls f.FileOperations.Fsync with f as the File. // -// Returns syserror.ErrInterrupted if syncing was interrupted. +// Returns linuxerr.ErrInterrupted if syncing was interrupted. func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncType) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -393,10 +393,10 @@ func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncT // Flush calls f.FileOperations.Flush with f as the File. // -// Returns syserror.ErrInterrupted if syncing was interrupted. +// Returns linuxerr.ErrInterrupted if syncing was interrupted. func (f *File) Flush(ctx context.Context) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -405,10 +405,10 @@ func (f *File) Flush(ctx context.Context) error { // ConfigureMMap calls f.FileOperations.ConfigureMMap with f as the File. // -// Returns syserror.ErrInterrupted if interrupted. +// Returns linuxerr.ErrInterrupted if interrupted. func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -417,10 +417,10 @@ func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { // UnstableAttr calls f.FileOperations.UnstableAttr with f as the File. // -// Returns syserror.ErrInterrupted if interrupted. +// Returns linuxerr.ErrInterrupted if interrupted. func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { if !f.mu.Lock(ctx) { - return UnstableAttr{}, syserror.ErrInterrupted + return UnstableAttr{}, linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -495,7 +495,7 @@ type lockedReader struct { // Read implements io.Reader.Read. func (r *lockedReader) Read(buf []byte) (int, error) { if r.Ctx.Interrupted() { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.Offset) r.Offset += n @@ -505,7 +505,7 @@ func (r *lockedReader) Read(buf []byte) (int, error) { // ReadAt implements io.Reader.ReadAt. func (r *lockedReader) ReadAt(buf []byte, offset int64) (int, error) { if r.Ctx.Interrupted() { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), offset) return int(n), err @@ -530,7 +530,7 @@ type lockedWriter struct { // Write implements io.Writer.Write. func (w *lockedWriter) Write(buf []byte) (int, error) { if w.Ctx.Interrupted() { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } n, err := w.WriteAt(buf, w.Offset) w.Offset += int64(n) @@ -549,7 +549,7 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { // contract. Enforce that here. for written < len(buf) { if w.Ctx.Interrupted() { - return written, syserror.ErrInterrupted + return written, linuxerr.ErrInterrupted } var n int64 n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written)) diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 6ec721022..ce47c3907 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -120,7 +120,7 @@ type FileOperations interface { // Files with !FileFlags.Pwrite. // // If only part of src could be written, Write must return an error - // indicating why (e.g. syserror.ErrWouldBlock). + // indicating why (e.g. linuxerr.ErrWouldBlock). // // Write does not check permissions nor flags. // diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 06c07c807..a27dd0b9a 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -16,6 +16,7 @@ package fs import ( "io" + "math" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" @@ -23,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -246,7 +246,7 @@ func (f *overlayFileOperations) onTop(ctx context.Context, file *File, fn func(* // Something very wrong; return a generic filesystem // error to avoid propagating internals. f.upperMu.Unlock() - return syserror.EIO + return linuxerr.EIO } // Save upper file. @@ -361,10 +361,13 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt return linuxerr.ENODEV } - // FIXME(jamieliu): This is a copy/paste of fsutil.GenericConfigureMMap, - // which we can't use because the overlay implementation is in package fs, - // so depending on fs/fsutil would create a circular dependency. Move - // overlay to fs/overlay. + // TODO(gvisor.dev/issue/1624): This is a copy/paste of + // fsutil.GenericConfigureMMap, which we can't use because the overlay + // implementation is in package fs, so depending on fs/fsutil would create + // a circular dependency. VFS2 overlay doesn't have this issue. + if opts.Offset+opts.Length > math.MaxInt64 { + return linuxerr.EOVERFLOW + } opts.Mappable = o opts.MappingIdentity = file file.IncRef() diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6bf2d51cb..1a59800ea 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -90,7 +90,6 @@ go_library( "//pkg/sentry/usage", "//pkg/state", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 00b3bb29b..38e3ed42d 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -16,13 +16,13 @@ package fsutil import ( "io" + "math" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -211,6 +211,9 @@ func (FileNoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) err // GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most // filesystems that support memory mapping. func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error { + if opts.Offset+opts.Length > math.MaxInt64 { + return linuxerr.EOVERFLOW + } opts.Mappable = m opts.MappingIdentity = file file.IncRef() @@ -232,12 +235,12 @@ type FileNoSplice struct{} // WriteTo implements fs.FileOperations.WriteTo. func (FileNoSplice) WriteTo(context.Context, *fs.File, io.Writer, int64, bool) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // ReadFrom implements fs.FileOperations.ReadFrom. func (FileNoSplice) ReadFrom(context.Context, *fs.File, io.Reader, int64) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // DirFileOperations implements most of fs.FileOperations for directories, @@ -255,12 +258,12 @@ type DirFileOperations struct { // Read implements fs.FileOperations.Read func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements fs.FileOperations.Write. func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // StaticDirFileOperations implements fs.FileOperations for directories with diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 23528bf25..37ddb1a3c 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -93,7 +93,8 @@ func NewHostFileMapper() *HostFileMapper { func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() - for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + chunkStart := mr.Start &^ chunkMask + for { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) if refs+pgs < refs { @@ -101,6 +102,10 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } f.refs[chunkStart] = refs + pgs + chunkStart += chunkSize + if chunkStart >= mr.End || chunkStart == 0 { + break + } } } @@ -112,7 +117,8 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() - for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + chunkStart := mr.Start &^ chunkMask + for { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) switch { @@ -128,6 +134,10 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { case refs < pgs: panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } + chunkStart += chunkSize + if chunkStart >= mr.End || chunkStart == 0 { + break + } } } @@ -161,7 +171,8 @@ func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, if write { prot |= unix.PROT_WRITE } - for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + chunkStart := fr.Start &^ chunkMask + for { m, ok := f.mappings[chunkStart] if !ok { addr, _, errno := unix.Syscall6( @@ -201,6 +212,10 @@ func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, endOff = fr.End - chunkStart } fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) + chunkStart += chunkSize + if chunkStart >= fr.End || chunkStart == 0 { + break + } } return nil } diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 7c2de04c1..06a994193 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -167,7 +166,7 @@ func (i *InodeSimpleAttributes) DropLink() { // StatFS implements fs.InodeOperations.StatFS. func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { if i.fsType == 0 { - return fs.Info{}, syserror.ENOSYS + return fs.Info{}, linuxerr.ENOSYS } return fs.Info{Type: i.fsType}, nil } @@ -294,7 +293,7 @@ type InodeNoStatFS struct{} // StatFS implements fs.InodeOperations.StatFS. func (InodeNoStatFS) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS + return fs.Info{}, linuxerr.ENOSYS } // InodeStaticFileGetter implements GetFile for a file with static contents. @@ -401,7 +400,7 @@ type InodeIsDirTruncate struct{} // Truncate implements fs.InodeOperations.Truncate. func (InodeIsDirTruncate) Truncate(context.Context, *fs.Inode, int64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // InodeNoopTruncate implements fs.InodeOperations.Truncate as a noop. @@ -425,7 +424,7 @@ type InodeNotOpenable struct{} // GetFile implements fs.InodeOperations.GetFile. func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { - return nil, syserror.EIO + return nil, linuxerr.EIO } // InodeNotVirtual can be used by Inodes that are not virtual. @@ -529,5 +528,5 @@ type InodeIsDirAllocate struct{} // Allocate implements fs.InodeOperations.Allocate. func (InodeIsDirAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { - return syserror.EISDIR + return linuxerr.EISDIR } diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index c08301d19..ee2f287d9 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -26,6 +26,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors", "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/hostarch", @@ -48,7 +49,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/unet", "//pkg/usermem", "//pkg/waiter", @@ -63,10 +63,10 @@ go_test( library = ":gofer", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/p9", "//pkg/p9/p9test", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 73d80d9b5..62a517cd7 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -226,7 +226,7 @@ func (f *fileOperations) maybeSync(ctx context.Context, file *fs.File, offset, n func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } var ( @@ -294,7 +294,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. f.incrementReadCounters(start) - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 546ee7d04..4924debeb 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -19,8 +19,8 @@ import ( "testing" "time" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/p9/p9test" "gvisor.dev/gvisor/pkg/sentry/contexttest" @@ -97,7 +97,7 @@ func TestLookup(t *testing.T) { }, { name: "mock Walk fails (function fails)", - want: unix.ENOENT, + want: linuxerr.ENOENT, }, } @@ -123,7 +123,7 @@ func TestLookup(t *testing.T) { var newInodeOperations fs.InodeOperations if dirent != nil { if dirent.IsNegative() { - err = unix.ENOENT + err = linuxerr.ENOENT } else { newInodeOperations = dirent.Inode.InodeOperations } @@ -131,9 +131,11 @@ func TestLookup(t *testing.T) { // Check return values. if err != test.want { + t.Logf("err: %v %T", err, err) t.Errorf("Lookup got err %v, want %v", err, test.want) } if err == nil && newInodeOperations == nil { + t.Logf("err: %v %T", err, err) t.Errorf("Lookup got non-nil err and non-nil node, wanted at least one non-nil") } }) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 9ff64a8b6..c3856094f 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + gErr "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" @@ -32,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // inodeOperations implements fs.InodeOperations. @@ -719,12 +719,12 @@ func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) er } func init() { - syserror.AddErrorUnwrapper(func(err error) (unix.Errno, bool) { + linuxerr.AddErrorUnwrapper(func(err error) (*gErr.Error, bool) { if _, ok := err.(p9.ErrSocket); ok { // Treat as an I/O error. - return unix.EIO, true + return linuxerr.EIO, true } - return 0, false + return nil, false }) } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 88d83060c..2f8769f1e 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's @@ -60,7 +59,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string if cp.cacheNegativeDirents() { return fs.NewNegativeDirent(name), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } i.readdirMu.Unlock() } @@ -74,7 +73,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // is created over it. return fs.NewNegativeDirent(name), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return nil, err } @@ -169,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string hostFile.Close() } unopened.close(ctx) - return nil, syserror.EIO + return nil, linuxerr.EIO } qid := qids[0] diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 24fc6305c..921612e9c 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -52,7 +52,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", "//pkg/usermem", diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 77c08a7ce..1d0d95634 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -201,7 +200,7 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I writer := fd.NewReadWriter(f.iops.fileState.FD()) n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer}) if isBlockError(err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } return n, err } @@ -232,7 +231,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO if n != 0 { err = nil } else { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } } return n, err diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 5f6af2067..92d58e3e9 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -220,7 +219,7 @@ func (i *inodeOperations) Release(context.Context) { // Lookup implements fs.InodeOperations.Lookup. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // Create implements fs.InodeOperations.Create. @@ -400,7 +399,7 @@ func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error // StatFS implements fs.InodeOperations.StatFS. func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS + return fs.Info{}, linuxerr.ENOSYS } // AddLink implements fs.InodeOperations.AddLink. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 6f38b25c3..4e561c5ed 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -327,7 +326,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // If the signal is SIGTTIN, then we are attempting to read // from the TTY. Don't send the signal and return EIO. if sig == linux.SIGTTIN { - return syserror.EIO + return linuxerr.EIO } // Otherwise, we are writing or changing terminal state. This is allowed. @@ -336,7 +335,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // If the process group is an orphan, return EIO. if pg.IsOrphan() { - return syserror.EIO + return linuxerr.EIO } // Otherwise, send the signal to the process group and return ERESTARTSYS. @@ -349,7 +348,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } // LINT.ThenChange(../../fsimpl/host/tty.go) diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index e7db79189..f2a33cc14 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -96,7 +96,7 @@ type dirInfo struct { // LINT.IfChange // isBlockError unwraps os errors and checks if they are caused by EAGAIN or -// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +// EWOULDBLOCK. This is so they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { if linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) { return true diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index ec204e5cf..2c6b9e9db 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Inode is a file system object that can be simultaneously referenced by different @@ -357,7 +356,7 @@ func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error // Truncate calls i.InodeOperations.Truncate with i as the Inode. func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { if IsDir(i.StableAttr) { - return syserror.EISDIR + return linuxerr.EISDIR } if i.overlay != nil { diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 98e9fb2b1..0f8022906 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -66,7 +66,7 @@ type InodeOperations interface { // // * A nil Dirent and a non-nil error. If the reason that Lookup failed // was because the name does not exist under Inode, then must return - // syserror.ENOENT. + // linuxerr.ENOENT. // // * If name does not exist under dir and the file system wishes this // fact to be cached, a non-nil Dirent containing a nil Inode and a diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index c47b9ce58..21ad7fa69 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) func overlayHasWhiteout(ctx context.Context, parent *Inode, name string) bool { @@ -103,7 +102,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Upper fs is not OK with a negative Dirent // being cached in the Dirent tree, so don't // return one. - return nil, false, syserror.ENOENT + return nil, false, linuxerr.ENOENT } entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { @@ -165,7 +164,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if negativeUpperChild { return NewNegativeDirent(name), false, nil } - return nil, false, syserror.ENOENT + return nil, false, linuxerr.ENOENT } // Did we find a lower Inode? Remember this because we may decide we don't diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index ee28b0f99..51cd6cd37 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -141,7 +140,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i if i.events.Empty() { // Nothing to read yet, tell caller to block. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } var writeLen int64 @@ -179,7 +178,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i // WriteTo implements FileOperations.WriteTo. func (*Inotify) WriteTo(context.Context, *File, io.Writer, int64, bool) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Fsync implements FileOperations.Fsync. @@ -189,7 +188,7 @@ func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { // ReadFrom implements FileOperations.ReadFrom. func (*Inotify) ReadFrom(context.Context, *File, io.Reader, int64) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Flush implements FileOperations.Flush. diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index e6d74b949..bc75ae505 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -50,7 +50,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/usermem", diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 379429ab2..75dc5d204 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -107,7 +107,7 @@ func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen return 0, linuxerr.EINVAL } - m, err := getTaskMM(f.t) + m, err := getTaskMMIncRef(f.t) if err != nil { return 0, err } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index e90da225a..e68bb46c0 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -20,12 +20,12 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -37,7 +37,7 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF n, err := strconv.ParseUint(p, 10, 64) if err != nil { // Not found. - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } var file *fs.File @@ -48,7 +48,7 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF } }) if file == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return toInode(file, fdFlags), nil } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 546b57287..b9629c598 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package proc implements a partial in-memory file system for profs. +// Package proc implements a partial in-memory file system for procfs. package proc import ( @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -125,7 +124,7 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if t := kernel.TaskFromContext(ctx); t != nil { tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } @@ -149,7 +148,7 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 085aa6d61..443b9a94c 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -109,6 +109,9 @@ func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode "shmall": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10))), "shmmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10))), "shmmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10))), + "msgmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.MSGMNI, 10))), + "msgmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.MSGMAX, 10))), + "msgmnb": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.MSGMNB, 10))), } d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index edd62b857..03f2a882d 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -35,17 +35,30 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // LINT.IfChange -// getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's -// users count is incremented, and must be decremented by the caller when it is -// no longer in use. -func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { +// getTaskMM gets the kernel task's MemoryManager. No additional reference is +// taken on mm here. This is safe because MemoryManager.destroy is required to +// leave the MemoryManager in a state where it's still usable as a +// DynamicBytesSource. +func getTaskMM(t *kernel.Task) *mm.MemoryManager { + var tmm *mm.MemoryManager + t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// getTaskMMIncRef returns t's MemoryManager. If getTaskMMIncRef succeeds, the +// MemoryManager's users count is incremented, and must be decremented by the +// caller when it is no longer in use. +func getTaskMMIncRef(t *kernel.Task) (*mm.MemoryManager, error) { if t.ExitState() == kernel.TaskExitDead { return nil, linuxerr.ESRCH } @@ -182,7 +195,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry tasks := f.t.ThreadGroup().MemberIDs(f.pidns) if len(tasks) == 0 { - return offset, syserror.ENOENT + return offset, linuxerr.ENOENT } if offset == 0 { @@ -234,15 +247,15 @@ var _ fs.FileOperations = (*subtasksFile)(nil) func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { tid, err := strconv.ParseUint(p, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } task := s.p.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if task.ThreadGroup() != s.t.ThreadGroup() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } td := s.p.newTaskDir(ctx, task, dir.MountSource, false) @@ -270,21 +283,18 @@ func (e *exe) executable() (file fsbridge.File, err error) { if err := checkTaskState(e.t); err != nil { return nil, err } - e.t.WithMuLocked(func(t *kernel.Task) { - mm := t.MemoryManager() - if mm == nil { - err = linuxerr.EACCES - return - } + mm := getTaskMM(e.t) + if mm == nil { + return nil, linuxerr.EACCES + } - // The MemoryManager may be destroyed, in which case - // MemoryManager.destroy will simply set the executable to nil - // (with locks held). - file = mm.Executable() - if file == nil { - err = linuxerr.ESRCH - } - }) + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + file = mm.Executable() + if file == nil { + err = linuxerr.ESRCH + } return } @@ -464,7 +474,7 @@ func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen if dst.NumBytes() == 0 { return 0, nil } - mm, err := getTaskMM(m.t) + mm, err := getTaskMMIncRef(m.t) if err != nil { return 0, nil } @@ -479,7 +489,7 @@ func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen return int64(n), nil } if readErr != nil { - return 0, syserror.EIO + return 0, linuxerr.EIO } return 0, nil } @@ -495,22 +505,9 @@ func newMaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inod return newProcInode(ctx, seqfile.NewSeqFile(ctx, &mapsData{t}), msrc, fs.SpecialFile, t) } -func (md *mapsData) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - md.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - // No additional reference is taken on mm here. This is safe - // because MemoryManager.destroy is required to leave the - // MemoryManager in a state where it's still usable as a SeqSource. - tmm = mm - } - }) - return tmm -} - // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. func (md *mapsData) NeedsUpdate(generation int64) bool { - if mm := md.mm(); mm != nil { + if mm := getTaskMM(md.t); mm != nil { return mm.NeedsUpdate(generation) } return true @@ -518,7 +515,7 @@ func (md *mapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - if mm := md.mm(); mm != nil { + if mm := getTaskMM(md.t); mm != nil { return mm.ReadMapsSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 @@ -535,22 +532,9 @@ func newSmaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Ino return newProcInode(ctx, seqfile.NewSeqFile(ctx, &smapsData{t}), msrc, fs.SpecialFile, t) } -func (sd *smapsData) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - sd.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - // No additional reference is taken on mm here. This is safe - // because MemoryManager.destroy is required to leave the - // MemoryManager in a state where it's still usable as a SeqSource. - tmm = mm - } - }) - return tmm -} - // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. func (sd *smapsData) NeedsUpdate(generation int64) bool { - if mm := sd.mm(); mm != nil { + if mm := getTaskMM(sd.t); mm != nil { return mm.NeedsUpdate(generation) } return true @@ -558,7 +542,7 @@ func (sd *smapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (sd *smapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - if mm := sd.mm(); mm != nil { + if mm := getTaskMM(sd.t); mm != nil { return mm.ReadSmapsSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 @@ -628,12 +612,10 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) + if mm := getTaskMM(s.t); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } fmt.Fprintf(&buf, "%d %d ", vss, rss/hostarch.PageSize) // rsslim. @@ -678,12 +660,10 @@ func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ } var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) + if mm := getTaskMM(s.t); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } var buf bytes.Buffer fmt.Fprintf(&buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) @@ -735,12 +715,13 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.CurrentMaxFDs() } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() - } }) + + if mm := getTaskMM(s.t); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } fmt.Fprintf(&buf, "FDSize:\t%d\n", fds) fmt.Fprintf(&buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(&buf, "VmRSS:\t%d kB\n", rss>>10) @@ -926,7 +907,7 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc return 0, linuxerr.EINVAL } - m, err := getTaskMM(f.t) + m, err := getTaskMMIncRef(f.t) if err != nil { return 0, err } diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index b46567cf8..bfff010c5 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -21,7 +21,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/socket/unix/transport", "//pkg/sync", - "//pkg/syserror", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 33023af77..b1fadee7a 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // CreateOps represents operations to create different file types. @@ -284,9 +283,9 @@ func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { return inode, nil } - // fs.InodeOperations.Lookup returns syserror.ENOENT if p + // fs.InodeOperations.Lookup returns linuxerr.ENOENT if p // does not exist. - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // createInodeOperationsCommon creates a new child node at this dir by calling diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index fff4befb2..266140f6f 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/syserror" ) // Splice moves data to this file, directly from another. @@ -55,26 +54,26 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, case dst.UniqueID < src.UniqueID: // Acquire dst first. if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } if !src.mu.Lock(ctx) { dst.mu.Unlock() - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } case dst.UniqueID > src.UniqueID: // Acquire src first. if !src.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } if !dst.mu.Lock(ctx) { src.mu.Unlock() - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } case dst.UniqueID == src.UniqueID: // Acquire only one lock; it's the same file. This is a // bit of a edge case, but presumably it's possible. if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } srcLock = false // Only need one unlock. } @@ -84,13 +83,13 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, case dstLock: // Acquire only dst. if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } opts.DstStart = dst.offset // Safe: locked. case srcLock: // Acquire only src. if !src.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } opts.SrcStart = src.offset // Safe: locked. } @@ -108,7 +107,7 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, limit, ok := dst.checkLimit(ctx, opts.DstStart) switch { case ok && limit == 0: - err = syserror.ErrExceedsFileSizeLimit + err = linuxerr.ErrExceedsFileSizeLimit case ok && limit < opts.Length: opts.Length = limit // Cap the write. } diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 0148b33cf..e61115932 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -14,7 +14,6 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 093a14c1f..1c8518d71 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -134,7 +133,7 @@ func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.I } return sizeofUint64, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Write implements fs.FileOperations.Write. diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 5933cb67b..9e9dc06f3 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -31,7 +31,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 3242dcb6a..5716e2ee9 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -155,12 +154,12 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str n, err := strconv.ParseUint(name, 10, 32) if err != nil { // Not found. - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s, ok := d.replicas[uint32(n)] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s.IncRef() @@ -235,7 +234,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e n := d.next if n == math.MaxUint32 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } if _, ok := d.replicas[n]; ok { @@ -335,10 +334,10 @@ func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, seriali // Read implements FileOperations.Read func (df *dirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements FileOperations.Write. func (df *dirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 3ba02c218..f9fca6d8e 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -193,7 +193,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -207,7 +207,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error { @@ -228,7 +228,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -242,7 +242,7 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 11d6c15d0..25d3c887e 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -17,12 +17,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -110,7 +110,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl defer q.mu.Unlock() if !q.readable { - return 0, false, syserror.ErrWouldBlock + return 0, false, linuxerr.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { @@ -155,7 +155,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD index 4acc73ee0..23b5508fd 100644 --- a/pkg/sentry/fs/user/BUILD +++ b/pkg/sentry/fs/user/BUILD @@ -19,7 +19,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index f6eaab2bd..67a9adfd7 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // ResolveExecutablePath resolves the given executable name given the working @@ -81,7 +80,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s root := fs.RootFromContext(ctx) if root == nil { // Caller has no root. Don't bother traversing anything. - return "", syserror.ENOENT + return "", linuxerr.ENOENT } defer root.DecRef(ctx) for _, p := range paths { @@ -117,7 +116,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s } // Couldn't find it. - return "", syserror.ENOENT + return "", linuxerr.ENOENT } func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { @@ -156,7 +155,7 @@ func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNam } // Couldn't find it. - return "", syserror.ENOENT + return "", linuxerr.ENOENT } // getPath returns the PATH as a slice of strings given the environment diff --git a/pkg/sentry/fsimpl/cgroupfs/BUILD b/pkg/sentry/fsimpl/cgroupfs/BUILD index 4c9c5b344..e5fdcc776 100644 --- a/pkg/sentry/fsimpl/cgroupfs/BUILD +++ b/pkg/sentry/fsimpl/cgroupfs/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/context", "//pkg/coverage", "//pkg/errors/linuxerr", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -43,7 +44,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go index 4290ffe0d..71bb0a9c8 100644 --- a/pkg/sentry/fsimpl/cgroupfs/base.go +++ b/pkg/sentry/fsimpl/cgroupfs/base.go @@ -88,7 +88,6 @@ type controller interface { // +stateify savable type cgroupInode struct { dir - fs *filesystem // ts is the list of tasks in this cgroup. The kernel is responsible for // removing tasks from this list before they're destroyed, so any tasks on @@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil) func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { c := &cgroupInode{ - fs: fs, - ts: make(map[*kernel.Task]struct{}), + dir: dir{fs: fs}, + ts: make(map[*kernel.Task]struct{}), } + c.dir.cgi = c contents := make(map[string]kernfs.Inode) contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c}) @@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential } c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) - c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - c.dir.InitRefs() + c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents)) atomic.AddUint64(&fs.numCgroups, 1) diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index 22c8b7fda..edc3b50b9 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -32,7 +32,8 @@ // controllers associated with them. // // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between -// cgroupfs dentries and inodes. +// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref +// counted and exist until they're unlinked once or the FS is destroyed. // // # Synchronization // @@ -48,10 +49,11 @@ // Lock order: // // kernel.CgroupRegistry.mu -// cgroupfs.filesystem.mu -// kernel.TaskSet.mu -// kernel.Task.mu -// cgroupfs.filesystem.tasksMu. +// kernfs.filesystem.mu +// kernel.TaskSet.mu +// kernel.Task.mu +// cgroupfs.filesystem.tasksMu. +// cgroupfs.dir.OrderedChildren.mu package cgroupfs import ( @@ -63,6 +65,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -108,6 +111,7 @@ type FilesystemType struct{} // +stateify savable type InternalData struct { DefaultControlValues map[string]int64 + InitialCgroupPath string } // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. @@ -134,6 +138,11 @@ type filesystem struct { numCgroups uint64 // Protected by atomic ops. root *kernfs.Dentry + // effectiveRoot is the initial cgroup new tasks are created in. Unless + // overwritten by internal mount options, root == effectiveRoot. If + // effectiveRoot != root, an extra reference is held on effectiveRoot for + // the lifetime of the filesystem. + effectiveRoot *kernfs.Dentry // tasksMu serializes task membership changes across all cgroups within a // filesystem. @@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs := vfsfs.Impl().(*filesystem) ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) fs.root.IncRef() + if fs.effectiveRoot != fs.root { + fs.effectiveRoot.IncRef() + } return vfsfs, fs.root.VFSDentry(), nil } @@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var defaults map[string]int64 if opts.InternalData != nil { - ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) defaults = opts.InternalData.(*InternalData).DefaultControlValues + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) } for _, ty := range wantControllers { @@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) fs.root = &rootD + fs.effectiveRoot = fs.root + + if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) + rootD.DecRef(ctx) + fs.VFSFilesystem().DecRef(ctx) + return nil, nil, err + } // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same @@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), rootD.VFSDentry(), nil } +// prepareInitialCgroup creates the initial cgroup according to opts. An initial +// cgroup is optional, and if not specified, this function is a no-op. +func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { + if opts.InternalData == nil { + return nil + } + initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath + if initPathStr == "" { + return nil + } + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) + initPath := fspath.Parse(initPathStr) + if !initPath.Absolute || !initPath.HasComponents() { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) + return linuxerr.EINVAL + } + + // Have initial cgroup target, create the tree. + cgDir := fs.root.Inode().(*cgroupInode) + for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { + cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{}) + if err != nil { + return err + } + cgDir = cgDirI.(*cgroupInode) + } + + // Walk to target dentry. + initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) + if err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) + return linuxerr.ENOENT + } + fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. + return nil +} + func (fs *filesystem) rootCgroup() kernel.Cgroup { return kernel.Cgroup{ - Dentry: fs.root, - CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), + Dentry: fs.effectiveRoot, + CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), } } @@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) { r.Unregister(fs.hierarchyID) } + if fs.root != fs.effectiveRoot { + fs.effectiveRoot.DecRef(ctx) + } + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } @@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error // // +stateify savable type dir struct { - dirRefs + kernfs.InodeNoopRefCount kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir. + kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren implStatFS locks vfs.FileLocks + + fs *filesystem // Immutable. + cgi *cgroupInode // Immutable. } // Keep implements kernfs.Inode.Keep. @@ -378,9 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry return fd.VFSFileDescription(), nil } -// DecRef implements kernfs.Inode.DecRef. -func (d *dir) DecRef(ctx context.Context) { - d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +// NewDir implements kernfs.Inode.NewDir. +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { + // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable." + // -- Linux, kernel/cgroup.c:cgroup_mkdir(). + if strings.Contains(name, "\n") { + return nil, linuxerr.EINVAL + } + return d.OrderedChildren.Inserter(name, func() kernfs.Inode { + d.IncLinks(1) + return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx)) + }) +} + +// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of +// cgroup directories, and the rename may only change the name within the same +// parent. See linux, kernel/cgroup.c:cgroup_rename(). +func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { + if _, ok := child.(*cgroupInode); !ok { + // Not a cgroup directory. Control files are backed by different types. + return linuxerr.ENOTDIR + } + + dstCGInode, ok := dst.(*cgroupInode) + if !ok { + // Not a cgroup inode, so definitely can't be *this* inode. + return linuxerr.EIO + } + // Note: We're intentionally comparing addresses, since two different dirs + // could plausibly be identical in memory, but would occupy different + // locations in memory. + if d != &dstCGInode.dir { + // Destination dir is a different cgroup inode. Cross directory renames + // aren't allowed. + return linuxerr.EIO + } + + // Rename moves oldname to newname within d. Proceed. + return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) +} + +// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only +// files in the filesystem are control files, which can't be deleted. +func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { + return linuxerr.EPERM +} + +// hasChildrenLocked returns whether the cgroup dir contains any objects that +// prevent it from being deleted. +func (d *dir) hasChildrenLocked() bool { + // Subdirs take a link on the parent, so checks if there are any direct + // children cgroups. Exclude the dir's self link and the link from ".". + if d.InodeAttrs.Links()-2 > 0 { + return true + } + return len(d.cgi.ts) > 0 +} + +// HasChildren implements kernfs.Inode.HasChildren. +// +// The empty check for a cgroupfs directory is unlike a regular directory since +// a cgroupfs directory will always have control files. A cgroupfs directory can +// be deleted if cgroup contains no tasks and has no sub-cgroups. +func (d *dir) HasChildren() bool { + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + return d.hasChildrenLocked() +} + +// RmDir implements kernfs.Inode.RmDir. +func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { + // Unlike a normal directory, we need to recheck if d is empty again, since + // vfs/kernfs can't stop tasks from entering or leaving the cgroup. + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + + cgi, ok := child.(*cgroupInode) + if !ok { + return linuxerr.ENOTDIR + } + if cgi.dir.hasChildrenLocked() { + return linuxerr.ENOTEMPTY + } + + // Disallow deletion of the effective root cgroup. + if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { + ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) + return linuxerr.EBUSY + } + + err := d.OrderedChildren.RmDir(ctx, name, child) + if err == nil { + d.InodeAttrs.DecLinks() + } + return err } // controllerFile represents a generic control file that appears within a cgroup diff --git a/pkg/sentry/fsimpl/cgroupfs/memory.go b/pkg/sentry/fsimpl/cgroupfs/memory.go index 485c98376..d880c9bc4 100644 --- a/pkg/sentry/fsimpl/cgroupfs/memory.go +++ b/pkg/sentry/fsimpl/cgroupfs/memory.go @@ -31,22 +31,34 @@ import ( type memoryController struct { controllerCommon - limitBytes int64 + limitBytes int64 + softLimitBytes int64 + moveChargeAtImmigrate int64 } var _ controller = (*memoryController)(nil) func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryController { c := &memoryController{ - // Linux sets this to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, which - // is ~ 2**63 on a 64-bit system. So essentially, inifinity. The exact - // value isn't very important. - limitBytes: math.MaxInt64, + // Linux sets these limits to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, + // which is ~ 2**63 on a 64-bit system. So essentially, inifinity. The + // exact value isn't very important. + + limitBytes: math.MaxInt64, + softLimitBytes: math.MaxInt64, } - if val, ok := defaults["memory.limit_in_bytes"]; ok { - c.limitBytes = val - delete(defaults, "memory.limit_in_bytes") + + consumeDefault := func(name string, valPtr *int64) { + if val, ok := defaults[name]; ok { + *valPtr = val + delete(defaults, name) + } } + + consumeDefault("memory.limit_in_bytes", &c.limitBytes) + consumeDefault("memory.soft_limit_in_bytes", &c.softLimitBytes) + consumeDefault("memory.move_charge_at_immigrate", &c.moveChargeAtImmigrate) + c.controllerCommon.init(controllerMemory, fs) return c } @@ -55,6 +67,8 @@ func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryContr func (c *memoryController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["memory.usage_in_bytes"] = c.fs.newControllerFile(ctx, creds, &memoryUsageInBytesData{}) contents["memory.limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.limitBytes)) + contents["memory.soft_limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.softLimitBytes)) + contents["memory.move_charge_at_immigrate"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.moveChargeAtImmigrate)) } // +stateify savable diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index f981ff296..e0b879339 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -45,7 +45,6 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 7a488e9fd..e711debcb 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the filesystem name. @@ -180,7 +179,7 @@ func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credential i.mu.Lock() defer i.mu.Unlock() if i.nextIdx == math.MaxUint32 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } idx := i.nextIdx i.nextIdx++ @@ -241,7 +240,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } i.mu.Lock() defer i.mu.Unlock() @@ -250,7 +249,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro return ri, nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // IterDirents implements kernfs.Inode.IterDirents. diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index 9cb21e83b..609623f9f 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -203,7 +203,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque } else if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents) } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -220,7 +220,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { @@ -242,7 +242,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -257,7 +257,7 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index ff1d89955..85aeefa43 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -17,12 +17,12 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -110,7 +110,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl defer q.mu.Unlock() if !q.readable { - return 0, false, false, syserror.ErrWouldBlock + return 0, false, false, linuxerr.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { @@ -156,7 +156,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD index c09fdc7f9..1cb049a29 100644 --- a/pkg/sentry/fsimpl/eventfd/BUILD +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -9,11 +9,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/log", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index 4f79cfcb7..af5ba5131 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -22,11 +22,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -149,7 +149,7 @@ func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem var buf [8]byte if _, err := unix.Read(efd.hostfd, buf[:]); err != nil { if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -167,7 +167,7 @@ func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequenc // We can't complete the read if the value is currently zero. if efd.val == 0 { efd.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } // Update the value based on the mode the event is operating in. @@ -200,7 +200,7 @@ func (efd *EventFileDescription) hostWriteLocked(val uint64) error { hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(efd.hostfd, buf[:]) if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -232,7 +232,7 @@ func (efd *EventFileDescription) Signal(val uint64) error { // uint64 minus 1. if val > math.MaxUint64-1-efd.val { efd.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } efd.val += val diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/pkg/sentry/fsimpl/ext/BUILD diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 871df5984..05c4fbeb2 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -59,7 +59,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -84,7 +83,6 @@ go_test( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index dab1e779d..0f855ac59 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -38,7 +37,7 @@ type fuseDevice struct{} // Open implements vfs.Device.Open. func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if !kernel.FUSEEnabled { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } var fd DeviceFD @@ -126,7 +125,7 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. @@ -192,7 +191,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts } if req == nil { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // We already checked the size: dst must be able to fit the whole request. @@ -205,7 +204,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts return 0, err } if n != len(req.data) { - return 0, syserror.EIO + return 0, linuxerr.EIO } if req.hdr.Opcode == linux.FUSE_WRITE { @@ -214,7 +213,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts return 0, err } if written != len(req.payload) { - return 0, syserror.EIO + return 0, linuxerr.EIO } n += int(written) } @@ -238,7 +237,7 @@ func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset i return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. @@ -395,7 +394,7 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // sendResponse sends a response to the waiting task (if any). diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 04250d796..8951b5ba8 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -20,11 +20,11 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -186,7 +186,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem. // "would block". n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{}) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go index fcc5d9a2a..9611edd5a 100644 --- a/pkg/sentry/fsimpl/fuse/directory.go +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -32,27 +32,27 @@ type directoryFD struct { // Allocate implements directoryFD.Allocate. func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // PRead implements vfs.FileDescriptionImpl.PRead. func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Read implements vfs.FileDescriptionImpl.Read. func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements vfs.FileDescriptionImpl.Write. func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 172cbd88f..af16098d2 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -30,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -612,7 +611,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo return nil, err } if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { - return nil, syserror.EIO + return nil, linuxerr.EIO } child := i.fs.newInode(ctx, out.NodeID, out.Attr) return child, nil diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go index 35d0ab6f4..fe119aa43 100644 --- a/pkg/sentry/fsimpl/fuse/read_write.go +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) // ReadInPages sends FUSE_READ requests for the size after round it up to @@ -221,7 +220,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, // Write more than requested? EIO. if out.Size > toWrite { - return 0, syserror.EIO + return 0, linuxerr.EIO } written += out.Size diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go index 6c4de3507..38cde8208 100644 --- a/pkg/sentry/fsimpl/fuse/regular_file.go +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -108,7 +107,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, err } if int64(cp) != toCopy { - return 0, syserror.EIO + return 0, linuxerr.EIO } copied += toCopy } @@ -205,7 +204,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, err } if int64(cp) != srclen { - return 0, offset, syserror.EIO + return 0, offset, linuxerr.EIO } n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) @@ -216,7 +215,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off if n == 0 { // We have checked srclen != 0 previously. // If err == nil, then it's a short write and we return EIO. - return 0, offset, syserror.EIO + return 0, offset, linuxerr.EIO } written = int64(n) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 752060044..509dd0e1a 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -54,7 +54,10 @@ go_library( "//pkg/fdnotifier", "//pkg/fspath", "//pkg/hostarch", + "//pkg/lisafs", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/p9", "//pkg/refs", @@ -79,7 +82,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/unet", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 5c48a9fee..d99a6112c 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -222,47 +222,88 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() - if d.readFile.isNil() { + if !d.isReadFileOk() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. d.handleMu.RUnlock() panic("gofer.dentry.getDirents called without a readable handle") } + // shouldSeek0 indicates whether the server should SEEK to 0 before reading + // directory entries. + shouldSeek0 := true for { - p9ds, err := d.readFile.readdir(ctx, off, count) - if err != nil { - d.handleMu.RUnlock() - return nil, err - } - if len(p9ds) == 0 { - d.handleMu.RUnlock() - break - } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { - continue + if d.fs.opts.lisaEnabled { + countLisa := int32(count) + if shouldSeek0 { + // See lisafs.Getdents64Req.Count. + countLisa = -countLisa + shouldSeek0 = false + } + lisafsDs, err := d.readFDLisa.Getdents64(ctx, countLisa) + if err != nil { + d.handleMu.RUnlock() + return nil, err + } + if len(lisafsDs) == 0 { + d.handleMu.RUnlock() + break + } + for i := range lisafsDs { + name := string(lisafsDs[i].Name) + if name == "." || name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: name, + Ino: d.fs.inoFromKey(inoKey{ + ino: uint64(lisafsDs[i].Ino), + devMinor: uint32(lisafsDs[i].DevMinor), + devMajor: uint32(lisafsDs[i].DevMajor), + }), + NextOff: int64(len(dirents) + 1), + Type: uint8(lisafsDs[i].Type), + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[name] = struct{}{} + } } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: d.fs.inoFromQIDPath(p9d.QID.Path), - NextOff: int64(len(dirents) + 1), + } else { + p9ds, err := d.readFile.readdir(ctx, off, count) + if err != nil { + d.handleMu.RUnlock() + return nil, err } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG + if len(p9ds) == 0 { + d.handleMu.RUnlock() + break } - dirents = append(dirents, dirent) - if realChildren != nil { - realChildren[p9d.Name] = struct{}{} + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: d.fs.inoFromQIDPath(p9d.QID.Path), + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[p9d.Name] = struct{}{} + } } + off = p9ds[len(p9ds)-1].Offset } - off = p9ds[len(p9ds)-1].Offset } } // Emit entries for synthetic children. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 05b776c2e..f7b3446d3 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,10 +21,12 @@ import ( "sync" "sync/atomic" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/fsmetric" @@ -33,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -54,9 +55,47 @@ func (fs *filesystem) Sync(ctx context.Context) error { // regardless. var retErr error + if fs.opts.lisaEnabled { + // Try accumulating all FDIDs to fsync and fsync then via one RPC as + // opposed to making an RPC per FDID. Passing a non-nil accFsyncFDIDs to + // dentry.syncCachedFile() and specialFileFD.sync() will cause them to not + // make an RPC, instead accumulate syncable FDIDs in the passed slice. + accFsyncFDIDs := make([]lisafs.FDID, 0, len(ds)+len(sffds)) + + // Sync syncable dentries. + for _, d := range ds { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */, &accFsyncFDIDs); err != nil { + ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) + if retErr == nil { + retErr = err + } + } + } + + // Sync special files, which may be writable but do not use dentry shared + // handles (so they won't be synced by the above). + for _, sffd := range sffds { + if err := sffd.sync(ctx, true /* forFilesystemSync */, &accFsyncFDIDs); err != nil { + ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) + if retErr == nil { + retErr = err + } + } + } + + if err := fs.clientLisa.SyncFDs(ctx, accFsyncFDIDs); err != nil { + ctx.Infof("gofer.filesystem.Sync: fs.fsyncMultipleFDLisa failed: %v", err) + if retErr == nil { + retErr = err + } + } + + return retErr + } + // Sync syncable dentries. for _, d := range ds { - if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */); err != nil { ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) if retErr == nil { retErr = err @@ -67,7 +106,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { + if err := sffd.sync(ctx, true /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */); err != nil { ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) if retErr == nil { retErr = err @@ -198,7 +237,13 @@ afterSymlink: rp.Advance() return d.parent, followedSymlink, nil } - child, err := fs.getChildLocked(ctx, d, name, ds) + var child *dentry + var err error + if fs.opts.lisaEnabled { + child, err = fs.getChildAndWalkPathLocked(ctx, d, rp, ds) + } else { + child, err = fs.getChildLocked(ctx, d, name, ds) + } if err != nil { return nil, false, err } @@ -220,6 +265,99 @@ afterSymlink: return child, followedSymlink, nil } +// Preconditions: +// * fs.opts.lisaEnabled. +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * parent and the dentry at name have been revalidated. +func (fs *filesystem) getChildAndWalkPathLocked(ctx context.Context, parent *dentry, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + // Note that pit is a copy of the iterator that does not affect rp. + pit := rp.Pit() + first := pit.String() + if len(first) > maxFilenameLen { + return nil, linuxerr.ENAMETOOLONG + } + if child, ok := parent.children[first]; ok || parent.isSynthetic() { + if child == nil { + return nil, linuxerr.ENOENT + } + return child, nil + } + + // Walk as much of the path as possible in 1 RPC. + names := []string{first} + for pit = pit.Next(); pit.Ok(); pit = pit.Next() { + name := pit.String() + if name == "." { + continue + } + if name == ".." { + break + } + names = append(names, name) + } + status, inodes, err := parent.controlFDLisa.WalkMultiple(ctx, names) + if err != nil { + return nil, err + } + if len(inodes) == 0 { + parent.cacheNegativeLookupLocked(first) + return nil, linuxerr.ENOENT + } + + // Add the walked inodes into the dentry tree. + curParent := parent + curParentDirMuLock := func() { + if curParent != parent { + curParent.dirMu.Lock() + } + } + curParentDirMuUnlock := func() { + if curParent != parent { + curParent.dirMu.Unlock() // +checklocksforce: locked via curParentDirMuLock(). + } + } + var ret *dentry + var dentryCreationErr error + for i := range inodes { + if dentryCreationErr != nil { + fs.clientLisa.CloseFDBatched(ctx, inodes[i].ControlFD) + continue + } + + child, err := fs.newDentryLisa(ctx, &inodes[i]) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, inodes[i].ControlFD) + dentryCreationErr = err + continue + } + curParentDirMuLock() + curParent.cacheNewChildLocked(child, names[i]) + curParentDirMuUnlock() + // For now, child has 0 references, so our caller should call + // child.checkCachingLocked(). curParent gained a ref so we should also + // call curParent.checkCachingLocked() so it can be removed from the cache + // if needed. We only do that for the first iteration because all + // subsequent parents would have already been added to ds. + if i == 0 { + *ds = appendDentry(*ds, curParent) + } + *ds = appendDentry(*ds, child) + curParent = child + if i == 0 { + ret = child + } + } + + if status == lisafs.WalkComponentDoesNotExist && curParent.isDir() { + curParentDirMuLock() + curParent.cacheNegativeLookupLocked(names[len(inodes)]) + curParentDirMuUnlock() + } + return ret, dentryCreationErr +} + // getChildLocked returns a dentry representing the child of parent with the // given name. Returns ENOENT if the child doesn't exist. // @@ -228,32 +366,47 @@ afterSymlink: // * parent.dirMu must be locked. // * parent.isDir(). // * name is not "." or "..". -// * dentry at name has been revalidated +// * parent and the dentry at name have been revalidated. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, linuxerr.ENAMETOOLONG } if child, ok := parent.children[name]; ok || parent.isSynthetic() { if child == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return child, nil } - qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) - if err != nil { - if linuxerr.Equals(linuxerr.ENOENT, err) { - parent.cacheNegativeLookupLocked(name) + var child *dentry + if fs.opts.lisaEnabled { + childInode, err := parent.controlFDLisa.Walk(ctx, name) + if err != nil { + if linuxerr.Equals(linuxerr.ENOENT, err) { + parent.cacheNegativeLookupLocked(name) + } + return nil, err + } + // Create a new dentry representing the file. + child, err = fs.newDentryLisa(ctx, childInode) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, childInode.ControlFD) + return nil, err + } + } else { + qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) + if err != nil { + if linuxerr.Equals(linuxerr.ENOENT, err) { + parent.cacheNegativeLookupLocked(name) + } + return nil, err + } + // Create a new dentry representing the file. + child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) + if err != nil { + file.close(ctx) + return nil, err } - return nil, err - } - - // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) - if err != nil { - file.close(ctx) - delete(parent.children, name) - return nil, err } parent.cacheNewChildLocked(child, name) appendNewChildDentry(ds, parent, child) @@ -329,7 +482,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error), createInSyntheticDir func(parent *dentry, name string) error, updateChild func(child *dentry)) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -349,7 +502,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } if parent.isDeleted() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil { return err @@ -395,7 +548,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return err } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if parent.isSynthetic() { if createInSyntheticDir == nil { @@ -416,9 +569,26 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // No cached dentry exists; however, in InteropModeShared there might still be // an existing file at name. Just attempt the file creation RPC anyways. If a // file does exist, the RPC will fail with EEXIST like we would have. - if err := createInRemoteDir(parent, name, &ds); err != nil { + lisaInode, err := createInRemoteDir(parent, name, &ds) + if err != nil { return err } + // lisafs may aggresively cache newly created inodes. This has helped reduce + // Walk RPCs in practice. + if lisaInode != nil { + child, err := fs.newDentryLisa(ctx, lisaInode) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, lisaInode.ControlFD) + return err + } + parent.cacheNewChildLocked(child, name) + appendNewChildDentry(&ds, parent, child) + + // lisafs may update dentry properties upon successful creation. + if updateChild != nil { + updateChild(child) + } + } if fs.opts.interop != InteropModeShared { if child, ok := parent.children[name]; ok && child == nil { // Delete the now-stale negative dentry. @@ -463,7 +633,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } } else { if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } } @@ -486,7 +656,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b child, ok = parent.children[name] if ok && child == nil { // Hit a negative cached entry, child doesn't exist. - return syserror.ENOENT + return linuxerr.ENOENT } } else { child, _, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) @@ -552,7 +722,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // child must be a non-directory file. if child != nil && child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { if child != nil { @@ -563,10 +733,14 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if parent.isSynthetic() { if child == nil { - return syserror.ENOENT + return linuxerr.ENOENT } } else if child == nil || !child.isSynthetic() { - err = parent.file.unlinkAt(ctx, name, flags) + if fs.opts.lisaEnabled { + err = parent.controlFDLisa.UnlinkAt(ctx, name, flags) + } else { + err = parent.file.unlinkAt(ctx, name, flags) + } if err != nil { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. @@ -659,40 +833,43 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error { + err := fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, ds **[]*dentry) (*lisafs.Inode, error) { if rp.Mount() != vd.Mount() { - return linuxerr.EXDEV + return nil, linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) if d.isDir() { - return linuxerr.EPERM + return nil, linuxerr.EPERM } gid := auth.KGID(atomic.LoadUint32(&d.gid)) uid := auth.KUID(atomic.LoadUint32(&d.uid)) mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { - return err + return nil, err } if d.nlink == 0 { - return syserror.ENOENT + return nil, linuxerr.ENOENT } if d.nlink == math.MaxUint32 { - return linuxerr.EMLINK + return nil, linuxerr.EMLINK } - if err := parent.file.link(ctx, d.file, childName); err != nil { - return err + if fs.opts.lisaEnabled { + return parent.controlFDLisa.LinkAt(ctx, d.controlFDLisa.ID(), childName) } + return nil, parent.file.link(ctx, d.file, childName) + }, nil, nil) + if err == nil { // Success! - atomic.AddUint32(&d.nlink, 1) - return nil - }, nil) + vd.Dentry().Impl().(*dentry).incLinks() + } + return err } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { creds := rp.Credentials() - return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) { // If the parent is a setgid directory, use the parent's GID // rather than the caller's and enable setgid. kgid := creds.EffectiveKGID @@ -701,9 +878,18 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v kgid = auth.KGID(atomic.LoadUint32(&parent.gid)) mode |= linux.S_ISGID } - if _, err := parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)); err != nil { + var ( + childDirInode *lisafs.Inode + err error + ) + if fs.opts.lisaEnabled { + childDirInode, err = parent.controlFDLisa.MkdirAt(ctx, name, mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(kgid)) + } else { + _, err = parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) + } + if err != nil { if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { - return err + return nil, err } ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) parent.createSyntheticChildLocked(&createSyntheticOpts{ @@ -717,7 +903,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if fs.opts.interop != InteropModeShared { parent.incLinks() } - return nil + return childDirInode, nil }, func(parent *dentry, name string) error { if !opts.ForSyntheticMountpoint { // Can't create non-synthetic files in synthetic directories. @@ -731,16 +917,26 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }) parent.incLinks() return nil - }) + }, nil) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) { creds := rp.Credentials() - _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - if !linuxerr.Equals(linuxerr.EPERM, err) { - return err + var ( + childInode *lisafs.Inode + err error + ) + if fs.opts.lisaEnabled { + childInode, err = parent.controlFDLisa.MknodAt(ctx, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID), opts.DevMinor, opts.DevMajor) + } else { + _, err = parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + } + if err == nil { + return childInode, nil + } else if !linuxerr.Equals(linuxerr.EPERM, err) { + return nil, err } // EPERM means that gofer does not allow creating a socket or pipe. Fallback @@ -751,10 +947,10 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v switch { case err == nil: // Step succeeded, another file exists. - return linuxerr.EEXIST + return nil, linuxerr.EEXIST case !linuxerr.Equals(linuxerr.ENOENT, err): // Unexpected error. - return err + return nil, err } switch opts.Mode.FileType() { @@ -767,7 +963,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v endpoint: opts.Endpoint, }) *ds = appendDentry(*ds, parent) - return nil + return nil, nil case linux.S_IFIFO: parent.createSyntheticChildLocked(&createSyntheticOpts{ name: name, @@ -777,11 +973,11 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize), }) *ds = appendDentry(*ds, parent) - return nil + return nil, nil } // Retain error from gofer if synthetic file cannot be created internally. - return linuxerr.EPERM - }, nil) + return nil, linuxerr.EPERM + }, nil, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -811,7 +1007,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -841,7 +1037,7 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil { return nil, err @@ -922,11 +1118,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL @@ -987,6 +1183,23 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } + if d.fs.opts.lisaEnabled { + // Note that special value of linux.SockType = 0 is interpreted by lisafs + // as "do not care about the socket type". Analogous to p9.AnonymousSocket. + sockFD, err := d.controlFDLisa.Connect(ctx, 0 /* sockType */) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), sockFD, &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + unix.Close(sockFD) + return nil, err + } + return fd, nil + } fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) if err != nil { return nil, err @@ -999,6 +1212,7 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio fdObj.Close() return nil, err } + // Ownership has been transferred to fd. fdObj.Release() return fd, nil } @@ -1018,7 +1232,13 @@ func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs. // since closed its end. isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 retry: - h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + var h handle + var err error + if d.fs.opts.lisaEnabled { + h, err = openHandleLisa(ctx, d.controlFDLisa, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + } else { + h, err = openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + } if err != nil { if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) { // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails @@ -1054,7 +1274,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving return nil, err } if d.isDeleted() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -1062,18 +1282,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } defer mnt.EndWrite() - // 9P2000.L's lcreate takes a fid representing the parent directory, and - // converts it into an open fid representing the created file, so we need - // to duplicate the directory fid first. - _, dirfile, err := d.file.walk(ctx, nil) - if err != nil { - return nil, err - } creds := rp.Credentials() name := rp.Component() - // We only want the access mode for creating the file. - createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask - // If the parent is a setgid directory, use the parent's GID rather // than the caller's. kgid := creds.EffectiveKGID @@ -1081,51 +1291,87 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving kgid = auth.KGID(atomic.LoadUint32(&d.gid)) } - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) - if err != nil { - dirfile.close(ctx) - return nil, err - } - // Then we need to walk to the file we just created to get a non-open fid - // representing it, and to get its metadata. This must use d.file since, as - // explained above, dirfile was invalidated by dirfile.Create(). - _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) - if err != nil { - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() + var child *dentry + var openP9File p9file + openLisaFD := lisafs.InvalidFDID + openHostFD := int32(-1) + if d.fs.opts.lisaEnabled { + ino, openFD, hostFD, err := d.controlFDLisa.OpenCreateAt(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(kgid)) + if err != nil { + return nil, err + } + openHostFD = int32(hostFD) + openLisaFD = openFD + + child, err = d.fs.newDentryLisa(ctx, &ino) + if err != nil { + d.fs.clientLisa.CloseFDBatched(ctx, ino.ControlFD) + d.fs.clientLisa.CloseFDBatched(ctx, openFD) + if hostFD >= 0 { + unix.Close(hostFD) + } + return nil, err + } + } else { + // 9P2000.L's lcreate takes a fid representing the parent directory, and + // converts it into an open fid representing the created file, so we need + // to duplicate the directory fid first. + _, dirfile, err := d.file.walk(ctx, nil) + if err != nil { + return nil, err + } + // We only want the access mode for creating the file. + createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask + + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) + if err != nil { + dirfile.close(ctx) + return nil, err + } + // Then we need to walk to the file we just created to get a non-open fid + // representing it, and to get its metadata. This must use d.file since, as + // explained above, dirfile was invalidated by dirfile.Create(). + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + if err != nil { + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + + // Construct the new dentry. + child, err = d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) + if err != nil { + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err } - return nil, err - } - // Construct the new dentry. - child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) - if err != nil { - nonOpenFile.close(ctx) - openFile.close(ctx) if fdobj != nil { - fdobj.Close() + openHostFD = int32(fdobj.Release()) } - return nil, err + openP9File = openFile } // Incorporate the fid that was opened by lcreate. useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { - openFD := int32(-1) - if fdobj != nil { - openFD = int32(fdobj.Release()) - } child.handleMu.Lock() if vfs.MayReadFileWithOpenFlags(opts.Flags) { - child.readFile = openFile - if fdobj != nil { - child.readFD = openFD - child.mmapFD = openFD + child.readFile = openP9File + child.readFDLisa = d.fs.clientLisa.NewFD(openLisaFD) + if openHostFD != -1 { + child.readFD = openHostFD + child.mmapFD = openHostFD } } if vfs.MayWriteFileWithOpenFlags(opts.Flags) { - child.writeFile = openFile - child.writeFD = openFD + child.writeFile = openP9File + child.writeFDLisa = d.fs.clientLisa.NewFD(openLisaFD) + child.writeFD = openHostFD } child.handleMu.Unlock() } @@ -1147,11 +1393,9 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving childVFSFD = &fd.vfsfd } else { h := handle{ - file: openFile, - fd: -1, - } - if fdobj != nil { - h.fd = int32(fdobj.Release()) + file: openP9File, + fdLisa: d.fs.clientLisa.NewFD(openLisaFD), + fd: openHostFD, } fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) if err != nil { @@ -1268,7 +1512,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer newParent.dirMu.Unlock() } if newParent.isDeleted() { - return syserror.ENOENT + return linuxerr.ENOENT } replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds) if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { @@ -1282,7 +1526,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY @@ -1305,7 +1549,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Update the remote filesystem. if !renamed.isSynthetic() { - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + if fs.opts.lisaEnabled { + err = renamed.controlFDLisa.RenameTo(ctx, newParent.controlFDLisa.ID(), newName) + } else { + err = renamed.file.rename(ctx, newParent.file, newName) + } + if err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } @@ -1316,7 +1565,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced.isDir() { flags = linux.AT_REMOVEDIR } - if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { + if fs.opts.lisaEnabled { + err = newParent.controlFDLisa.UnlinkAt(ctx, newName, flags) + } else { + err = newParent.file.unlinkAt(ctx, newName, flags) + } + if err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } @@ -1432,6 +1686,28 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu for d.isSynthetic() { d = d.parent } + if fs.opts.lisaEnabled { + var statFS lisafs.StatFS + if err := d.controlFDLisa.StatFSTo(ctx, &statFS); err != nil { + return linux.Statfs{}, err + } + if statFS.NameLength > maxFilenameLen { + statFS.NameLength = maxFilenameLen + } + return linux.Statfs{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + BlockSize: statFS.BlockSize, + Blocks: statFS.Blocks, + BlocksFree: statFS.BlocksFree, + BlocksAvailable: statFS.BlocksAvailable, + Files: statFS.Files, + FilesFree: statFS.FilesFree, + NameLength: statFS.NameLength, + }, nil + } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err @@ -1457,11 +1733,21 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) { creds := rp.Credentials() + if fs.opts.lisaEnabled { + return parent.controlFDLisa.SymlinkAt(ctx, name, target, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) + } _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err - }, nil) + return nil, err + }, nil, func(child *dentry) { + if fs.opts.interop != InteropModeShared { + // lisafs caches the symlink target on creation. In practice, this + // helps avoid a lot of ReadLink RPCs. + child.haveTarget = true + child.target = target + } + }) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. @@ -1506,7 +1792,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si if err != nil { return nil, err } - return d.listXattr(ctx, rp.Credentials(), size) + return d.listXattr(ctx, size) } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. @@ -1613,6 +1899,9 @@ func (fs *filesystem) MountOptions() string { if fs.opts.overlayfsStaleRead { optsKV = append(optsKV, mopt{moptOverlayfsStaleRead, nil}) } + if fs.opts.lisaEnabled { + optsKV = append(optsKV, mopt{moptLisafs, nil}) + } opts := make([]string, 0, len(optsKV)) for _, opt := range optsKV { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 25d2e39d6..b98825e26 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -48,6 +48,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" @@ -62,7 +63,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" ) @@ -84,6 +84,7 @@ const ( moptForcePageCache = "force_page_cache" moptLimitHostFDTranslation = "limit_host_fd_translation" moptOverlayfsStaleRead = "overlayfs_stale_read" + moptLisafs = "lisafs" ) // Valid values for the "cache" mount option. @@ -119,6 +120,10 @@ type filesystem struct { // client is the client used by this filesystem. client is immutable. client *p9.Client `state:"nosave"` + // clientLisa is the client used for communicating with the server when + // lisafs is enabled. lisafsCient is immutable. + clientLisa *lisafs.Client `state:"nosave"` + // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock @@ -162,6 +167,12 @@ type filesystem struct { inoMu sync.Mutex `state:"nosave"` inoByQIDPath map[uint64]uint64 `state:"nosave"` + // inoByKey is the same as inoByQIDPath but only used by lisafs. It helps + // identify inodes based on the device ID and host inode number provided + // by the gofer process. It is not preserved across checkpoint/restore for + // the same reason as above. inoByKey is protected by inoMu. + inoByKey map[inoKey]uint64 `state:"nosave"` + // lastIno is the last inode number assigned to a file. lastIno is accessed // using atomic memory operations. lastIno uint64 @@ -215,6 +226,10 @@ type filesystemOptions struct { // way that application FDs representing "special files" such as sockets // do. Note that this disables client caching and mmap for regular files. regularFilesUseSpecialFileFD bool + + // lisaEnabled indicates whether the client will use lisafs protocol to + // communicate with the server instead of 9P. + lisaEnabled bool } // InteropMode controls the client's interaction with other remote filesystem @@ -428,6 +443,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, moptOverlayfsStaleRead) fsopts.overlayfsStaleRead = true } + if lisafs, ok := mopts[moptLisafs]; ok { + delete(mopts, moptLisafs) + fsopts.lisaEnabled, err = strconv.ParseBool(lisafs) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid lisafs option: %s", lisafs) + return nil, nil, linuxerr.EINVAL + } + } // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying // "cache=none". @@ -459,44 +482,83 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), inoByQIDPath: make(map[uint64]uint64), + inoByKey: make(map[inoKey]uint64), } fs.vfsfs.Init(vfsObj, &fstype, fs) + if err := fs.initClientAndRoot(ctx); err != nil { + fs.vfsfs.DecRef(ctx) + return nil, nil, err + } + + return &fs.vfsfs, &fs.root.vfsd, nil +} + +func (fs *filesystem) initClientAndRoot(ctx context.Context) error { + var err error + if fs.opts.lisaEnabled { + var rootInode *lisafs.Inode + rootInode, err = fs.initClientLisa(ctx) + if err != nil { + return err + } + fs.root, err = fs.newDentryLisa(ctx, rootInode) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, rootInode.ControlFD) + } + } else { + fs.root, err = fs.initClient(ctx) + } + + // Set the root's reference count to 2. One reference is returned to the + // caller, and the other is held by fs to prevent the root from being "cached" + // and subsequently evicted. + if err == nil { + fs.root.refs = 2 + } + return err +} + +func (fs *filesystem) initClientLisa(ctx context.Context) (*lisafs.Inode, error) { + sock, err := unet.NewSocket(fs.opts.fd) + if err != nil { + return nil, err + } + + var rootInode *lisafs.Inode + ctx.UninterruptibleSleepStart(false) + fs.clientLisa, rootInode, err = lisafs.NewClient(sock, fs.opts.aname) + ctx.UninterruptibleSleepFinish(false) + return rootInode, err +} + +func (fs *filesystem) initClient(ctx context.Context) (*dentry, error) { // Connect to the server. if err := fs.dial(ctx); err != nil { - return nil, nil, err + return nil, err } // Perform attach to obtain the filesystem root. ctx.UninterruptibleSleepStart(false) - attached, err := fs.client.Attach(fsopts.aname) + attached, err := fs.client.Attach(fs.opts.aname) ctx.UninterruptibleSleepFinish(false) if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err + return nil, err } attachFile := p9file{attached} qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef(ctx) - return nil, nil, err + return nil, err } // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef(ctx) - return nil, nil, err + return nil, err } - // Set the root's reference count to 2. One reference is returned to the - // caller, and the other is held by fs to prevent the root from being "cached" - // and subsequently evicted. - root.refs = 2 - fs.root = root - - return &fs.vfsfs, &root.vfsd, nil + return root, nil } func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { @@ -614,7 +676,11 @@ func (fs *filesystem) Release(ctx context.Context) { if !fs.iopts.LeakConnection { // Close the connection to the server. This implicitly clunks all fids. - fs.client.Close() + if fs.opts.lisaEnabled { + fs.clientLisa.Close() + } else { + fs.client.Close() + } } fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -645,6 +711,23 @@ func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { } } +// inoKey is the key used to identify the inode backed by this dentry. +// +// +stateify savable +type inoKey struct { + ino uint64 + devMinor uint32 + devMajor uint32 +} + +func inoKeyFromStat(stat *linux.Statx) inoKey { + return inoKey{ + ino: stat.Ino, + devMinor: stat.DevMinor, + devMajor: stat.DevMajor, + } +} + // dentry implements vfs.DentryImpl. // // +stateify savable @@ -675,6 +758,9 @@ type dentry struct { // qidPath is the p9.QID.Path for this file. qidPath is immutable. qidPath uint64 + // inoKey is used to identify this dentry's inode. + inoKey inoKey + // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file @@ -682,6 +768,14 @@ type dentry struct { // only files that can be synthetic are sockets, pipes, and directories. file p9file `state:"nosave"` + // controlFDLisa is used by lisafs to perform path based operations on this + // dentry. + // + // if !controlFDLisa.Ok(), this dentry represents a synthetic file, i.e. a + // file that does not exist on the remote filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. + controlFDLisa lisafs.ClientFD `state:"nosave"` + // If deleted is non-zero, the file represented by this dentry has been // deleted. deleted is accessed using atomic memory operations. deleted uint32 @@ -792,12 +886,14 @@ type dentry struct { // always either -1 or equal to readFD; if !writeFile.isNil() (the file has // been opened for writing), it is additionally either -1 or equal to // writeFD. - handleMu sync.RWMutex `state:"nosave"` - readFile p9file `state:"nosave"` - writeFile p9file `state:"nosave"` - readFD int32 `state:"nosave"` - writeFD int32 `state:"nosave"` - mmapFD int32 `state:"nosave"` + handleMu sync.RWMutex `state:"nosave"` + readFile p9file `state:"nosave"` + writeFile p9file `state:"nosave"` + readFDLisa lisafs.ClientFD `state:"nosave"` + writeFDLisa lisafs.ClientFD `state:"nosave"` + readFD int32 `state:"nosave"` + writeFD int32 `state:"nosave"` + mmapFD int32 `state:"nosave"` dataMu sync.RWMutex `state:"nosave"` @@ -865,11 +961,11 @@ func dentryAttrMask() p9.AttrMask { func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") - return nil, syserror.EIO + return nil, linuxerr.EIO } if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { ctx.Warningf("can't create regular file gofer.dentry without file size") - return nil, syserror.EIO + return nil, linuxerr.EIO } d := &dentry{ @@ -921,6 +1017,79 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma return d, nil } +func (fs *filesystem) newDentryLisa(ctx context.Context, ino *lisafs.Inode) (*dentry, error) { + if ino.Stat.Mask&linux.STATX_TYPE == 0 { + ctx.Warningf("can't create gofer.dentry without file type") + return nil, linuxerr.EIO + } + if ino.Stat.Mode&linux.FileTypeMask == linux.ModeRegular && ino.Stat.Mask&linux.STATX_SIZE == 0 { + ctx.Warningf("can't create regular file gofer.dentry without file size") + return nil, linuxerr.EIO + } + + inoKey := inoKeyFromStat(&ino.Stat) + d := &dentry{ + fs: fs, + inoKey: inoKey, + ino: fs.inoFromKey(inoKey), + mode: uint32(ino.Stat.Mode), + uid: uint32(fs.opts.dfltuid), + gid: uint32(fs.opts.dfltgid), + blockSize: hostarch.PageSize, + readFD: -1, + writeFD: -1, + mmapFD: -1, + controlFDLisa: fs.clientLisa.NewFD(ino.ControlFD), + } + + d.pf.dentry = d + if ino.Stat.Mask&linux.STATX_UID != 0 { + d.uid = dentryUIDFromLisaUID(lisafs.UID(ino.Stat.UID)) + } + if ino.Stat.Mask&linux.STATX_GID != 0 { + d.gid = dentryGIDFromLisaGID(lisafs.GID(ino.Stat.GID)) + } + if ino.Stat.Mask&linux.STATX_SIZE != 0 { + d.size = ino.Stat.Size + } + if ino.Stat.Blksize != 0 { + d.blockSize = ino.Stat.Blksize + } + if ino.Stat.Mask&linux.STATX_ATIME != 0 { + d.atime = dentryTimestampFromLisa(ino.Stat.Atime) + } + if ino.Stat.Mask&linux.STATX_MTIME != 0 { + d.mtime = dentryTimestampFromLisa(ino.Stat.Mtime) + } + if ino.Stat.Mask&linux.STATX_CTIME != 0 { + d.ctime = dentryTimestampFromLisa(ino.Stat.Ctime) + } + if ino.Stat.Mask&linux.STATX_BTIME != 0 { + d.btime = dentryTimestampFromLisa(ino.Stat.Btime) + } + if ino.Stat.Mask&linux.STATX_NLINK != 0 { + d.nlink = ino.Stat.Nlink + } + d.vfsd.Init(d) + refsvfs2.Register(d) + fs.syncMu.Lock() + fs.syncableDentries[d] = struct{}{} + fs.syncMu.Unlock() + return d, nil +} + +func (fs *filesystem) inoFromKey(key inoKey) uint64 { + fs.inoMu.Lock() + defer fs.inoMu.Unlock() + + if ino, ok := fs.inoByKey[key]; ok { + return ino + } + ino := fs.nextIno() + fs.inoByKey[key] = ino + return ino +} + func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 { fs.inoMu.Lock() defer fs.inoMu.Unlock() @@ -937,7 +1106,7 @@ func (fs *filesystem) nextIno() uint64 { } func (d *dentry) isSynthetic() bool { - return d.file.isNil() + return !d.isControlFileOk() } func (d *dentry) cachedMetadataAuthoritative() bool { @@ -987,6 +1156,50 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { } } +// updateFromLisaStatLocked is called to update d's metadata after an update +// from the remote filesystem. +// Precondition: d.metadataMu must be locked. +// +checklocks:d.metadataMu +func (d *dentry) updateFromLisaStatLocked(stat *linux.Statx) { + if stat.Mask&linux.STATX_TYPE != 0 { + if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { + panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) + } + } + if stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, uint32(stat.Mode)) + } + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, dentryUIDFromLisaUID(lisafs.UID(stat.UID))) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.uid, dentryGIDFromLisaGID(lisafs.GID(stat.GID))) + } + if stat.Blksize != 0 { + atomic.StoreUint32(&d.blockSize, stat.Blksize) + } + // Don't override newer client-defined timestamps with old server-defined + // ones. + if stat.Mask&linux.STATX_ATIME != 0 && atomic.LoadUint32(&d.atimeDirty) == 0 { + atomic.StoreInt64(&d.atime, dentryTimestampFromLisa(stat.Atime)) + } + if stat.Mask&linux.STATX_MTIME != 0 && atomic.LoadUint32(&d.mtimeDirty) == 0 { + atomic.StoreInt64(&d.mtime, dentryTimestampFromLisa(stat.Mtime)) + } + if stat.Mask&linux.STATX_CTIME != 0 { + atomic.StoreInt64(&d.ctime, dentryTimestampFromLisa(stat.Ctime)) + } + if stat.Mask&linux.STATX_BTIME != 0 { + atomic.StoreInt64(&d.btime, dentryTimestampFromLisa(stat.Btime)) + } + if stat.Mask&linux.STATX_NLINK != 0 { + atomic.StoreUint32(&d.nlink, stat.Nlink) + } + if stat.Mask&linux.STATX_SIZE != 0 { + d.updateSizeLocked(stat.Size) + } +} + // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. // +checklocks:d.metadataMu @@ -996,7 +1209,10 @@ func (d *dentry) refreshSizeLocked(ctx context.Context) error { if d.writeFD < 0 { d.handleMu.RUnlock() // Ask the gofer if we don't have a host FD. - return d.updateFromGetattrLocked(ctx) + if d.fs.opts.lisaEnabled { + return d.updateFromStatLisaLocked(ctx, nil) + } + return d.updateFromGetattrLocked(ctx, p9file{}) } var stat unix.Statx_t @@ -1015,33 +1231,77 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // updating stale attributes in d.updateFromP9AttrsLocked(). d.metadataMu.Lock() defer d.metadataMu.Unlock() - return d.updateFromGetattrLocked(ctx) + if d.fs.opts.lisaEnabled { + return d.updateFromStatLisaLocked(ctx, nil) + } + return d.updateFromGetattrLocked(ctx, p9file{}) } // Preconditions: // * !d.isSynthetic(). // * d.metadataMu is locked. // +checklocks:d.metadataMu -func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { - // Use d.readFile or d.writeFile, which represent 9P FIDs that have been - // opened, in preference to d.file, which represents a 9P fid that has not. - // This may be significantly more efficient in some implementations. Prefer - // d.writeFile over d.readFile since some filesystem implementations may - // update a writable handle's metadata after writes to that handle, without - // making metadata updates immediately visible to read-only handles - // representing the same file. - d.handleMu.RLock() - handleMuRLocked := true - var file p9file - switch { - case !d.writeFile.isNil(): - file = d.writeFile - case !d.readFile.isNil(): - file = d.readFile - default: - file = d.file - d.handleMu.RUnlock() - handleMuRLocked = false +func (d *dentry) updateFromStatLisaLocked(ctx context.Context, fdLisa *lisafs.ClientFD) error { + handleMuRLocked := false + if fdLisa == nil { + // Use open FDs in preferenece to the control FD. This may be significantly + // more efficient in some implementations. Prefer a writable FD over a + // readable one since some filesystem implementations may update a writable + // FD's metadata after writes, without making metadata updates immediately + // visible to read-only FDs representing the same file. + d.handleMu.RLock() + switch { + case d.writeFDLisa.Ok(): + fdLisa = &d.writeFDLisa + handleMuRLocked = true + case d.readFDLisa.Ok(): + fdLisa = &d.readFDLisa + handleMuRLocked = true + default: + fdLisa = &d.controlFDLisa + d.handleMu.RUnlock() + } + } + + var stat linux.Statx + err := fdLisa.StatTo(ctx, &stat) + if handleMuRLocked { + // handleMu must be released before updateFromLisaStatLocked(). + d.handleMu.RUnlock() // +checklocksforce: complex case. + } + if err != nil { + return err + } + d.updateFromLisaStatLocked(&stat) + return nil +} + +// Preconditions: +// * !d.isSynthetic(). +// * d.metadataMu is locked. +// +checklocks:d.metadataMu +func (d *dentry) updateFromGetattrLocked(ctx context.Context, file p9file) error { + handleMuRLocked := false + if file.isNil() { + // Use d.readFile or d.writeFile, which represent 9P FIDs that have + // been opened, in preference to d.file, which represents a 9P fid that + // has not. This may be significantly more efficient in some + // implementations. Prefer d.writeFile over d.readFile since some + // filesystem implementations may update a writable handle's metadata + // after writes to that handle, without making metadata updates + // immediately visible to read-only handles representing the same file. + d.handleMu.RLock() + switch { + case !d.writeFile.isNil(): + file = d.writeFile + handleMuRLocked = true + case !d.readFile.isNil(): + file = d.readFile + handleMuRLocked = true + default: + file = d.file + d.handleMu.RUnlock() + } } _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) @@ -1112,7 +1372,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs case linux.S_IFREG: // ok case linux.S_IFDIR: - return syserror.EISDIR + return linuxerr.EISDIR default: return linuxerr.EINVAL } @@ -1159,6 +1419,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } } + // failureMask indicates which attributes could not be set on the remote + // filesystem. p9 returns an error if any of the attributes could not be set + // but that leads to inconsistency as the server could have set a few + // attributes successfully but a later failure will cause the successful ones + // to not be updated in the dentry cache. + var failureMask uint32 + var failureErr error if !d.isSynthetic() { if stat.Mask != 0 { if stat.Mask&linux.STATX_SIZE != 0 { @@ -1168,35 +1435,50 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // the remote file has been truncated). d.dataMu.Lock() } - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - if stat.Mask&linux.STATX_SIZE != 0 { - d.dataMu.Unlock() // +checklocksforce: locked conditionally above + if d.fs.opts.lisaEnabled { + var err error + failureMask, failureErr, err = d.controlFDLisa.SetStat(ctx, stat) + if err != nil { + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } + return err + } + } else { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } + return err } - return err } if stat.Mask&linux.STATX_SIZE != 0 { - // d.size should be kept up to date, and privatized - // copy-on-write mappings of truncated pages need to be - // invalidated, even if InteropModeShared is in effect. - d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above + if failureMask&linux.STATX_SIZE == 0 { + // d.size should be kept up to date, and privatized + // copy-on-write mappings of truncated pages need to be + // invalidated, even if InteropModeShared is in effect. + d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above + } else { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } } } if d.fs.opts.interop == InteropModeShared { @@ -1207,13 +1489,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } } - if stat.Mask&linux.STATX_MODE != 0 { + if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } - if stat.Mask&linux.STATX_UID != 0 { + if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { atomic.StoreUint32(&d.uid, stat.UID) } - if stat.Mask&linux.STATX_GID != 0 { + if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { atomic.StoreUint32(&d.gid, stat.GID) } // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because @@ -1221,15 +1503,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // stat.Mtime to client-local timestamps above, and if // !d.cachedMetadataAuthoritative() then we returned after calling // d.file.setAttr(). For the same reason, now must have been initialized. - if stat.Mask&linux.STATX_ATIME != 0 { + if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { atomic.StoreInt64(&d.atime, stat.Atime.ToNsec()) atomic.StoreUint32(&d.atimeDirty, 0) } - if stat.Mask&linux.STATX_MTIME != 0 { + if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec()) atomic.StoreUint32(&d.mtimeDirty, 0) } atomic.StoreInt64(&d.ctime, now) + if failureMask != 0 { + // Setting some attribute failed on the remote filesystem. + return failureErr + } return nil } @@ -1309,7 +1595,10 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats // (b/148380782). Allow all other extended attributes to be passed through // to the remote filesystem. This is inconsistent with Linux's 9p client, // but consistent with other filesystems (e.g. FUSE). - if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { + // + // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is + // consistent with the VFS1 gofer client. + if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return linuxerr.EOPNOTSUPP } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) @@ -1345,6 +1634,20 @@ func dentryGIDFromP9GID(gid p9.GID) uint32 { return uint32(gid) } +func dentryUIDFromLisaUID(uid lisafs.UID) uint32 { + if !uid.Ok() { + return uint32(auth.OverflowUID) + } + return uint32(uid) +} + +func dentryGIDFromLisaGID(gid lisafs.GID) uint32 { + if !gid.Ok() { + return uint32(auth.OverflowGID) + } + return uint32(gid) +} + // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against @@ -1653,15 +1956,24 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.dirty.RemoveAll() } d.dataMu.Unlock() - // Clunk open fids and close open host FDs. - if !d.readFile.isNil() { - _ = d.readFile.close(ctx) - } - if !d.writeFile.isNil() && d.readFile != d.writeFile { - _ = d.writeFile.close(ctx) + if d.fs.opts.lisaEnabled { + if d.readFDLisa.Ok() && d.readFDLisa.ID() != d.writeFDLisa.ID() { + d.readFDLisa.CloseBatched(ctx) + } + if d.writeFDLisa.Ok() { + d.writeFDLisa.CloseBatched(ctx) + } + } else { + // Clunk open fids and close open host FDs. + if !d.readFile.isNil() { + _ = d.readFile.close(ctx) + } + if !d.writeFile.isNil() && d.readFile != d.writeFile { + _ = d.writeFile.close(ctx) + } + d.readFile = p9file{} + d.writeFile = p9file{} } - d.readFile = p9file{} - d.writeFile = p9file{} if d.readFD >= 0 { _ = unix.Close(int(d.readFD)) } @@ -1673,7 +1985,7 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.mmapFD = -1 d.handleMu.Unlock() - if !d.file.isNil() { + if d.isControlFileOk() { // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, // i.e. client and server timestamps may differ (because e.g. a client // write was serviced by the page cache, and only written back to the @@ -1682,10 +1994,16 @@ func (d *dentry) destroyLocked(ctx context.Context) { // instantiated for the same file would remain coherent. Unfortunately, // this turns out to be too expensive in many cases, so for now we // don't do this. - if err := d.file.close(ctx); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) + + // Close the control FD. + if d.fs.opts.lisaEnabled { + d.controlFDLisa.CloseBatched(ctx) + } else { + if err := d.file.close(ctx); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) + } + d.file = p9file{} } - d.file = p9file{} // Remove d from the set of syncable dentries. d.fs.syncMu.Lock() @@ -1711,10 +2029,29 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { - if d.file.isNil() { +func (d *dentry) isControlFileOk() bool { + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.Ok() + } + return !d.file.isNil() +} + +func (d *dentry) isReadFileOk() bool { + if d.fs.opts.lisaEnabled { + return d.readFDLisa.Ok() + } + return !d.readFile.isNil() +} + +func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { + if !d.isControlFileOk() { return nil, nil } + + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.ListXattr(ctx, size) + } + xattrMap, err := d.file.listXattr(ctx, size) if err != nil { return nil, err @@ -1727,32 +2064,41 @@ func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size ui } func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { - if d.file.isNil() { + if !d.isControlFileOk() { return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.GetXattr(ctx, opts.Name, opts.Size) + } return d.file.getXattr(ctx, opts.Name, opts.Size) } func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { - if d.file.isNil() { + if !d.isControlFileOk() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.SetXattr(ctx, opts.Name, opts.Value, opts.Flags) + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { - if d.file.isNil() { + if !d.isControlFileOk() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.RemoveXattr(ctx, name) + } return d.file.removeXattr(ctx, name) } @@ -1764,19 +2110,30 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // O_TRUNC). if !trunc { d.handleMu.RLock() - if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) { + var canReuseCurHandle bool + if d.fs.opts.lisaEnabled { + canReuseCurHandle = (!read || d.readFDLisa.Ok()) && (!write || d.writeFDLisa.Ok()) + } else { + canReuseCurHandle = (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) + } + d.handleMu.RUnlock() + if canReuseCurHandle { // Current handles are sufficient. - d.handleMu.RUnlock() return nil } - d.handleMu.RUnlock() } var fdsToCloseArr [2]int32 fdsToClose := fdsToCloseArr[:0] invalidateTranslations := false d.handleMu.Lock() - if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc { + var needNewHandle bool + if d.fs.opts.lisaEnabled { + needNewHandle = (read && !d.readFDLisa.Ok()) || (write && !d.writeFDLisa.Ok()) || trunc + } else { + needNewHandle = (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc + } + if needNewHandle { // Get a new handle. If this file has been opened for both reading and // writing, try to get a single handle that is usable for both: // @@ -1785,9 +2142,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // // - NOTE(b/141991141): Some filesystems may not ensure coherence // between multiple handles for the same file. - openReadable := !d.readFile.isNil() || read - openWritable := !d.writeFile.isNil() || write - h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) + var ( + openReadable bool + openWritable bool + h handle + err error + ) + if d.fs.opts.lisaEnabled { + openReadable = d.readFDLisa.Ok() || read + openWritable = d.writeFDLisa.Ok() || write + h, err = openHandleLisa(ctx, d.controlFDLisa, openReadable, openWritable, trunc) + } else { + openReadable = !d.readFile.isNil() || read + openWritable = !d.writeFile.isNil() || write + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { // It may not be possible to use a single handle for both // reading and writing, since permissions on the file may have @@ -1797,7 +2166,11 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) openReadable = read openWritable = write - h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + if d.fs.opts.lisaEnabled { + h, err = openHandleLisa(ctx, d.controlFDLisa, openReadable, openWritable, trunc) + } else { + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } } if err != nil { d.handleMu.Unlock() @@ -1859,9 +2232,16 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // previously opened for reading (without an FD), then existing // translations of the file may use the internal page cache; // invalidate those mappings. - if d.writeFile.isNil() { - invalidateTranslations = !d.readFile.isNil() - atomic.StoreInt32(&d.mmapFD, h.fd) + if d.fs.opts.lisaEnabled { + if !d.writeFDLisa.Ok() { + invalidateTranslations = d.readFDLisa.Ok() + atomic.StoreInt32(&d.mmapFD, h.fd) + } + } else { + if d.writeFile.isNil() { + invalidateTranslations = !d.readFile.isNil() + atomic.StoreInt32(&d.mmapFD, h.fd) + } } } else if openWritable && d.writeFD < 0 { atomic.StoreInt32(&d.writeFD, h.fd) @@ -1888,24 +2268,45 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool atomic.StoreInt32(&d.mmapFD, -1) } - // Switch to new fids. - var oldReadFile p9file - if openReadable { - oldReadFile = d.readFile - d.readFile = h.file - } - var oldWriteFile p9file - if openWritable { - oldWriteFile = d.writeFile - d.writeFile = h.file - } - // NOTE(b/141991141): Clunk old fids before making new fids visible (by - // unlocking d.handleMu). - if !oldReadFile.isNil() { - oldReadFile.close(ctx) - } - if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { - oldWriteFile.close(ctx) + // Switch to new fids/FDs. + if d.fs.opts.lisaEnabled { + oldReadFD := lisafs.InvalidFDID + if openReadable { + oldReadFD = d.readFDLisa.ID() + d.readFDLisa = h.fdLisa + } + oldWriteFD := lisafs.InvalidFDID + if openWritable { + oldWriteFD = d.writeFDLisa.ID() + d.writeFDLisa = h.fdLisa + } + // NOTE(b/141991141): Close old FDs before making new fids visible (by + // unlocking d.handleMu). + if oldReadFD.Ok() { + d.fs.clientLisa.CloseFDBatched(ctx, oldReadFD) + } + if oldWriteFD.Ok() && oldReadFD != oldWriteFD { + d.fs.clientLisa.CloseFDBatched(ctx, oldWriteFD) + } + } else { + var oldReadFile p9file + if openReadable { + oldReadFile = d.readFile + d.readFile = h.file + } + var oldWriteFile p9file + if openWritable { + oldWriteFile = d.writeFile + d.writeFile = h.file + } + // NOTE(b/141991141): Clunk old fids before making new fids visible (by + // unlocking d.handleMu). + if !oldReadFile.isNil() { + oldReadFile.close(ctx) + } + if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { + oldWriteFile.close(ctx) + } } } d.handleMu.Unlock() @@ -1929,27 +2330,29 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // Preconditions: d.handleMu must be locked. func (d *dentry) readHandleLocked() handle { return handle{ - file: d.readFile, - fd: d.readFD, + fdLisa: d.readFDLisa, + file: d.readFile, + fd: d.readFD, } } // Preconditions: d.handleMu must be locked. func (d *dentry) writeHandleLocked() handle { return handle{ - file: d.writeFile, - fd: d.writeFD, + fdLisa: d.writeFDLisa, + file: d.writeFile, + fd: d.writeFD, } } func (d *dentry) syncRemoteFile(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() - return d.syncRemoteFileLocked(ctx) + return d.syncRemoteFileLocked(ctx, nil /* accFsyncFDIDsLisa */) } // Preconditions: d.handleMu must be locked. -func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { +func (d *dentry) syncRemoteFileLocked(ctx context.Context, accFsyncFDIDsLisa *[]lisafs.FDID) error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. Prefer syncing write handles over read handles, since some remote // filesystem implementations may not sync changes made through write @@ -1960,7 +2363,13 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { ctx.UninterruptibleSleepFinish(false) return err } - if !d.writeFile.isNil() { + if d.fs.opts.lisaEnabled && d.writeFDLisa.Ok() { + if accFsyncFDIDsLisa != nil { + *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, d.writeFDLisa.ID()) + return nil + } + return d.writeFDLisa.Sync(ctx) + } else if !d.fs.opts.lisaEnabled && !d.writeFile.isNil() { return d.writeFile.fsync(ctx) } if d.readFD >= 0 { @@ -1969,13 +2378,19 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { ctx.UninterruptibleSleepFinish(false) return err } - if !d.readFile.isNil() { + if d.fs.opts.lisaEnabled && d.readFDLisa.Ok() { + if accFsyncFDIDsLisa != nil { + *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, d.readFDLisa.ID()) + return nil + } + return d.readFDLisa.Sync(ctx) + } else if !d.fs.opts.lisaEnabled && !d.readFile.isNil() { return d.readFile.fsync(ctx) } return nil } -func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { +func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error { d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() @@ -1988,7 +2403,7 @@ func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) err return err } } - if err := d.syncRemoteFileLocked(ctx); err != nil { + if err := d.syncRemoteFileLocked(ctx, accFsyncFDIDsLisa); err != nil { if !forFilesystemSync { return err } @@ -2045,10 +2460,33 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { - // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if - // available? - if err := d.updateFromGetattr(ctx); err != nil { - return linux.Statx{}, err + if d.fs.opts.lisaEnabled { + // Use specialFileFD.handle.fileLisa for the Stat if available, for the + // same reason that we try to use open FD in updateFromStatLisaLocked(). + var fdLisa *lisafs.ClientFD + if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { + fdLisa = &sffd.handle.fdLisa + } + d.metadataMu.Lock() + err := d.updateFromStatLisaLocked(ctx, fdLisa) + d.metadataMu.Unlock() + if err != nil { + return linux.Statx{}, err + } + } else { + // Use specialFileFD.handle.file for the getattr if available, for the + // same reason that we try to use open file handles in + // dentry.updateFromGetattrLocked(). + var file p9file + if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { + file = sffd.handle.file + } + d.metadataMu.Lock() + err := d.updateFromGetattrLocked(ctx, file) + d.metadataMu.Unlock() + if err != nil { + return linux.Statx{}, err + } } } var stat linux.Statx @@ -2069,7 +2507,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { - return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) + return fd.dentry().listXattr(ctx, size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 806392d50..d5cc73f33 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -33,6 +33,7 @@ func TestDestroyIdempotent(t *testing.T) { }, syncableDentries: make(map[*dentry]struct{}), inoByQIDPath: make(map[uint64]uint64), + inoByKey: make(map[inoKey]uint64), } attr := &p9.Attr{ diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 5c57f6fea..394aecd62 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -17,18 +17,23 @@ package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sync" ) // handle represents a remote "open file descriptor", consisting of an opened // fid (p9.File) and optionally a host file descriptor. // +// If lisafs is being used, fdLisa points to an open file on the server. +// // These are explicitly not savable. type handle struct { - file p9file - fd int32 // -1 if unavailable + fdLisa lisafs.ClientFD + file p9file + fd int32 // -1 if unavailable } // Preconditions: read || write. @@ -64,13 +69,47 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand }, nil } +// Preconditions: read || write. +func openHandleLisa(ctx context.Context, fdLisa lisafs.ClientFD, read, write, trunc bool) (handle, error) { + var flags uint32 + switch { + case read && write: + flags = unix.O_RDWR + case read: + flags = unix.O_RDONLY + case write: + flags = unix.O_WRONLY + default: + panic("tried to open unreadable and unwritable handle") + } + if trunc { + flags |= unix.O_TRUNC + } + openFD, hostFD, err := fdLisa.OpenAt(ctx, flags) + if err != nil { + return handle{fd: -1}, err + } + h := handle{ + fdLisa: fdLisa.Client().NewFD(openFD), + fd: int32(hostFD), + } + return h, nil +} + func (h *handle) isOpen() bool { + if h.fdLisa.Client() != nil { + return h.fdLisa.Ok() + } return !h.file.isNil() } func (h *handle) close(ctx context.Context) { - h.file.close(ctx) - h.file = p9file{} + if h.fdLisa.Client() != nil { + h.fdLisa.CloseBatched(ctx) + } else { + h.file.close(ctx) + h.file = p9file{} + } if h.fd >= 0 { unix.Close(int(h.fd)) h.fd = -1 @@ -88,19 +127,27 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs return n, err } if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { - n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) - return uint64(n), err + if h.fdLisa.Client() != nil { + return h.fdLisa.Read(ctx, dsts.Head().ToSlice(), offset) + } + return h.file.readAt(ctx, dsts.Head().ToSlice(), offset) } // Buffer the read since p9.File.ReadAt() takes []byte. buf := make([]byte, dsts.NumBytes()) - n, err := h.file.readAt(ctx, buf, offset) + var n uint64 + var err error + if h.fdLisa.Client() != nil { + n, err = h.fdLisa.Read(ctx, buf, offset) + } else { + n, err = h.file.readAt(ctx, buf, offset) + } if n == 0 { return 0, err } if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { return cp, cperr } - return uint64(n), err + return n, err } func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { @@ -114,8 +161,10 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o return n, err } if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { - n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) - return uint64(n), err + if h.fdLisa.Client() != nil { + return h.fdLisa.Write(ctx, srcs.Head().ToSlice(), offset) + } + return h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) } // Buffer the write since p9.File.WriteAt() takes []byte. buf := make([]byte, srcs.NumBytes()) @@ -123,10 +172,56 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o if cp == 0 { return 0, cperr } - n, err := h.file.writeAt(ctx, buf[:cp], offset) + var n uint64 + var err error + if h.fdLisa.Client() != nil { + n, err = h.fdLisa.Write(ctx, buf[:cp], offset) + } else { + n, err = h.file.writeAt(ctx, buf[:cp], offset) + } // err takes precedence over cperr. if err != nil { - return uint64(n), err + return n, err } - return uint64(n), cperr + return n, cperr +} + +type handleReadWriter struct { + ctx context.Context + h *handle + off uint64 +} + +var handleReadWriterPool = sync.Pool{ + New: func() interface{} { + return &handleReadWriter{} + }, +} + +func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter { + rw := handleReadWriterPool.Get().(*handleReadWriter) + rw.ctx = ctx + rw.h = h + rw.off = uint64(offset) + return rw +} + +func putHandleReadWriter(rw *handleReadWriter) { + rw.ctx = nil + rw.h = nil + handleReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + n, err := rw.h.readToBlocksAt(rw.ctx, dsts, rw.off) + rw.off += n + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + n, err := rw.h.writeFromBlocksAt(rw.ctx, srcs, rw.off) + rw.off += n + return n, err } diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go index 398288ee3..505916a57 100644 --- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -22,7 +22,6 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/syserror" ) // Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create @@ -109,6 +108,6 @@ func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { return nil case <-cancel: ctx.SleepFinish(false) - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } } diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index b0a429d42..0d97b60fd 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -16,9 +16,9 @@ package gofer import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/syserror" ) // p9file is a wrapper around p9.File that provides methods that are @@ -59,7 +59,7 @@ func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file if newfile != nil { p9file{newfile}.close(ctx) } - return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, linuxerr.EIO } return qids[0], p9file{newfile}, attrMask, attr, nil } @@ -141,18 +141,18 @@ func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, u return fdobj, qid, iounit, err } -func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { +func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (uint64, error) { ctx.UninterruptibleSleepStart(false) n, err := f.file.ReadAt(p, offset) ctx.UninterruptibleSleepFinish(false) - return n, err + return uint64(n), err } -func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { +func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (uint64, error) { ctx.UninterruptibleSleepStart(false) n, err := f.file.WriteAt(p, offset) ctx.UninterruptibleSleepFinish(false) - return n, err + return uint64(n), err } func (f p9file) fsync(ctx context.Context) error { diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 947dbe05f..874f9873d 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -98,6 +98,12 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() + if d.fs.opts.lisaEnabled { + if !d.writeFDLisa.Ok() { + return nil + } + return d.writeFDLisa.Flush(ctx) + } if d.writeFile.isNil() { return nil } @@ -110,6 +116,9 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint return d.doAllocate(ctx, offset, length, func() error { d.handleMu.RLock() defer d.handleMu.RUnlock() + if d.fs.opts.lisaEnabled { + return d.writeFDLisa.Allocate(ctx, mode, offset, length) + } return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) }) } @@ -282,8 +291,19 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // changes to the host. if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { atomic.StoreUint32(&d.mode, newMode) - if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { - return 0, offset, err + if d.fs.opts.lisaEnabled { + stat := linux.Statx{Mask: linux.STATX_MODE, Mode: uint16(newMode)} + failureMask, failureErr, err := d.controlFDLisa.SetStat(ctx, &stat) + if err != nil { + return 0, offset, err + } + if failureMask != 0 { + return 0, offset, failureErr + } + } else { + if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { + return 0, offset, err + } } } } @@ -677,7 +697,7 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */) + return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 226790a11..5d4009832 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -15,7 +15,9 @@ package gofer import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -234,28 +236,54 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // Lock metadata on all dentries *before* getting attributes for them. state.lockAllMetadata() - stats, err := state.start.file.multiGetAttr(ctx, state.names) - if err != nil { - return err + + var ( + stats []p9.FullStat + statsLisa []linux.Statx + numStats int + ) + if fs.opts.lisaEnabled { + var err error + statsLisa, err = state.start.controlFDLisa.WalkStat(ctx, state.names) + if err != nil { + return err + } + numStats = len(statsLisa) + } else { + var err error + stats, err = state.start.file.multiGetAttr(ctx, state.names) + if err != nil { + return err + } + numStats = len(stats) } i := -1 for d := state.popFront(); d != nil; d = state.popFront() { i++ - found := i < len(stats) + found := i < numStats if i == 0 && len(state.names[0]) == 0 { if found && !d.isSynthetic() { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. + if fs.opts.lisaEnabled { + d.updateFromLisaStatLocked(&statsLisa[i]) // +checklocksforce: acquired by lockAllMetadata. + } else { + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. + } } d.metadataMu.Unlock() // +checklocksforce: see above. continue } - // Note that synthetic dentries will always fails the comparison check - // below. - if !found || d.qidPath != stats[i].QID.Path { + // Note that synthetic dentries will always fail this comparison check. + var shouldInvalidate bool + if fs.opts.lisaEnabled { + shouldInvalidate = !found || d.inoKey != inoKeyFromStat(&statsLisa[i]) + } else { + shouldInvalidate = !found || d.qidPath != stats[i].QID.Path + } + if shouldInvalidate { d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace @@ -298,7 +326,11 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // The file at this path hasn't changed. Just update cached metadata. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. + if fs.opts.lisaEnabled { + d.updateFromLisaStatLocked(&statsLisa[i]) // +checklocksforce: see above. + } else { + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. + } d.metadataMu.Unlock() } diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go index e67422a2f..475322527 100644 --- a/pkg/sentry/fsimpl/gofer/save_restore.go +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/safemem" @@ -112,10 +113,19 @@ func (d *dentry) prepareSaveRecursive(ctx context.Context) error { return err } } - if !d.readFile.isNil() || !d.writeFile.isNil() { - d.fs.savedDentryRW[d] = savedDentryRW{ - read: !d.readFile.isNil(), - write: !d.writeFile.isNil(), + if d.fs.opts.lisaEnabled { + if d.readFDLisa.Ok() || d.writeFDLisa.Ok() { + d.fs.savedDentryRW[d] = savedDentryRW{ + read: d.readFDLisa.Ok(), + write: d.writeFDLisa.Ok(), + } + } + } else { + if !d.readFile.isNil() || !d.writeFile.isNil() { + d.fs.savedDentryRW[d] = savedDentryRW{ + read: !d.readFile.isNil(), + write: !d.writeFile.isNil(), + } } } d.dirMu.Lock() @@ -158,6 +168,10 @@ func (d *dentryPlatformFile) afterLoad() { // afterLoad is invoked by stateify. func (fd *specialFileFD) afterLoad() { fd.handle.fd = -1 + if fd.hostFileMapper.IsInited() { + // Ensure that we don't call fd.hostFileMapper.Init() again. + fd.hostFileMapperInitOnce.Do(func() {}) + } } // CompleteRestore implements @@ -173,25 +187,37 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID) } fs.opts.fd = fd - if err := fs.dial(ctx); err != nil { - return err - } fs.inoByQIDPath = make(map[uint64]uint64) + fs.inoByKey = make(map[inoKey]uint64) - // Restore the filesystem root. - ctx.UninterruptibleSleepStart(false) - attached, err := fs.client.Attach(fs.opts.aname) - ctx.UninterruptibleSleepFinish(false) - if err != nil { - return err - } - attachFile := p9file{attached} - qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) - if err != nil { - return err - } - if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil { - return err + if fs.opts.lisaEnabled { + rootInode, err := fs.initClientLisa(ctx) + if err != nil { + return err + } + if err := fs.root.restoreFileLisa(ctx, rootInode, &opts); err != nil { + return err + } + } else { + if err := fs.dial(ctx); err != nil { + return err + } + + // Restore the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := fs.client.Attach(fs.opts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + return err + } + if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil { + return err + } } // Restore remaining dentries. @@ -279,6 +305,55 @@ func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrM return nil } +func (d *dentry) restoreFileLisa(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions) error { + d.controlFDLisa = d.fs.clientLisa.NewFD(inode.ControlFD) + + // Gofers do not preserve inoKey across checkpoint/restore, so: + // + // - We must assume that the remote filesystem did not change in a way that + // would invalidate dentries, since we can't revalidate dentries by + // checking inoKey. + // + // - We need to associate the new inoKey with the existing d.ino. + d.inoKey = inoKeyFromStat(&inode.Stat) + d.fs.inoMu.Lock() + d.fs.inoByKey[d.inoKey] = d.ino + d.fs.inoMu.Unlock() + + // Check metadata stability before updating metadata. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.isRegularFile() { + if opts.ValidateFileSizes { + if inode.Stat.Mask&linux.STATX_SIZE != 0 { + return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d)) + } + if d.size != inode.Stat.Size { + return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, inode.Stat.Size) + } + } + if opts.ValidateFileModificationTimestamps { + if inode.Stat.Mask&linux.STATX_MTIME != 0 { + return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d)) + } + if want := dentryTimestampFromLisa(inode.Stat.Mtime); d.mtime != want { + return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want)) + } + } + } + if !d.cachedMetadataAuthoritative() { + d.updateFromLisaStatLocked(&inode.Stat) + } + + if rw, ok := d.fs.savedDentryRW[d]; ok { + if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { + return err + } + } + + return nil +} + // Preconditions: d is not synthetic. func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { for _, child := range d.children { @@ -301,19 +376,35 @@ func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.Comp // only be detected by checking filesystem.syncableDentries). d.parent has been // restored. func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { - qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name) - if err != nil { - return err - } - if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil { - return err + if d.fs.opts.lisaEnabled { + inode, err := d.parent.controlFDLisa.Walk(ctx, d.name) + if err != nil { + return err + } + if err := d.restoreFileLisa(ctx, inode, opts); err != nil { + return err + } + } else { + qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name) + if err != nil { + return err + } + if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil { + return err + } } return d.restoreDescendantsRecursive(ctx, opts) } func (fd *specialFileFD) completeRestore(ctx context.Context) error { d := fd.dentry() - h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + var h handle + var err error + if d.fs.opts.lisaEnabled { + h, err = openHandleLisa(ctx, d.controlFDLisa, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + } else { + h, err = openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + } if err != nil { return err } diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index fe15f8583..86ab70453 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -59,11 +59,6 @@ func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { // BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { - cf, ok := sockTypeToP9(ce.Type()) - if !ok { - return syserr.ErrConnectionRefused - } - // No lock ordering required as only the ConnectingEndpoint has a mutex. ce.Lock() @@ -77,7 +72,7 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec return syserr.ErrInvalidEndpointState } - c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue()) + c, err := e.newConnectedEndpoint(ctx, ce.Type(), ce.WaiterQueue()) if err != nil { ce.Unlock() return err @@ -95,7 +90,7 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec // UnidirectionalConnect implements // transport.BoundEndpoint.UnidirectionalConnect. func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { - c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{}) + c, err := e.newConnectedEndpoint(ctx, linux.SOCK_DGRAM, &waiter.Queue{}) if err != nil { return nil, err } @@ -111,25 +106,39 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect return c, nil } -func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { - hostFile, err := e.dentry.file.connect(ctx, flags) - if err != nil { +func (e *endpoint) newConnectedEndpoint(ctx context.Context, sockType linux.SockType, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { + if e.dentry.fs.opts.lisaEnabled { + hostSockFD, err := e.dentry.controlFDLisa.Connect(ctx, sockType) + if err != nil { + return nil, syserr.ErrConnectionRefused + } + + c, serr := host.NewSCMEndpoint(ctx, hostSockFD, queue, e.path) + if serr != nil { + unix.Close(hostSockFD) + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v sockType %d: %v", e.dentry.file, sockType, serr) + return nil, serr + } + return c, nil + } + + flags, ok := sockTypeToP9(sockType) + if !ok { return nil, syserr.ErrConnectionRefused } - // Dup the fd so that the new endpoint can manage its lifetime. - hostFD, err := unix.Dup(hostFile.FD()) + hostFile, err := e.dentry.file.connect(ctx, flags) if err != nil { - log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) - return nil, syserr.FromError(err) + return nil, syserr.ErrConnectionRefused } - // After duplicating, we no longer need hostFile. - hostFile.Close() - c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) + c, serr := host.NewSCMEndpoint(ctx, hostFile.FD(), queue, e.path) if serr != nil { - log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr) + hostFile.Close() + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v sockType %d: %v", e.dentry.file, sockType, serr) return nil, serr } + // Ownership has been transferred to c. + hostFile.Release() return c, nil } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 4b59c1c3c..c568bbfd2 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,13 +22,16 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -76,6 +79,16 @@ type specialFileFD struct { bufMu sync.Mutex `state:"nosave"` haveBuf uint32 buf []byte + + // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and + // hostFileMapperInitOnce is used to initialize it on first use. + hostFileMapperInitOnce sync.Once `state:"nosave"` + hostFileMapper fsutil.HostFileMapper + + // If handle.fd >= 0, fileRefs counts references on memmap.File offsets. + // fileRefs is protected by fileRefsMu. + fileRefsMu sync.Mutex `state:"nosave"` + fileRefs fsutil.FrameRefSet } func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { @@ -137,6 +150,9 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } + if fs := fd.filesystem(); fs.opts.lisaEnabled { + return fd.handle.fdLisa.Flush(ctx) + } return fd.handle.file.flush(ctx) } @@ -172,6 +188,9 @@ func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint if fd.isRegularFile { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { + if d.fs.opts.lisaEnabled { + return fd.handle.fdLisa.Allocate(ctx, mode, offset, length) + } return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) }) } @@ -230,23 +249,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } } - // Going through dst.CopyOutFrom() would hold MM locks around file - // operations of unknown duration. For regularFileFD, doing so is necessary - // to support mmap due to lock ordering; MM locks precede dentry.dataMu. - // That doesn't hold here since specialFileFD doesn't client-cache data. - // Just buffer the read instead. - buf := make([]byte, dst.NumBytes()) - n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + rw := getHandleReadWriter(ctx, &fd.handle, offset) + n, err := dst.CopyOutFrom(ctx, rw) + putHandleReadWriter(rw) if linuxerr.Equals(linuxerr.EAGAIN, err) { - err = syserror.ErrWouldBlock - } - if n == 0 { - return bufN, err + err = linuxerr.ErrWouldBlock } - if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { - return bufN + int64(cp), cperr - } - return bufN + int64(n), err + return bufN + n, err } // Read implements vfs.FileDescriptionImpl.Read. @@ -317,20 +326,15 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } } - // Do a buffered write. See rationale in PRead. - buf := make([]byte, src.NumBytes()) - copied, copyErr := src.CopyIn(ctx, buf) - if copied == 0 && copyErr != nil { - // Only return the error if we didn't get any data. - return 0, offset, copyErr - } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset)) + rw := getHandleReadWriter(ctx, &fd.handle, offset) + n, err := src.CopyInTo(ctx, rw) + putHandleReadWriter(rw) if linuxerr.Equals(linuxerr.EAGAIN, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } // Update offset if the offset is valid. if offset >= 0 { - offset += int64(n) + offset += n } // Update file size for regular files. if fd.isRegularFile { @@ -341,10 +345,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off atomic.StoreUint64(&d.size, uint64(offset)) } } - if err != nil { - return int64(n), offset, err - } - return int64(n), offset, copyErr + return int64(n), offset, err } // Write implements vfs.FileDescriptionImpl.Write. @@ -377,10 +378,10 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { - return fd.sync(ctx, false /* forFilesystemSync */) + return fd.sync(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */) } -func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { +func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error { // Locks to ensure it didn't race with fd.Release(). fd.releaseMu.RLock() defer fd.releaseMu.RUnlock() @@ -397,6 +398,13 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error ctx.UninterruptibleSleepFinish(false) return err } + if fs := fd.filesystem(); fs.opts.lisaEnabled { + if accFsyncFDIDsLisa != nil { + *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, fd.handle.fdLisa.ID()) + return nil + } + return fd.handle.fdLisa.Sync(ctx) + } return fd.handle.file.fsync(ctx) }() if err != nil { @@ -412,3 +420,85 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error } return nil } + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache { + return linuxerr.ENODEV + } + // After this point, fd may be used as a memmap.Mappable and memmap.File. + fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init) + return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { + fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { + fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { + return fd.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { + mr := optional + if fd.filesystem().opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: fd, + Offset: mr.Start, + Perms: hostarch.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// IncRef implements memmap.File.IncRef. +func (fd *specialFileFD) IncRef(fr memmap.FileRange) { + fd.fileRefsMu.Lock() + defer fd.fileRefsMu.Unlock() + fd.fileRefs.IncRefAndAccount(fr) +} + +// DecRef implements memmap.File.DecRef. +func (fd *specialFileFD) DecRef(fr memmap.FileRange) { + fd.fileRefsMu.Lock() + defer fd.fileRefsMu.Unlock() + fd.fileRefs.DecRefAndAccount(fr) +} + +// MapInternal implements memmap.File.MapInternal. +func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { + fd.requireHostFD() + return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write) +} + +// FD implements memmap.File.FD. +func (fd *specialFileFD) FD() int { + fd.requireHostFD() + return int(fd.handle.fd) +} + +func (fd *specialFileFD) requireHostFD() { + if fd.handle.fd < 0 { + // This is possible if fd was successfully mmapped before saving, then + // was restored without a host FD. This is unrecoverable: without a + // host FD, we can't mmap this file post-restore. + panic("gofer.specialFileFD can no longer be memory-mapped without a host FD") + } +} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index dbd834c67..27d9be5c4 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -35,7 +35,13 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { return target, nil } } - target, err := d.file.readlink(ctx) + var target string + var err error + if d.fs.opts.lisaEnabled { + target, err = d.controlFDLisa.ReadLinkAt(ctx) + } else { + target, err = d.file.readlink(ctx) + } if d.fs.opts.interop != InteropModeShared { if err == nil { d.haveTarget = true diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 9cbe805b9..07940b225 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -17,6 +17,7 @@ package gofer import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -24,6 +25,10 @@ func dentryTimestampFromP9(s, ns uint64) int64 { return int64(s*1e9 + ns) } +func dentryTimestampFromLisa(t linux.StatxTimestamp) int64 { + return t.Sec*1e9 + int64(t.Nsec) +} + // Preconditions: d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtime(mnt *vfs.Mount) { if mnt.Flags.NoATime || mnt.ReadOnly() { diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 476545d00..180a35583 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -70,7 +70,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 89aa7b3d9..984c6e8ee 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -37,7 +37,6 @@ import ( unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -712,7 +711,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts if total != 0 { err = nil } else { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } } return total, err @@ -766,7 +765,7 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt if !i.seekable { n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) if isBlockError(err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } return n, err } diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 7f6ce4ee5..04ac73255 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -346,7 +345,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // If the signal is SIGTTIN, then we are attempting to read // from the TTY. Don't send the signal and return EIO. if sig == linux.SIGTTIN { - return syserror.EIO + return linuxerr.EIO } // Otherwise, we are writing or changing terminal state. This is allowed. @@ -355,7 +354,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // If the process group is an orphan, return EIO. if pg.IsOrphan() { - return syserror.EIO + return linuxerr.EIO } // Otherwise, send the signal to the process group and return ERESTARTSYS. @@ -368,5 +367,5 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index 95d7ebe2e..9850f3f41 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -42,7 +42,7 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { } // isBlockError checks if an error is EAGAIN or EWOULDBLOCK. -// If so, they can be transformed into syserror.ErrWouldBlock. +// If so, they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { return linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) } diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index d53937db6..4b577ea43 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -119,7 +119,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -137,6 +136,7 @@ go_test( "//pkg/abi/linux", "//pkg/context", "//pkg/errors/linuxerr", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -144,7 +144,6 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "@com_github_google_go_cmp//cmp:go_default_library", ], diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 8b008dc10..7db1473c4 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -99,7 +98,7 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, l func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. - return syserror.EISDIR + return linuxerr.EISDIR } fd.LockFD.Init(locks) fd.seekEnd = fdOpts.SeekEnd diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a97473f7d..363ebc466 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // stepExistingLocked resolves rp.Component() in parent directory vfsd. @@ -224,7 +223,7 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string return linuxerr.EEXIST } if parent.VFSDentry().IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { return err @@ -241,7 +240,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) er return linuxerr.EBUSY } if parent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err @@ -362,7 +361,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if rp.Mount() != vd.Mount() { return linuxerr.EXDEV @@ -443,7 +442,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -509,7 +508,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer unlock() if rp.Done() { if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -536,11 +535,11 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } pc := rp.Component() if pc == "." || pc == ".." { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if len(pc) > linux.NAME_MAX { return nil, linuxerr.ENAMETOOLONG @@ -861,7 +860,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -895,7 +894,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if d.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } virtfs := rp.VirtualFilesystem() parentDentry := d.parent diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index a42fc79b4..b96dc9ef7 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -26,7 +26,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // InodeNoopRefCount partially implements the Inode interface, specifically the @@ -234,6 +233,11 @@ func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) } +// Links returns the link count. +func (a *InodeAttrs) Links() uint32 { + return atomic.LoadUint32(&a.nlink) +} + // TouchAtime updates a.atime to the current time. func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) { if mnt.Flags.NoATime || mnt.ReadOnly() { @@ -289,7 +293,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut return linuxerr.EPERM } if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err @@ -475,7 +479,7 @@ func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error s, ok := o.set[name] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. @@ -502,6 +506,30 @@ func (o *OrderedChildren) Insert(name string, child Inode) error { return o.insert(name, child, false) } +// Inserter is like Insert, but obtains the child to insert by calling +// makeChild. makeChild is only called if the insert will succeed. This allows +// the caller to atomically check and insert a child without having to +// clean up the child on failure. +func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return nil, linuxerr.EEXIST + } + + // Note: We must not fail after we call makeChild(). + + child := makeChild() + s := &slot{ + name: name, + inode: child, + static: false, + } + o.order.PushBack(s) + o.set[name] = s + return child, nil +} + // insert inserts child into o. // // Precondition: Caller must be holding a ref on child if static is true. @@ -559,7 +587,7 @@ func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, n func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if s.inode != child { panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) @@ -746,5 +774,5 @@ type InodeNoStatFS struct{} // StatFS implements Inode.StatFS. func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { - return linux.Statfs{}, syserror.ENOSYS + return linux.Statfs{}, linuxerr.ENOSYS } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 0e2867d49..544698694 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -61,6 +61,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -542,6 +543,63 @@ func (d *Dentry) FSLocalPath() string { return b.String() } +// WalkDentryTree traverses p in the dentry tree for this filesystem. Note that +// this only traverses the dentry tree and is not a general path traversal. No +// symlinks and dynamic children are resolved, and no permission checks are +// performed. The caller is responsible for ensuring the returned Dentry exists +// for an appropriate lifetime. +// +// p is interpreted starting at d, and may be absolute or relative (absolute vs +// relative paths both refer to the same target here, since p is absolute from +// d). p may contain "." and "..", but will not allow traversal above d (similar +// to ".." at the root dentry). +// +// This is useful for filesystem internals, where the filesystem may not be +// mounted yet. For a mounted filesystem, use GetDentryAt. +func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) { + d.fs.mu.RLock() + defer d.fs.processDeferredDecRefs(ctx) + defer d.fs.mu.RUnlock() + + target := d + + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + pc := pit.String() + + switch { + case target == nil: + return nil, linuxerr.ENOENT + case pc == ".": + // No-op, consume component and continue. + case pc == "..": + if target == d { + // Don't let .. traverse above the start point of the walk. + continue + } + target = target.parent + // Parent doesn't need revalidation since we revalidated it on the + // way to the child, and we're still holding fs.mu. + default: + var err error + + d.dirMu.Lock() + target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc]) + d.dirMu.Unlock() + + if err != nil { + return nil, err + } + } + } + + if target == nil { + return nil, linuxerr.ENOENT + } + + target.IncRef() + return target, nil +} + // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // @@ -667,12 +725,15 @@ type inodeDirectory interface { // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child - // is a directory, checking for an empty directory. + // is a directory, or checking for an empty directory. RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being - // renamed. child should point to the resolved child in the source - // directory. + // renamed. child points to the resolved child in the source directory. + // dstDir is guaranteed to be a directory inode. + // + // On a successful call to Rename, the caller updates the dentry tree to + // reflect the name change. // // Precondition: Caller must serialize concurrent calls to Rename. Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 609887943..a2aba9321 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" @@ -346,3 +347,63 @@ func TestDirFDIterDirents(t *testing.T) { "file1": linux.DT_REG, }) } + +func TestDirWalkDentryTree(t *testing.T) { + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "dir1": fs.newDir(ctx, creds, 0755, nil), + "dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "file1": fs.newFile(ctx, creds, staticFileContent), + "dir3": fs.newDir(ctx, creds, 0755, nil), + }), + }) + }) + defer sys.Destroy() + + testWalk := func(from *kernfs.Dentry, getDentryPath, walkPath string, expectedErr error) { + var d *kernfs.Dentry + if getDentryPath != "" { + pop := sys.PathOpAtRoot(getDentryPath) + vd := sys.GetDentryOrDie(pop) + defer vd.DecRef(sys.Ctx) + d = vd.Dentry().Impl().(*kernfs.Dentry) + } + + match, err := from.WalkDentryTree(sys.Ctx, sys.VFS, fspath.Parse(walkPath)) + if err == nil { + defer match.DecRef(sys.Ctx) + } + + if err != expectedErr { + t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) unexpected error, want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, expectedErr, err) + } + if expectedErr != nil { + return + } + + if d != match { + t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) found unexpected dentry; want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, d, match) + } + } + + rootD := sys.Root.Dentry().Impl().(*kernfs.Dentry) + + testWalk(rootD, "dir1", "/dir1", nil) + testWalk(rootD, "", "/dir-non-existent", linuxerr.ENOENT) + testWalk(rootD, "", "/dir1/child-non-existent", linuxerr.ENOENT) + testWalk(rootD, "", "/dir2/inner-non-existent/dir3", linuxerr.ENOENT) + + testWalk(rootD, "dir2/dir3", "/dir2/../dir2/dir3", nil) + testWalk(rootD, "dir2/dir3", "/dir2/././dir3", nil) + testWalk(rootD, "dir2/dir3", "/dir2/././dir3/.././dir3", nil) + + pop := sys.PathOpAtRoot("dir2") + dir2VD := sys.GetDentryOrDie(pop) + defer dir2VD.DecRef(sys.Ctx) + dir2D := dir2VD.Dentry().Impl().(*kernfs.Dentry) + + testWalk(dir2D, "dir2/dir3", "/dir3", nil) + testWalk(dir2D, "dir2/dir3", "/../../../dir3", nil) + testWalk(dir2D, "dir2/file1", "/file1", nil) + testWalk(dir2D, "dir2/file1", "file1", nil) +} diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index ed730e215..d16dfef9b 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 1f85a1f0d..520487066 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isCopiedUp() bool { @@ -37,6 +36,10 @@ func (d *dentry) isCopiedUp() bool { // // Preconditions: filesystem.renameMu must be locked. func (d *dentry) copyUpLocked(ctx context.Context) error { + return d.copyUpMaybeSyntheticMountpointLocked(ctx, false /* forSyntheticMountpoint */) +} + +func (d *dentry) copyUpMaybeSyntheticMountpointLocked(ctx context.Context, forSyntheticMountpoint bool) error { // Fast path. if d.isCopiedUp() { return nil @@ -60,7 +63,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { // d is a filesystem root with no upper layer. return linuxerr.EROFS } - if err := d.parent.copyUpLocked(ctx); err != nil { + if err := d.parent.copyUpMaybeSyntheticMountpointLocked(ctx, forSyntheticMountpoint); err != nil { return err } @@ -72,7 +75,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if d.vfsd.IsDead() { // Raced with deletion of d. - return syserror.ENOENT + return linuxerr.ENOENT } // Obtain settable timestamps from the lower layer. @@ -169,7 +172,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { case linux.S_IFDIR: if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{ - Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + ForSyntheticMountpoint: forSyntheticMountpoint, }); err != nil { return err } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 5e89928c5..3b3dcf836 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs @@ -314,7 +313,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str } if !topLookupLayer.existsInOverlay() { child.destroyLocked(ctx) - return nil, topLookupLayer, syserror.ENOENT + return nil, topLookupLayer, linuxerr.ENOENT } // Device and inode numbers were copied from the topmost layer above. Remap @@ -463,13 +462,21 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, return d, nil } +type createType int + +const ( + createNonDirectory createType = iota + createDirectory + createSyntheticMountpoint +) + // doCreateAt checks that creating a file at rp is permitted, then invokes // create to do so. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, ct createType, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) @@ -483,7 +490,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } if parent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { @@ -505,8 +512,8 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } - if !dir && rp.MustBeDir() { - return syserror.ENOENT + if ct == createNonDirectory && rp.MustBeDir() { + return linuxerr.ENOENT } mnt := rp.Mount() @@ -519,7 +526,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } // Ensure that the parent directory is copied-up so that we can create the // new file in the upper layer. - if err := parent.copyUpLocked(ctx); err != nil { + if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, ct == createSyntheticMountpoint); err != nil { return err } @@ -530,7 +537,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir parent.dirents = nil ev := linux.IN_CREATE - if dir { + if ct != createNonDirectory { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) @@ -619,7 +626,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { if rp.Mount() != vd.Mount() { return linuxerr.EXDEV } @@ -672,7 +679,11 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + ct := createDirectory + if opts.ForSyntheticMountpoint { + ct = createSyntheticMountpoint + } + return fs.doCreateAt(ctx, rp, ct, func(parent *dentry, childName string, haveUpperWhiteout bool) error { vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, @@ -723,7 +734,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { // Disallow attempts to create whiteouts. if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { return linuxerr.EPERM @@ -780,7 +791,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf start := rp.Start().Impl().(*dentry) if rp.Done() { if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -807,7 +818,7 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Determine whether or not we need to create a file. parent.dirMu.Lock() @@ -865,11 +876,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * if ftype == linux.S_IFDIR { // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Can't open directories writably. if ats.MayWrite() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL @@ -919,7 +930,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving return nil, err } if parent.vfsd.IsDead() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -1086,7 +1097,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer newParent.dirMu.Unlock() } if newParent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } var ( replaced *dentry @@ -1105,7 +1116,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY @@ -1477,7 +1488,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, @@ -1533,7 +1544,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer rp.Mount().EndWrite() name := rp.Component() if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { return linuxerr.ENOTDIR @@ -1557,7 +1568,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if child.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 1d3d2d95f..95cfbdc42 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -102,7 +102,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index d99f90b36..e04ae6660 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // subtasksInode represents the inode for /proc/[pid]/task/ directory. @@ -71,15 +70,15 @@ func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) if subTask == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if subTask.ThreadGroup() != i.task.ThreadGroup() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers) } @@ -88,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { - return offset, syserror.ENOENT + return offset, linuxerr.ENOENT } if relOffset >= int64(len(tasks)) { return offset, nil @@ -124,7 +123,7 @@ type subtasksFD struct { func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if fd.task.ExitState() >= kernel.TaskExitZombie { - return syserror.ENOENT + return linuxerr.ENOENT } return fd.GenericDirectoryFD.IterDirents(ctx, cb) } @@ -132,7 +131,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } return fd.GenericDirectoryFD.Seek(ctx, offset, whence) } @@ -140,7 +139,7 @@ func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { - return linux.Statx{}, syserror.ENOENT + return linux.Statx{}, linuxerr.ENOENT } return fd.GenericDirectoryFD.Stat(ctx, opts) } @@ -148,7 +147,7 @@ func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Sta // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { if fd.task.ExitState() >= kernel.TaskExitZombie { - return syserror.ENOENT + return linuxerr.ENOENT } return fd.GenericDirectoryFD.SetStat(ctx, opts) } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index dfc0a924e..5c6412fc0 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,11 +22,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { @@ -142,11 +142,11 @@ func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.Ite func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil } @@ -218,7 +218,7 @@ func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd in func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) @@ -231,7 +231,7 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { - return vfs.VirtualDentry{}, "", syserror.ENOENT + return vfs.VirtualDentry{}, "", linuxerr.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() @@ -278,11 +278,11 @@ func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task) func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } data := &fdInfoData{ fs: i.fs, @@ -330,7 +330,7 @@ var _ dynamicInode = (*fdInfoData)(nil) func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { file, descriptorFlags := getTaskFD(d.task, d.fd) if file == nil { - return syserror.ENOENT + return linuxerr.ENOENT } defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 0ce3ed797..d3f9cf489 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -33,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -41,7 +40,7 @@ import ( // Linux 3.18, the limit is five lines." - user_namespaces(7) const maxIDMapLines = 5 -// mm gets the kernel task's MemoryManager. No additional reference is taken on +// getMM gets the kernel task's MemoryManager. No additional reference is taken on // mm here. This is safe because MemoryManager.destroy is required to leave the // MemoryManager in a state where it's still usable as a DynamicBytesSource. func getMM(task *kernel.Task) *mm.MemoryManager { @@ -491,7 +490,7 @@ func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64 return int64(n), nil } if readErr != nil { - return 0, syserror.EIO + return 0, linuxerr.EIO } return 0, nil } @@ -609,12 +608,10 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 - s.task.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) + if mm := getMM(s.task); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) // rsslim. @@ -650,13 +647,10 @@ var _ dynamicInode = (*statmData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss uint64 - s.task.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) - + if mm := getMM(s.task); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) return nil } @@ -780,12 +774,12 @@ func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.CurrentMaxFDs() } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() - } }) + if mm := getMM(s.task); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } // Filesystem user/group IDs aren't implemented; effective UID/GID are used // instead. fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) @@ -946,25 +940,17 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent return vfs.VirtualDentry{}, "", err } - var err error - var exec fsbridge.File - s.task.WithMuLocked(func(t *kernel.Task) { - mm := t.MemoryManager() - if mm == nil { - err = linuxerr.EACCES - return - } + mm := getMM(s.task) + if mm == nil { + return vfs.VirtualDentry{}, "", linuxerr.EACCES + } - // The MemoryManager may be destroyed, in which case - // MemoryManager.destroy will simply set the executable to nil - // (with locks held). - exec = mm.Executable() - if exec == nil { - err = linuxerr.ESRCH - } - }) - if err != nil { - return vfs.VirtualDentry{}, "", err + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + exec := mm.Executable() + if exec == nil { + return vfs.VirtualDentry{}, "", linuxerr.ESRCH } defer exec.DecRef(ctx) diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index cf905fae4..7b0be9c14 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -116,12 +116,12 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err case threadSelfName: return i.newThreadSelfSymlink(ctx, root), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } task := i.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers) @@ -268,6 +268,6 @@ func cpuInfoData(k *kernel.Kernel) string { return buf.String() } -func shmData(v uint64) dynamicInode { +func ipcData(v uint64) dynamicInode { return newStaticFile(strconv.FormatUint(v, 10)) } diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 03bed22a3..4d3a2f7e6 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -58,7 +57,7 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } @@ -100,7 +99,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 99f64a9d8..82e2857b3 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -47,9 +47,12 @@ func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k * "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), - "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)), - "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)), - "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)), + "shmall": fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)), + "shmmax": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)), + "shmmni": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)), + "msgmni": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)), + "msgmax": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)), + "msgmnb": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)), "yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root), }), diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD index adb610213..403c6f254 100644 --- a/pkg/sentry/fsimpl/signalfd/BUILD +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -9,10 +9,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index a7f5928b7..bdb03ef96 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -18,10 +18,10 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -91,7 +91,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) if err != nil { // There must be no signal available. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Copy out the signal info using the specified format. diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 1af0a5cbc..ab21f028e 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index f322d2747..7fcb2d26b 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -84,6 +84,18 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.MaxCachedDentries = maxCachedDentries fs.VFSFilesystem().Init(vfsObj, &fsType, fs) + k := kernel.KernelFromContext(ctx) + fsDirChildren := make(map[string]kernfs.Inode) + // Create an empty directory to serve as the mount point for cgroupfs when + // cgroups are available. This emulates Linux behaviour, see + // kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically + // the init process) is ultimately responsible for actually mounting + // cgroupfs, but the kernel creates the mountpoint. For the sentry, the + // launcher mounts cgroupfs. + if k.CgroupRegistry() != nil { + fsDirChildren["cgroup"] = fs.newDir(ctx, creds, defaultSysDirMode, nil) + } + root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "block": fs.newDir(ctx, creds, defaultSysDirMode, nil), "bus": fs.newDir(ctx, creds, defaultSysDirMode, nil), @@ -97,7 +109,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt }), }), "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil), - "fs": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "fs": fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren), "kernel": kernelDir(ctx, fs, creds), "module": fs.newDir(ctx, creds, defaultSysDirMode, nil), "power": fs.newDir(ctx, creds, defaultSysDirMode, nil), diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 0a0d914cc..0c46a3a13 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -87,3 +87,17 @@ func TestSysRootContainsExpectedEntries(t *testing.T) { "power": linux.DT_DIR, }) } + +func TestCgroupMountpointExists(t *testing.T) { + // Note: The mountpoint is only created if cgroups are available. This is + // the VFS2 implementation of sysfs and the test runs with VFS2 enabled, so + // we expect to see the mount point unconditionally. + s := newTestSystem(t) + defer s.Destroy() + pop := s.PathOpAtRoot("/fs") + s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{ + "cgroup": linux.DT_DIR, + }) + pop = s.PathOpAtRoot("/fs/cgroup") + s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{ /*empty*/ }) +} diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD index e6980a314..2b83d7d9a 100644 --- a/pkg/sentry/fsimpl/timerfd/BUILD +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -12,7 +12,6 @@ go_library( "//pkg/hostarch", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 655a1c76a..68b785791 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -82,7 +81,7 @@ func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequenc } return sizeofUint64, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Clock returns the timer fd's Clock. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index dc8b9bfeb..94486bb63 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -82,7 +82,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sentry/vfs/memxattr", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -125,7 +124,6 @@ go_test( "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 8b04df038..e067f136e 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -75,7 +74,7 @@ afterSymlink: } child, ok := dir.childMap[name] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err @@ -171,12 +170,12 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only // be dead if it was deleted. if parentDir.dentry.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -258,7 +257,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if i.nlink == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if i.nlink == maxLinks { return linuxerr.EMLINK @@ -345,7 +344,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -366,11 +365,11 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } name := rp.Component() if name == "." || name == ".." { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if len(name) > linux.NAME_MAX { return nil, linuxerr.ENAMETOOLONG @@ -457,7 +456,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case *directory: // Can't open directories writably. if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } var fd directoryFD fd.LockFD.Init(&d.inode.locks) @@ -532,7 +531,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } renamed, ok := oldParentDir.childMap[oldName] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { return err @@ -567,7 +566,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replacedDir, ok := replaced.inode.impl.(*directory) if ok { if !renamed.inode.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if len(replacedDir.childMap) != 0 { return linuxerr.ENOTEMPTY @@ -588,7 +587,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can // only be dead if it was deleted. if newParentDir.dentry.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } // Linux places this check before some of those above; we do it here for @@ -654,7 +653,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } child, ok := parentDir.childMap[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err @@ -754,17 +753,17 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } name := rp.Component() if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } child, ok := parentDir.childMap[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err } if child.inode.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { return linuxerr.ENOTDIR diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 418c7994e..99afd9817 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -202,7 +201,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { readData := make([]byte, 1) dst := usermem.BytesIOSequence(readData) bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) } if bytesRead != 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 4393cc13b..cb7711b39 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -21,10 +21,10 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -146,7 +146,7 @@ func TestLocks(t *testing.T) { if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), linuxerr.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { @@ -165,7 +165,7 @@ func TestLocks(t *testing.T) { if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.WriteLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), linuxerr.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockPOSIX(ctx, uid1, lock.LockRange{Start: 0, End: 1}); err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index f2250c025..feafb06e4 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -44,7 +44,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. @@ -556,7 +555,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs. needsCtimeBump = true } case *directory: - return syserror.EISDIR + return linuxerr.EISDIR default: return linuxerr.EINVAL } diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD index 1d855234c..c12abdf33 100644 --- a/pkg/sentry/fsimpl/verity/BUILD +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -1,10 +1,24 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "verity", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + go_library( name = "verity", srcs = [ + "dentry_list.go", "filesystem.go", "save_restore.go", "verity.go", @@ -28,7 +42,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 930016a3e..52d47994d 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -32,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -67,40 +66,23 @@ func putDentrySlice(ds *[]*dentry) { dentrySlicePool.Put(ds) } -// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls -// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for // writing. // // ds is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer -// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { -// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. // +checklocksrelease:fs.renameMu -func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return } - if len(**ds) != 0 { - fs.renameMu.Lock() - for _, d := range **ds { - d.checkDropLocked(ctx) - } - fs.renameMu.Unlock() - } - putDentrySlice(*ds) -} - -// +checklocksrelease:fs.renameMu -func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { - if *ds == nil { - fs.renameMu.Unlock() - return - } for _, d := range **ds { - d.checkDropLocked(ctx) + d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } - fs.renameMu.Unlock() putDentrySlice(*ds) } @@ -166,7 +148,7 @@ afterSymlink: // verifyChildLocked verifies the hash of child against the already verified // hash of the parent to ensure the child is expected. verifyChild triggers a // sentry panic if unexpected modifications to the file system are detected. In -// ErrorOnViolation mode it returns a syserror instead. +// ErrorOnViolation mode it returns a linuxerr instead. // // Preconditions: // * fs.renameMu must be locked. @@ -547,7 +529,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, if parent.verityEnabled() { if _, ok := parent.childrenNames[name]; !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } } @@ -595,23 +577,6 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, } } - // Clear the Merkle tree file if they are to be generated at runtime. - // TODO(b/182315468): Optimize the Merkle tree generate process to - // allow only updating certain files/directories. - if fs.allowRuntimeEnable { - childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ - Root: childMerkleVD, - Start: childMerkleVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_TRUNC, - Mode: 0644, - }) - if err != nil { - return nil, err - } - childMerkleFD.DecRef(ctx) - } - // The dentry needs to be cleaned up if any error occurs. IncRef will be // called if a verity child dentry is successfully created. defer childMerkleVD.DecRef(ctx) @@ -718,7 +683,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds } var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -730,7 +695,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -751,7 +716,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -788,7 +753,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if rp.Done() { @@ -970,7 +935,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1000,7 +965,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1046,7 +1011,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { return nil, err } @@ -1057,7 +1022,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1073,7 +1038,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index c5fa9855b..d2526263c 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -23,10 +23,12 @@ // Lock order: // // filesystem.renameMu -// dentry.dirMu -// fileDescription.mu -// filesystem.verityMu -// dentry.hashMu +// dentry.cachingMu +// filesystem.cacheMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -60,7 +62,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -97,6 +98,9 @@ const ( // sizeOfStringInt32 is the size for a 32 bit integer stored as string in // extended attributes. The maximum value of a 32 bit integer has 10 digits. sizeOfStringInt32 = 10 + + // defaultMaxCachedDentries is the default limit of dentry cache. + defaultMaxCachedDentries = uint64(1000) ) var ( @@ -107,9 +111,10 @@ var ( // Mount option names for verityfs. const ( - moptLowerPath = "lower_path" - moptRootHash = "root_hash" - moptRootName = "root_name" + moptLowerPath = "lower_path" + moptRootHash = "root_hash" + moptRootName = "root_name" + moptDentryCacheLimit = "dentry_cache_limit" ) // HashAlgorithm is a type specifying the algorithm used to hash the file @@ -189,6 +194,17 @@ type filesystem struct { // dentries. renameMu sync.RWMutex `state:"nosave"` + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by cacheMu. + cacheMu sync.Mutex `state:"nosave"` + cachedDentries dentryList + cachedDentriesLen uint64 + + // maxCachedDentries is the maximum size of filesystem.cachedDentries. + maxCachedDentries uint64 + // verityMu synchronizes enabling verity files, protects files or // directories from being enabled by different threads simultaneously. // It also ensures that verity does not access files that are being @@ -199,6 +215,10 @@ type filesystem struct { // is for the whole file system to ensure that no more than one file is // enabled the same time. verityMu sync.RWMutex `state:"nosave"` + + // released is nonzero once filesystem.Release has been called. It is accessed + // with atomic memory operations. + released int32 } // InternalFilesystemOptions may be passed as @@ -239,7 +259,7 @@ func (FilesystemType) Release(ctx context.Context) {} // mode, it returns EIO, otherwise it panic. func (fs *filesystem) alertIntegrityViolation(msg string) error { if fs.action == ErrorOnViolation { - return syserror.EIO + return linuxerr.EIO } panic(msg) } @@ -267,6 +287,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, moptRootName) rootName = root } + maxCachedDentries := defaultMaxCachedDentries + if str, ok := mopts[moptDentryCacheLimit]; ok { + delete(mopts, moptDentryCacheLimit) + maxCD, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("verity.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str) + return nil, nil, linuxerr.EINVAL + } + maxCachedDentries = maxCD + } // Check for unparsed options. if len(mopts) != 0 { @@ -340,12 +370,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt action: iopts.Action, opts: opts.Data, allowRuntimeEnable: iopts.AllowRuntimeEnable, + maxCachedDentries: maxCachedDentries, } fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. d := fs.newDentry() - d.refs = 1 + // Set the root's reference count to 2. One reference is returned to + // the caller, and the other is held by fs to prevent the root from + // being "cached" and subsequently evicted. + d.refs = 2 lowerVD := vfs.MakeVirtualDentry(lowerMount, lowerMount.Root()) lowerVD.IncRef() d.lowerVD = lowerVD @@ -520,7 +554,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { + atomic.StoreInt32(&fs.released, 1) fs.lowerMount.DecRef(ctx) + + fs.renameMu.Lock() + fs.evictAllCachedDentriesLocked(ctx) + fs.renameMu.Unlock() + + // An extra reference was held by the filesystem on the root to prevent + // it from being cached/evicted. + fs.rootDentry.DecRef(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. @@ -534,6 +577,11 @@ func (fs *filesystem) MountOptions() string { type dentry struct { vfsd vfs.Dentry + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. When refs reaches 0, the dentry may be + // added to the cache or destroyed. If refs == -1, the dentry has + // already been destroyed. refs is accessed using atomic memory + // operations. refs int64 // fs is the owning filesystem. fs is immutable. @@ -588,13 +636,23 @@ type dentry struct { // is protected by hashMu. hashMu sync.RWMutex `state:"nosave"` hash []byte + + // cachingMu is used to synchronize concurrent dentry caching attempts on + // this dentry. + cachingMu sync.Mutex `state:"nosave"` + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // cachingMu. + cached bool + dentryEntry } // newDentry creates a new dentry representing the given verity file. The -// dentry initially has no references; it is the caller's responsibility to set -// the dentry's reference count and/or call dentry.destroy() as appropriate. -// The dentry is initially invalid in that it contains no underlying dentry; -// the caller is responsible for setting them. +// dentry initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.destroy() as appropriate. The dentry is initially invalid in that it +// contains no underlying dentry; the caller is responsible for setting them. func (fs *filesystem) newDentry() *dentry { d := &dentry{ fs: fs, @@ -630,42 +688,23 @@ func (d *dentry) TryIncRef() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { - r := atomic.AddInt64(&d.refs, -1) - if d.LogRefs() { - refsvfs2.LogDecRef(d, r) - } - if r == 0 { - d.fs.renameMu.Lock() - d.checkDropLocked(ctx) - d.fs.renameMu.Unlock() - } else if r < 0 { - panic("verity.dentry.DecRef() called without holding a reference") + if d.decRefNoCaching() == 0 { + d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } } -func (d *dentry) decRefLocked(ctx context.Context) { +// decRefNoCaching decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefNoCaching() int64 { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } - if r == 0 { - d.checkDropLocked(ctx) - } else if r < 0 { - panic("verity.dentry.decRefLocked() called without holding a reference") + if r < 0 { + panic("verity.dentry.decRefNoCaching() called without holding a reference") } -} - -// checkDropLocked should be called after d's reference count becomes 0 or it -// becomes deleted. -func (d *dentry) checkDropLocked(ctx context.Context) { - // Dentries with a positive reference count must be retained. Dentries - // with a negative reference count have already been destroyed. - if atomic.LoadInt64(&d.refs) != 0 { - return - } - // Refs is still zero; destroy it. - d.destroyLocked(ctx) - return + return r } // destroyLocked destroys the dentry. @@ -684,6 +723,12 @@ func (d *dentry) destroyLocked(ctx context.Context) { panic("verity.dentry.destroyLocked() called with references on the dentry") } + // Drop the reference held by d on its parent without recursively + // locking d.fs.renameMu. + if d.parent != nil && d.parent.decRefNoCaching() == 0 { + d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) + } + if d.lowerVD.Ok() { d.lowerVD.DecRef(ctx) } @@ -696,7 +741,6 @@ func (d *dentry) destroyLocked(ctx context.Context) { delete(d.parent.children, d.name) } d.parent.dirMu.Unlock() - d.parent.decRefLocked(ctx) } refsvfs2.Unregister(d) } @@ -735,6 +779,140 @@ func (d *dentry) OnZeroWatches(context.Context) { //TODO(b/159261227): Implement OnZeroWatches. } +// checkCachingLocked should be called after d's reference count becomes 0 or +// it becomes disowned. +// +// For performance, checkCachingLocked can also be called after d's reference +// count becomes non-zero, so that d can be removed from the LRU cache. This +// may help in reducing the size of the cache and hence reduce evictions. Note +// that this is not necessary for correctness. +// +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// +// Preconditions: d.fs.renameMu must be locked for writing if +// renameMuWriteLocked is true; it may be temporarily unlocked. +func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { + d.cachingMu.Lock() + refs := atomic.LoadInt64(&d.refs) + if refs == -1 { + // Dentry has already been destroyed. + d.cachingMu.Unlock() + return + } + if refs > 0 { + // fs.cachedDentries is permitted to contain dentries with non-zero refs, + // which are skipped by fs.evictCachedDentryLocked() upon reaching the end + // of the LRU. But it is still beneficial to remove d from the cache as we + // are already holding d.cachingMu. Keeping a cleaner cache also reduces + // the number of evictions (which is expensive as it acquires fs.renameMu). + d.removeFromCacheLocked() + d.cachingMu.Unlock() + return + } + + if atomic.LoadInt32(&d.fs.released) != 0 { + d.cachingMu.Unlock() + if !renameMuWriteLocked { + // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as + // needed by d.destroyLocked() later. + d.fs.renameMu.Lock() + defer d.fs.renameMu.Unlock() + } + if d.parent != nil { + d.parent.dirMu.Lock() + delete(d.parent.children, d.name) + d.parent.dirMu.Unlock() + } + d.destroyLocked(ctx) // +checklocksforce: see above. + return + } + + d.fs.cacheMu.Lock() + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + d.fs.cacheMu.Unlock() + d.cachingMu.Unlock() + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + shouldEvict := d.fs.cachedDentriesLen > d.fs.maxCachedDentries + d.fs.cacheMu.Unlock() + d.cachingMu.Unlock() + + if shouldEvict { + if !renameMuWriteLocked { + // Need to lock d.fs.renameMu for writing as needed by + // d.evictCachedDentryLocked(). + d.fs.renameMu.Lock() + defer d.fs.renameMu.Unlock() + } + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. + } +} + +// Preconditions: d.cachingMu must be locked. +func (d *dentry) removeFromCacheLocked() { + if d.cached { + d.fs.cacheMu.Lock() + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.fs.cacheMu.Unlock() + d.cached = false + } +} + +// Precondition: fs.renameMu must be locked for writing; it may be temporarily +// unlocked. +// +checklocks:fs.renameMu +func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { + for fs.cachedDentriesLen != 0 { + fs.evictCachedDentryLocked(ctx) + } +} + +// Preconditions: +// * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu +func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { + fs.cacheMu.Lock() + victim := fs.cachedDentries.Back() + fs.cacheMu.Unlock() + if victim == nil { + // fs.cachedDentries may have become empty between when it was + // checked and when we locked fs.cacheMu. + return + } + + victim.cachingMu.Lock() + victim.removeFromCacheLocked() + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) != 0 { + victim.cachingMu.Unlock() + return + } + if victim.parent != nil { + victim.parent.dirMu.Lock() + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) + delete(victim.parent.children, victim.name) + victim.parent.dirMu.Unlock() + } + victim.cachingMu.Unlock() + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. +} + func (d *dentry) isSymlink() bool { return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK } @@ -1091,6 +1269,21 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { return 0, fd.d.fs.alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") } + // Populate children names here. We cannot rely on the children + // dentries to populate parent dentry's children names, because the + // parent dentry may be destroyed before users enable verity if its ref + // count drops to zero. + if fd.d.isDir() { + if err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name != "." && dirent.Name != ".." { + fd.d.childrenNames[dirent.Name] = struct{}{} + } + return nil + })); err != nil { + return 0, err + } + } + hash, dataSize, err := fd.generateMerkleLocked(ctx) if err != nil { return 0, err @@ -1118,9 +1311,6 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { }); err != nil { return 0, err } - - // Add the current child's name to parent's childrenNames. - fd.d.parent.childrenNames[fd.d.name] = struct{}{} } // Record the size of the data being hashed for fd. @@ -1215,7 +1405,7 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. case linux.FS_IOC_GETFLAGS: return fd.verityFlags(ctx, args[2].Pointer()) default: - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } } diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index 66fa1ad40..03c8e2f38 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -12,8 +12,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/fd", - "//pkg/hostarch", + "//pkg/eventfd", "//pkg/log", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go index 285ea9050..5df06a60f 100644 --- a/pkg/sentry/hostmm/hostmm.go +++ b/pkg/sentry/hostmm/hostmm.go @@ -21,9 +21,7 @@ import ( "os" "path" - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/log" ) @@ -54,7 +52,7 @@ func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) } defer eventControlFile.Close() - eventFD, err := newEventFD() + eventFD, err := eventfd.Create() if err != nil { return nil, err } @@ -75,20 +73,11 @@ func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) const stopVal = 1 << 63 stopCh := make(chan struct{}) go func() { // S/R-SAFE: f provides synchronization if necessary - rw := fd.NewReadWriter(eventFD.FD()) - var buf [sizeofUint64]byte for { - n, err := rw.Read(buf[:]) + val, err := eventFD.Read() if err != nil { - if err == unix.EINTR { - continue - } panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err)) } - if n != sizeofUint64 { - panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) - } - val := hostarch.ByteOrder.Uint64(buf[:]) if val >= stopVal { // Assume this was due to the notifier's "destructor" (the // function returned by NotifyCurrentMemcgPressureCallback @@ -101,30 +90,7 @@ func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) } }() return func() { - rw := fd.NewReadWriter(eventFD.FD()) - var buf [sizeofUint64]byte - hostarch.ByteOrder.PutUint64(buf[:], stopVal) - for { - n, err := rw.Write(buf[:]) - if err != nil { - if err == unix.EINTR { - continue - } - panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err)) - } - if n != sizeofUint64 { - panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) - } - break - } + eventFD.Write(stopVal) <-stopCh }, nil } - -func newEventFD() (*fd.FD, error) { - f, _, e := unix.Syscall(unix.SYS_EVENTFD2, 0, 0, 0) - if e != 0 { - return nil, fmt.Errorf("failed to create eventfd: %v", e) - } - return fd.New(int(f)), nil -} diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 5bba9de0b..2363cec5f 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,13 +1,26 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package( default_visibility = ["//:sandbox"], licenses = ["notice"], ) +go_template_instance( + name = "atomicptr_netns", + out = "atomicptr_netns_unsafe.go", + package = "inet", + prefix = "Namespace", + template = "//pkg/sync/atomicptr:generic_atomicptr", + types = { + "Value": "Namespace", + }, +) + go_library( name = "inet", srcs = [ + "atomicptr_netns_unsafe.go", "context.go", "inet.go", "namespace.go", diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e4e0dc04f..c0f13bf52 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -268,6 +268,7 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/seccheck", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", @@ -281,7 +282,6 @@ go_library( "//pkg/state/wire", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/usermem", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 7a1a36454..9aa03f506 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -66,6 +66,5 @@ go_library( "//pkg/errors/linuxerr", "//pkg/log", "//pkg/sync", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go index c93ef6ac1..a0e291f58 100644 --- a/pkg/sentry/kernel/cgroup.go +++ b/pkg/sentry/kernel/cgroup.go @@ -196,6 +196,7 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files // uniqueness of controllers enforced by Register, drop the // dying hierarchy now. The eventual unregister by the FS // teardown will become a no-op. + r.unregisterLocked(h.id) return nil } return h.fs diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 564c3d42e..f240a68aa 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -9,13 +9,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 4466fbc9d..5ea44a2c2 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -22,13 +22,13 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -145,7 +145,7 @@ func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) if _, err := unix.Read(e.hostfd, buf[:]); err != nil { if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -165,7 +165,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro // We can't complete the read if the value is currently zero. if e.val == 0 { e.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } // Update the value based on the mode the event is operating in. @@ -198,7 +198,7 @@ func (e *EventOperations) hostWrite(val uint64) error { hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(e.hostfd, buf[:]) if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -230,7 +230,7 @@ func (e *EventOperations) Signal(val uint64) error { // uint64 minus 1. if val > math.MaxUint64-1-e.val { e.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } e.val += val diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index cfdea5cf7..c897e3a5f 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/log", "//pkg/sentry/memmap", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index f5c364c96..2c9ea65aa 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // KeyKind indicates the type of a Key. @@ -166,7 +165,7 @@ func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { case linux.FUTEX_OP_XOR: newVal = oldVal ^ opArg default: - return false, syserror.ENOSYS + return false, linuxerr.ENOSYS } prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) if err != nil { @@ -192,7 +191,7 @@ func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { case linux.FUTEX_OP_CMP_GE: return oldVal >= cmpArg, nil default: - return false, syserror.ENOSYS + return false, linuxerr.ENOSYS } } diff --git a/pkg/sentry/kernel/ipc/object.go b/pkg/sentry/kernel/ipc/object.go index 387b35e7e..facd157c7 100644 --- a/pkg/sentry/kernel/ipc/object.go +++ b/pkg/sentry/kernel/ipc/object.go @@ -19,6 +19,8 @@ package ipc import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -113,3 +115,36 @@ func (o *Object) CheckPermissions(creds *auth.Credentials, req fs.PermMask) bool } return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, o.UserNS) } + +// Set modifies attributes for an IPC object. See *ctl(IPC_SET). +// +// Precondition: Mechanism.mu must be held. +func (o *Object) Set(ctx context.Context, perm *linux.IPCPerm) error { + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(perm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(perm.GID)) + if !uid.Ok() || !gid.Ok() { + // The man pages don't specify an errno for invalid uid/gid, but EINVAL + // is generally used for invalid arguments. + return linuxerr.EINVAL + } + + if !o.CheckOwnership(creds) { + // "The argument cmd has the value IPC_SET or IPC_RMID, but the + // effective user ID of the calling process is not the creator (as + // found in msg_perm.cuid) or the owner (as found in msg_perm.uid) + // of the message queue, and the caller is not privileged (Linux: + // does not have the CAP_SYS_ADMIN capability)." + return linuxerr.EPERM + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. + mode := linux.FileMode(perm.Mode & 0x1ff) + + o.Perms = fs.FilePermsFromMode(mode) + o.Owner.UID = uid + o.Owner.GID = gid + + return nil +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index df5160b67..f913d25db 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -78,11 +78,19 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) -// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow -// easy access everywhere. To be removed once VFS2 becomes the default. +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global to allow +// easy access everywhere. +// +// TODO(gvisor.dev/issue/1624): Remove when VFS1 is no longer used. var VFS2Enabled = false -// FUSEEnabled is set to true when FUSE is enabled. Added as a global for allow +// LISAFSEnabled is set to true when lisafs protocol is enabled. Added as a +// global to allow easy access everywhere. +// +// TODO(gvisor.dev/issue/6319): Remove when lisafs is default. +var LISAFSEnabled = false + +// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow // easy access everywhere. To be removed once FUSE is completed. var FUSEEnabled = false diff --git a/pkg/sentry/kernel/msgqueue/msgqueue.go b/pkg/sentry/kernel/msgqueue/msgqueue.go index fab396d7c..c7c5e41fb 100644 --- a/pkg/sentry/kernel/msgqueue/msgqueue.go +++ b/pkg/sentry/kernel/msgqueue/msgqueue.go @@ -129,6 +129,16 @@ type Message struct { Size uint64 } +func (m *Message) makeCopy() *Message { + new := &Message{ + Type: m.Type, + Size: m.Size, + } + new.Text = make([]byte, len(m.Text)) + copy(new.Text, m.Text) + return new +} + // Blocker is used for blocking Queue.Send, and Queue.Receive calls that serves // as an abstracted version of kernel.Task. kernel.Task is not directly used to // prevent circular dependencies. @@ -206,6 +216,48 @@ func (r *Registry) FindByID(id ipc.ID) (*Queue, error) { return mech.(*Queue), nil } +// IPCInfo reports global parameters for message queues. See msgctl(IPC_INFO). +func (r *Registry) IPCInfo(ctx context.Context) *linux.MsgInfo { + return &linux.MsgInfo{ + MsgPool: linux.MSGPOOL, + MsgMap: linux.MSGMAP, + MsgMax: linux.MSGMAX, + MsgMnb: linux.MSGMNB, + MsgMni: linux.MSGMNI, + MsgSsz: linux.MSGSSZ, + MsgTql: linux.MSGTQL, + MsgSeg: linux.MSGSEG, + } +} + +// MsgInfo reports global parameters for message queues. See msgctl(MSG_INFO). +func (r *Registry) MsgInfo(ctx context.Context) *linux.MsgInfo { + r.mu.Lock() + defer r.mu.Unlock() + + var messages, bytes uint64 + r.reg.ForAllObjects( + func(o ipc.Mechanism) { + q := o.(*Queue) + q.mu.Lock() + messages += q.messageCount + bytes += q.byteCount + q.mu.Unlock() + }, + ) + + return &linux.MsgInfo{ + MsgPool: int32(r.reg.ObjectCount()), + MsgMap: int32(messages), + MsgTql: int32(bytes), + MsgMax: linux.MSGMAX, + MsgMnb: linux.MSGMNB, + MsgMni: linux.MSGMNI, + MsgSsz: linux.MSGSSZ, + MsgSeg: linux.MSGSEG, + } +} + // Send appends a message to the message queue, and returns an error if sending // fails. See msgsnd(2). func (q *Queue) Send(ctx context.Context, m Message, b Blocker, wait bool, pid int32) error { @@ -413,7 +465,7 @@ func (q *Queue) Copy(mType int64) (*Message, error) { if msg == nil { return nil, linuxerr.ENOMSG } - return msg, nil + return msg.makeCopy(), nil } // msgOfType returns the first message with the specified type, nil if no @@ -465,6 +517,73 @@ func (q *Queue) msgAtIndex(mType int64) *Message { return msg } +// Set modifies some values of the queue. See msgctl(IPC_SET). +func (q *Queue) Set(ctx context.Context, ds *linux.MsqidDS) error { + q.mu.Lock() + defer q.mu.Unlock() + + creds := auth.CredentialsFromContext(ctx) + if ds.MsgQbytes > maxQueueBytes && !creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, q.obj.UserNS) { + // "An attempt (IPC_SET) was made to increase msg_qbytes beyond the + // system parameter MSGMNB, but the caller is not privileged (Linux: + // does not have the CAP_SYS_RESOURCE capability)." + return linuxerr.EPERM + } + + if err := q.obj.Set(ctx, &ds.MsgPerm); err != nil { + return err + } + + q.maxBytes = ds.MsgQbytes + q.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// Stat returns a MsqidDS object filled with information about the queue. See +// msgctl(IPC_STAT) and msgctl(MSG_STAT). +func (q *Queue) Stat(ctx context.Context) (*linux.MsqidDS, error) { + return q.stat(ctx, fs.PermMask{Read: true}) +} + +// StatAny is similar to Queue.Stat, but doesn't require read permission. See +// msgctl(MSG_STAT_ANY). +func (q *Queue) StatAny(ctx context.Context) (*linux.MsqidDS, error) { + return q.stat(ctx, fs.PermMask{}) +} + +// stat returns a MsqidDS object filled with information about the queue. An +// error is returned if the user doesn't have the specified permissions. +func (q *Queue) stat(ctx context.Context, mask fs.PermMask) (*linux.MsqidDS, error) { + q.mu.Lock() + defer q.mu.Unlock() + + creds := auth.CredentialsFromContext(ctx) + if !q.obj.CheckPermissions(creds, mask) { + // "The caller must have read permission on the message queue." + return nil, linuxerr.EACCES + } + + return &linux.MsqidDS{ + MsgPerm: linux.IPCPerm{ + Key: uint32(q.obj.Key), + UID: uint32(creds.UserNamespace.MapFromKUID(q.obj.Owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(q.obj.Owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(q.obj.Creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(q.obj.Creator.GID)), + Mode: uint16(q.obj.Perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + MsgStime: q.sendTime.TimeT(), + MsgRtime: q.receiveTime.TimeT(), + MsgCtime: q.changeTime.TimeT(), + MsgCbytes: q.byteCount, + MsgQnum: q.messageCount, + MsgQbytes: q.maxBytes, + MsgLspid: q.sendPID, + MsgLrpid: q.receivePID, + }, nil +} + // Lock implements ipc.Mechanism.Lock. func (q *Queue) Lock() { q.mu.Lock() diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 94ebac7c5..5b2bac783 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -31,7 +31,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -51,7 +50,6 @@ go_test( "//pkg/errors/linuxerr", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 08786d704..615591507 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // inodeOperations implements fs.InodeOperations for pipes. @@ -95,7 +94,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { r.DecRef(ctx) - return nil, syserror.ErrInterrupted + return nil, linuxerr.ErrInterrupted } } @@ -118,7 +117,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if !waitFor(&i.mu, &i.rWakeup, ctx) { w.DecRef(ctx) - return nil, syserror.ErrInterrupted + return nil, linuxerr.ErrInterrupted } } return w, nil diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index d25cf658e..31bd7910a 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) type sleeper struct { @@ -240,7 +239,7 @@ func TestBlockedOpenIsCancellable(t *testing.T) { // If the cancel on the sleeper didn't work, the open for read would never // return. res := <-done - if res.error != syserror.ErrInterrupted { + if res.error != linuxerr.ErrInterrupted { t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", res.error) } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 85e3ce9f4..86beee6fe 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -201,7 +200,7 @@ func (p *Pipe) peekLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) if !p.HasWriters() { return 0, io.EOF } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } count = p.size } @@ -250,7 +249,7 @@ func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error) avail := p.max - p.size if avail == 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } short := false if count > avail { @@ -258,7 +257,7 @@ func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error) // (PIPE_BUF) be atomic, but requires no atomicity for writes // larger than this. if count <= atomicIOBytes { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } count = avail short = true @@ -307,7 +306,7 @@ func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error) // If we shortened the write, adjust the returned error appropriately. if short { - return done, syserror.ErrWouldBlock + return done, linuxerr.ErrWouldBlock } return done, nil diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index 867f4a76b..aa3ab305d 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -18,8 +18,8 @@ import ( "bytes" "testing" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -51,8 +51,8 @@ func TestPipeReadBlock(t *testing.T) { defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) - if n != 0 || err != syserror.ErrWouldBlock { - t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + if n != 0 || err != linuxerr.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, linuxerr.ErrWouldBlock) } } @@ -67,7 +67,7 @@ func TestPipeWriteBlock(t *testing.T) { msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) - if wantN, wantErr := int64(capacity), syserror.ErrWouldBlock; n != wantN || err != wantErr { + if wantN, wantErr := int64(capacity), linuxerr.ErrWouldBlock; n != wantN || err != wantErr { t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) } } @@ -102,7 +102,7 @@ func TestPipeWriteUntilEnd(t *testing.T) { for { n, err := r.Readv(ctx, dst) dst = dst.DropFirst64(n) - if err == syserror.ErrWouldBlock { + if err == linuxerr.ErrWouldBlock { select { case <-ch: continue @@ -129,7 +129,7 @@ func TestPipeWriteUntilEnd(t *testing.T) { for src.NumBytes() != 0 { n, err := w.Writev(ctx, src) src = src.DropFirst64(n) - if err == syserror.ErrWouldBlock { + if err == linuxerr.ErrWouldBlock { <-ch continue } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 077d5fd7f..a6f1989f5 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -121,7 +120,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // writer, we have to wait for a writer to open the other end. if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { fd.DecRef(ctx) - return nil, syserror.EINTR + return nil, linuxerr.EINTR } case writable: @@ -137,7 +136,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { fd.DecRef(ctx) - return nil, syserror.EINTR + return nil, linuxerr.EINTR } } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 079294f81..717c9a6b3 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -465,7 +464,7 @@ func (t *Task) ptraceUnfreezeLocked() { // stop. func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { - return syserror.EIO + return linuxerr.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() @@ -532,7 +531,7 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { } if seize { if err := target.ptraceSetOptionsLocked(opts); err != nil { - return syserror.EIO + return linuxerr.EIO } } target.ptraceTracer.Store(t) @@ -569,7 +568,7 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { // ptrace stop. func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { - return syserror.EIO + return linuxerr.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() @@ -967,7 +966,7 @@ func (t *Task) ptraceInterrupt(target *Task) error { return linuxerr.ESRCH } if !target.ptraceSeized { - return syserror.EIO + return linuxerr.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() @@ -1030,7 +1029,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { seize := req == linux.PTRACE_SEIZE if seize && addr != 0 { - return syserror.EIO + return linuxerr.EIO } return t.ptraceAttach(target, seize, uintptr(data)) } @@ -1120,13 +1119,13 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !target.ptraceSeized { - return syserror.EIO + return linuxerr.EIO } if target.ptraceSiginfo == nil { - return syserror.EIO + return linuxerr.EIO } if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { - return syserror.EIO + return linuxerr.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 63422e155..564add01b 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -19,8 +19,8 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -88,6 +88,6 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) err return err default: - return syserror.EIO + return linuxerr.EIO } } diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 27514d67b..7c2b94339 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -18,11 +18,11 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/syserror" ) // ptraceArch implements arch-specific ptrace commands. func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { - return syserror.EIO + return linuxerr.EIO } diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 54ca43c2e..0d66648c3 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -18,9 +18,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/syserror" ) const maxSyscallFilterInstructions = 1 << 15 @@ -176,7 +176,7 @@ func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { } if totalLength > maxSyscallFilterInstructions { - return syserror.ENOMEM + return linuxerr.ENOMEM } newFilters = append(newFilters, p) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 2ae08ed12..6aa74219e 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -31,7 +31,6 @@ go_library( "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/time", "//pkg/sync", - "//pkg/syserror", ], ) @@ -43,9 +42,9 @@ go_test( deps = [ "//pkg/abi/linux", # keep "//pkg/context", # keep + "//pkg/errors/linuxerr", #keep "//pkg/sentry/contexttest", # keep "//pkg/sentry/kernel/auth", # keep "//pkg/sentry/kernel/ipc", # keep - "//pkg/syserror", # keep ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 8610d3fc1..28e466948 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -151,10 +150,10 @@ func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, nsems int32, m // Map reg.objects and map indexes in a registry are of the same size, // check map reg.objects only here for the system limit. if r.reg.ObjectCount() >= setsMax { - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } if r.totalSems() > int(semsTotalMax-nsems) { - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } // Finally create a new set. @@ -337,19 +336,15 @@ func (s *Set) Size() int { return len(s.sems) } -// Change changes some fields from the set atomically. -func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { +// Set modifies attributes for a semaphore set. See semctl(IPC_SET). +func (s *Set) Set(ctx context.Context, ds *linux.SemidDS) error { s.mu.Lock() defer s.mu.Unlock() - // "The effective UID of the calling process must match the owner or creator - // of the semaphore set, or the caller must be privileged." - if !s.obj.CheckOwnership(creds) { - return linuxerr.EACCES + if err := s.obj.Set(ctx, &ds.SemPerm); err != nil { + return err } - s.obj.Owner = owner - s.obj.Perms = perms s.changeTime = ktime.NowFromContext(ctx) return nil } @@ -549,7 +544,7 @@ func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Cr // Did it race with a removal operation? if s.dead { - return nil, 0, syserror.EIDRM + return nil, 0, linuxerr.EIDRM } // Validate the operations. @@ -588,7 +583,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch if tmpVals[op.SemNum] != 0 { // Semaphore isn't 0, must wait. if op.SemFlg&linux.IPC_NOWAIT != 0 { - return nil, 0, syserror.ErrWouldBlock + return nil, 0, linuxerr.ErrWouldBlock } w := newWaiter(op.SemOp) @@ -604,7 +599,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch if -op.SemOp > tmpVals[op.SemNum] { // Not enough resources, must wait. if op.SemFlg&linux.IPC_NOWAIT != 0 { - return nil, 0, syserror.ErrWouldBlock + return nil, 0, linuxerr.ErrWouldBlock } w := newWaiter(op.SemOp) diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 2e4ab8121..59ac92ef1 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" - "gvisor.dev/gvisor/pkg/syserror" ) func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { @@ -124,14 +124,14 @@ func TestNoWait(t *testing.T) { ops[0].SemOp = -2 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { - t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + if _, _, err := set.executeOps(ctx, ops, 123); err != linuxerr.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, linuxerr.ErrWouldBlock) } ops[0].SemOp = 0 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { - t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + if _, _, err := set.executeOps(ctx, ops, 123); err != linuxerr.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, linuxerr.ErrWouldBlock) } } diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 4e8deac4c..2547957ba 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 2abf467d7..ab938fa3c 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -49,7 +49,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Registry tracks all shared memory segments in an IPC namespace. The registry @@ -151,7 +150,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, siz if r.reg.ObjectCount() >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } if !private { @@ -184,7 +183,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, siz // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } // Need to create a new segment. @@ -521,7 +520,7 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { - return memmap.MMapOpts{}, syserror.EIDRM + return memmap.MMapOpts{}, linuxerr.EIDRM } creds := auth.CredentialsFromContext(ctx) @@ -619,25 +618,10 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { s.mu.Lock() defer s.mu.Unlock() - creds := auth.CredentialsFromContext(ctx) - if !s.obj.CheckOwnership(creds) { - return linuxerr.EPERM - } - - uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) - gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) - if !uid.Ok() || !gid.Ok() { - return linuxerr.EINVAL + if err := s.obj.Set(ctx, &ds.ShmPerm); err != nil { + return err } - // User may only modify the lower 9 bits of the mode. All the other bits are - // always 0 for the underlying inode. - mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) - s.obj.Perms = fs.FilePermsFromMode(mode) - - s.obj.Owner.UID = uid - s.obj.Owner.GID = gid - s.changeTime = ktime.NowFromContext(ctx) return nil } diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 1110ecca5..4180ca28e 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 47958e2d4..9c5e6698c 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -99,7 +98,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS info, err := s.target.Sigtimedwait(s.Mask(), 0) if err != nil { // There must be no signal available. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Copy out the signal info using the specified format. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 59eeb253d..b0004482c 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -510,9 +511,7 @@ type Task struct { numaNodeMask uint64 // netns is the task's network namespace. netns is never nil. - // - // netns is protected by mu. - netns *inet.Namespace + netns inet.NamespaceAtomicPtr // If rseqPreempted is true, before the next call to p.Switch(), // interrupt rseq critical regions as defined by rseqAddr and @@ -874,3 +873,23 @@ func (t *Task) ResetKcov() { t.kcov = nil } } + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) loadSeccheckInfoLocked(req seccheck.TaskFieldSet, mask *seccheck.TaskFieldSet, info *seccheck.TaskInfo) { + if req.Contains(seccheck.TaskFieldThreadID) { + info.ThreadID = int32(t.k.tasks.Root.tids[t]) + mask.Add(seccheck.TaskFieldThreadID) + } + if req.Contains(seccheck.TaskFieldThreadStartTime) { + info.ThreadStartTime = t.startTime + mask.Add(seccheck.TaskFieldThreadStartTime) + } + if req.Contains(seccheck.TaskFieldThreadGroupID) { + info.ThreadGroupID = int32(t.k.tasks.Root.tgids[t.tg]) + mask.Add(seccheck.TaskFieldThreadGroupID) + } + if req.Contains(seccheck.TaskFieldThreadGroupStartTime) { + info.ThreadGroupStartTime = t.tg.leader.startTime + mask.Add(seccheck.TaskFieldThreadGroupStartTime) + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index b2520eecf..9bfc155e4 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // BlockWithTimeout blocks t until an event is received from C, the application @@ -33,7 +32,7 @@ import ( // and is unspecified if haveTimeout is false. // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout -// expired, and syserror.ErrInterrupted if t is interrupted. +// expired, and linuxerr.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { @@ -67,7 +66,7 @@ func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time. // application monotonic clock indicates a time of deadline (only if // haveDeadline is true), or t is interrupted. It returns nil if an event is // received from C, ETIMEDOUT if the deadline expired, and -// syserror.ErrInterrupted if t is interrupted. +// linuxerr.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline ktime.Time) error { @@ -95,7 +94,7 @@ func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline // BlockWithTimer blocks t until an event is received from C or tchan, or t is // interrupted. It returns nil if an event is received from C, ETIMEDOUT if an -// event is received from tchan, and syserror.ErrInterrupted if t is +// event is received from tchan, and linuxerr.ErrInterrupted if t is // interrupted. // // Most clients should use BlockWithDeadline or BlockWithTimeout instead. @@ -106,7 +105,7 @@ func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error { } // Block blocks t until an event is received from C or t is interrupted. It -// returns nil if an event is received from C and syserror.ErrInterrupted if t +// returns nil if an event is received from C and linuxerr.ErrInterrupted if t // is interrupted. // // Preconditions: The caller must be running on the task goroutine. @@ -157,7 +156,7 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { region.End() t.SleepFinish(false) // Return the indicated error on interrupt. - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted case <-timerChan: region.End() diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index da4b77ca2..a6d8fb163 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/usermem" ) @@ -235,7 +236,23 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) - t.traceCloneEvent(tid) + + if seccheck.Global.Enabled(seccheck.PointClone) { + mask, info := getCloneSeccheckInfo(t, nt, args) + if err := seccheck.Global.Clone(t, mask, &info); err != nil { + // nt has been visible to the rest of the system since NewTask, so + // it may be blocking execve or a group stop, have been notified + // for group signal delivery, had children reparented to it, etc. + // Thus we can't just drop it on the floor. Instead, instruct the + // task goroutine to exit immediately, as quietly as possible. + nt.exitTracerNotified = true + nt.exitTracerAcked = true + nt.exitParentNotified = true + nt.exitParentAcked = true + nt.runState = (*runExitMain)(nil) + return 0, nil, err + } + } // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - @@ -260,6 +277,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { ntid.CopyOut(t, hostarch.Addr(args.ParentTID)) } + t.traceCloneEvent(tid) kind := ptraceCloneKindClone if args.Flags&linux.CLONE_VFORK != 0 { kind = ptraceCloneKindVfork @@ -279,6 +297,22 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { return ntid, nil, nil } +func getCloneSeccheckInfo(t, nt *Task, args *linux.CloneArgs) (seccheck.CloneFieldSet, seccheck.CloneInfo) { + req := seccheck.Global.CloneReq() + info := seccheck.CloneInfo{ + Credentials: t.Credentials(), + Args: *args, + } + var mask seccheck.CloneFieldSet + mask.Add(seccheck.CloneFieldCredentials) + mask.Add(seccheck.CloneFieldArgs) + t.k.tasks.mu.RLock() + defer t.k.tasks.mu.RUnlock() + t.loadSeccheckInfoLocked(req.Invoker, &mask.Invoker, &info.Invoker) + nt.loadSeccheckInfoLocked(req.Created, &mask.Created, &info.Created) + return mask, info +} + // maybeBeginVforkStop checks if a previously-started vfork child is still // running and has not yet released its MM, such that its parent t should enter // a vforkStop. @@ -410,7 +444,7 @@ func (t *Task) Unshare(flags int32) error { t.mu.Unlock() return linuxerr.EPERM } - t.netns = inet.NewNamespace(t.netns) + t.netns.Store(inet.NewNamespace(t.netns.Load())) } if flags&linux.CLONE_NEWUTS != 0 { if !haveCapSysAdmin { diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index cf8571262..db91fc4d8 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -66,10 +66,10 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // execStop is a TaskStop that a task sets on itself when it wants to execve @@ -97,7 +97,7 @@ func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) { // We lost to a racing group-exit, kill, or exec from another thread // and should just exit. newImage.release() - return nil, syserror.EINTR + return nil, linuxerr.EINTR } // Cancel any racing group stops. @@ -222,9 +222,15 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // Update credentials to reflect the execve. This should precede switching // MMs to ensure that dumpability has been reset first, if needed. t.updateCredsForExecLocked() - t.image.release() + oldImage := t.image t.image = *r.image t.mu.Unlock() + + // Don't hold t.mu while calling t.image.release(), that may + // attempt to acquire TaskImage.MemoryManager.mappingMu, a lock order + // violation. + oldImage.release() + t.unstopVforkParent() t.p.FullStateChanged() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index fbfcc19e5..b3931445b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -32,7 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) @@ -230,9 +230,16 @@ func (*runExitMain) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Lock() t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() + + // Release the task image resources. Accessing these fields must be + // done with t.mu held, but the mm.DecUsers() call must be done outside + // of that lock. t.mu.Lock() - t.image.release() + mm := t.image.MemoryManager + t.image.MemoryManager = nil + t.image.fu = nil t.mu.Unlock() + mm.DecUsers(t) // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() @@ -859,7 +866,7 @@ func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { return wr, err } if err := t.Block(ch); err != nil { - return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + return wr, syserr.ConvertIntr(err, opts.BlockInterruptErr) } } } diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go index c132c27ef..6002ffb42 100644 --- a/pkg/sentry/kernel/task_image.go +++ b/pkg/sentry/kernel/task_image.go @@ -53,7 +53,7 @@ type TaskImage struct { } // release releases all resources held by the TaskImage. release is called by -// the task when it execs into a new TaskImage or exits. +// the task when it execs into a new TaskImage. func (image *TaskImage) release() { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 8de08151a..f0c168ecc 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -191,9 +191,11 @@ const ( // // Preconditions: The task's owning TaskSet.mu must be locked. func (t *Task) updateInfoLocked() { - // Use the task's TID in the root PID namespace for logging. + // Use the task's TID and PID in the root PID namespace for logging. + pid := t.tg.pidns.owner.Root.tgids[t.tg] tid := t.tg.pidns.owner.Root.tids[t] - t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.logPrefix.Store(fmt.Sprintf("[% 4d:% 4d] ", pid, tid)) + t.rebuildTraceContext(tid) } @@ -249,5 +251,9 @@ func (t *Task) traceExecEvent(image *TaskImage) { return } defer file.DecRef(t) - trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) + + // traceExecEvent function may be called before the task goroutine + // starts, so we must use the async context. + name := file.PathnameWithDeleted(t.AsyncContext()) + trace.Logf(t.traceContext, traceCategory, "exec: %s", name) } diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index f7711232c..e31e2b2e8 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -20,9 +20,7 @@ import ( // IsNetworkNamespaced returns true if t is in a non-root network namespace. func (t *Task) IsNetworkNamespaced() bool { - t.mu.Lock() - defer t.mu.Unlock() - return !t.netns.IsRoot() + return !t.netns.Load().IsRoot() } // NetworkContext returns the network stack used by the task. NetworkContext @@ -31,14 +29,10 @@ func (t *Task) IsNetworkNamespaced() bool { // TODO(gvisor.dev/issue/1833): Migrate callers of this method to // NetworkNamespace(). func (t *Task) NetworkContext() inet.Stack { - t.mu.Lock() - defer t.mu.Unlock() - return t.netns.Stack() + return t.netns.Load().Stack() } // NetworkNamespace returns the network namespace observed by the task. func (t *Task) NetworkNamespace() *inet.Namespace { - t.mu.Lock() - defer t.mu.Unlock() - return t.netns + return t.netns.Load() } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 054ff212f..7b336a46b 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -22,6 +22,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -29,7 +30,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/syserror" ) // A taskRunState is a reified state in the task state machine. See README.md @@ -197,8 +197,8 @@ func (app *runApp) execute(t *Task) taskRunState { // a pending signal, causing another interruption, but that signal should // not interact with the interrupted syscall.) if t.haveSyscallReturn { - if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { - if sre == syserror.ERESTART_RESTARTBLOCK { + if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok { + if sre == linuxerr.ERESTART_RESTARTBLOCK { t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) t.Arch().RestartSyscallWithRestartBlock() } else { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 7065ac79c..eeb3c5e69 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -161,7 +160,7 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu sigact := computeAction(sig, act) if t.haveSyscallReturn { - if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok { // Signals that are ignored, cause a thread group stop, or // terminate the thread group do not interact with interrupted // syscalls; in Linux terms, they are never returned to the signal @@ -170,13 +169,13 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu // signal that is actually handled (by userspace). if sigact == SignalActionHandler { switch { - case sre == syserror.ERESTARTNOHAND: + case sre == linuxerr.ERESTARTNOHAND: fallthrough - case sre == syserror.ERESTART_RESTARTBLOCK: + case sre == linuxerr.ERESTART_RESTARTBLOCK: fallthrough - case (sre == syserror.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0): + case (sre == linuxerr.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) - t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) + t.Arch().SetReturn(uintptr(-ExtractErrno(linuxerr.EINTR, -1))) default: t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().RestartSyscall() diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 0565059c1..4919dea7c 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // TaskConfig defines the configuration of a new Task (see below). @@ -141,7 +140,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, niceness: cfg.Niceness, - netns: cfg.NetworkNamespace, utsns: cfg.UTSNamespace, ipcns: cfg.IPCNamespace, abstractSockets: cfg.AbstractSocketNamespace, @@ -153,6 +151,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { containerID: cfg.ContainerID, cgroups: make(map[Cgroup]struct{}), } + t.netns.Store(cfg.NetworkNamespace) t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) @@ -170,7 +169,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // doesn't matter too much since the caller will exit before it returns // to userspace. If the caller isn't in the same thread group, then // we're in uncharted territory and can return whatever we want. - return nil, syserror.EINTR + return nil, linuxerr.EINTR } if err := ts.assignTIDsLocked(t); err != nil { return nil, err @@ -268,7 +267,7 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) { // fail with the error ENOMEM; it is not possible to create a new // processes [sic] in a PID namespace whose init process has // terminated." - pid_namespaces(7) - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } tid := ns.last for { diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 0586c9def..2b1d7e114 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" ) // SyscallRestartBlock represents the restart block for a syscall restartable @@ -383,8 +382,6 @@ func ExtractErrno(err error, sysno int) int { return int(err) case *errors.Error: return int(err.Errno()) - case syserror.SyscallRestartErrno: - return int(err) case *memmap.BusError: // Bus errors may generate SIGBUS, but for syscalls they still // return EFAULT. See case in task_run.go where the fault is @@ -397,8 +394,8 @@ func ExtractErrno(err error, sysno int) int { case *os.SyscallError: return ExtractErrno(err.Err, sysno) default: - if errno, ok := syserror.TranslateError(err); ok { - return int(errno) + if errno, ok := linuxerr.TranslateError(err); ok { + return int(errno.Errno()) } } panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 8e2c36598..bff226a11 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -105,7 +104,7 @@ func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ( // Each string has a zero terminating byte counted, so copying out a string // requires at least one byte of space. Also, see the calculation below. if maxTotalSize <= 0 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } thisMax := maxElemSize if maxTotalSize < thisMax { @@ -148,7 +147,7 @@ func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) erro } default: - return syserror.ENOSYS + return linuxerr.ENOSYS } return nil @@ -220,7 +219,7 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan } default: - return hostarch.AddrRangeSeq{}, syserror.ENOSYS + return hostarch.AddrRangeSeq{}, linuxerr.ENOSYS } // Truncate to MAX_RW_COUNT. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 2eda15303..5814a4eca 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -489,11 +489,6 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() - // TODO(gvisor.dev/issue/6148): "If tcsetpgrp() is called by a member of a - // background process group in its session, and the calling process is not - // blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all members of - // this background process group." - // tty must be the controlling terminal. if tg.tty != tty { return -1, linuxerr.ENOTTY @@ -516,6 +511,16 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) return -1, linuxerr.EPERM } + signalAction := tg.signalHandlers.actions[linux.SIGTTOU] + // If the calling process is a member of a background group, a SIGTTOU + // signal is sent to all members of this background process group. + // We need also need to check whether it is ignoring or blocking SIGTTOU. + ignored := signalAction.Handler == linux.SIG_IGN + blocked := tg.leader.signalMask == linux.SignalSetOf(linux.SIGTTOU) + if tg.processGroup.id != tg.processGroup.session.foreground.id && !ignored && !blocked { + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGTTOU), true) + } + tg.processGroup.session.foreground.id = pgid return 0, nil } diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 54bfed644..560a0f33c 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -37,7 +37,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/syserr", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 577374fa4..fb213d109 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -32,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -116,7 +115,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { log.Infof("Error reading ELF ident: %v", err) // The entire ident array always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return elfInfo{}, err } @@ -124,22 +123,22 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { // Only some callers pre-check the ELF magic. if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) { log.Infof("File is not an ELF") - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // We only support 64-bit, little endian binaries if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 { log.Infof("Unsupported ELF class: %v", class) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB { log.Infof("Unsupported ELF endianness: %v", endian) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT { log.Infof("Unsupported ELF version: %v", version) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // EI_OSABI is ignored by Linux, which is the only OS supported. os := abi.Linux @@ -151,7 +150,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return elfInfo{}, err } @@ -166,7 +165,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { a = arch.ARM64 default: log.Infof("Unsupported ELF machine %d", machine) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } var sharedObject bool @@ -178,25 +177,25 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { sharedObject = true default: log.Infof("Unsupported ELF type %v", elfType) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if int(hdr.Phentsize) != prog64Size { log.Infof("Unsupported phdr size %d", hdr.Phentsize) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } totalPhdrSize := prog64Size * int(hdr.Phnum) if totalPhdrSize < prog64Size { log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum)) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if totalPhdrSize > maxTotalPhdrSize { log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 { ctx.Infof("Unsupported phdr offset %d", hdr.Phoff) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } phdrBuf := make([]byte, totalPhdrSize) @@ -205,7 +204,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return elfInfo{}, err } @@ -248,19 +247,19 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr if !ok { // If offset != 0 we should have ensured this would fit. ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } addr -= hostarch.Addr(adjust) fileSize := phdr.Filesz + adjust if fileSize < phdr.Filesz { ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } ms, ok := hostarch.Addr(fileSize).RoundUp() if !ok { ctx.Infof("fileSize %#x too large", fileSize) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } mapSize := uint64(ms) @@ -321,7 +320,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr memSize := phdr.Memsz + adjust if memSize < phdr.Memsz { ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } // Allocate more anonymous pages if necessary. @@ -333,7 +332,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr anonSize, ok := hostarch.Addr(memSize - mapSize).RoundUp() if !ok { ctx.Infof("extra anon pages too large: %#x", memSize-mapSize) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } // N.B. Linux uses vm_brk_flags to map these pages, which only @@ -423,27 +422,27 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in // NOTE(b/37474556): Linux allows out-of-order // segments, in violation of the spec. ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } var ok bool end, ok = vaddr.AddLength(phdr.Memsz) if !ok { ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } case elf.PT_INTERP: if phdr.Filesz < 2 { ctx.Infof("PT_INTERP path too small: %v", phdr.Filesz) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } if phdr.Filesz > linux.PATH_MAX { ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 { ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } path := make([]byte, phdr.Filesz) @@ -451,12 +450,12 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in if err != nil { // If an interpreter was specified, it should exist. ctx.Infof("Error reading PT_INTERP path: %v", err) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } if path[len(path)-1] != 0 { ctx.Infof("PT_INTERP path not NUL-terminated: %v", path) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } // Strip NUL-terminator and everything beyond from @@ -498,7 +497,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in totalSize, ok := totalSize.RoundUp() if !ok { ctx.Infof("ELF PT_LOAD segments too big") - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } var err error @@ -592,7 +591,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // Check Image Compatibility. if arch.Host != info.arch { ctx.Warningf("Found mismatch for platform %s with ELF type %s", arch.Host.String(), info.arch.String()) - return loadedELF{}, nil, syserror.ENOEXEC + return loadedELF{}, nil, linuxerr.ENOEXEC } // Create the arch.Context now so we can prepare the mmap layout before @@ -681,7 +680,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error if interp.interpreter != "" { // No recursive interpreters! ctx.Infof("Interpreter requires an interpreter") - return loadedELF{}, nil, syserror.ENOEXEC + return loadedELF{}, nil, linuxerr.ENOEXEC } } diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 3e302d92c..1ec0d7019 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -19,8 +19,8 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsbridge" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,14 +43,14 @@ func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.Fil // Short read is OK. if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return "", []string{}, err } line = line[:n] if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) { - return "", []string{}, syserror.ENOEXEC + return "", []string{}, linuxerr.ENOEXEC } // Ignore #!. line = line[2:] @@ -82,7 +82,7 @@ func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.Fil if string(interp) == "" { ctx.Infof("Interpreter script contains no interpreter: %v", line) - return "", []string{}, syserror.ENOEXEC + return "", []string{}, linuxerr.ENOEXEC } // Build the new argument list: diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 86d0c54cd..2759ef71e 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -35,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -91,7 +90,7 @@ type LoadArgs struct { func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) { if args.Filename == "" { ctx.Infof("cannot open empty name") - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // TODO(gvisor.dev/issue/160): Linux requires only execute permission, @@ -172,7 +171,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return loadedELF{}, nil, nil, nil, err } @@ -190,7 +189,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): if args.CloseOnExec { - return loadedELF{}, nil, nil, nil, syserror.ENOENT + return loadedELF{}, nil, nil, nil, linuxerr.ENOENT } args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv) if err != nil { @@ -202,7 +201,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context default: ctx.Infof("Unknown magic: %v", hdr) - return loadedELF{}, nil, nil, nil, syserror.ENOEXEC + return loadedELF{}, nil, nil, nil, linuxerr.ENOEXEC } // Set to nil in case we loop on a Interpreter Script. args.File = nil @@ -296,15 +295,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) m.SetExecutable(ctx, file) - - symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn") - if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to find rt_sigreturn in vdso: %v", err), syserr.FromError(err).ToLinux()) - } - - // Found rt_sigretrun. - addr := uint64(vdsoAddr) + symbolValue - vdsoPrelink - m.SetVDSOSigReturn(addr) + m.SetVDSOSigReturn(uint64(vdsoAddr) + vdsoSigreturnOffset - vdsoPrelink) ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 054ef1723..bcee6aef6 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -19,7 +19,6 @@ import ( "debug/elf" "fmt" "io" - "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/context" @@ -34,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -102,14 +100,14 @@ func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, erro first = &info.phdrs[i] if phdr.Off != 0 { log.Warningf("First PT_LOAD segment has non-zero file offset") - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } } memoryOffset := phdr.Vaddr - first.Vaddr if memoryOffset != phdr.Off { log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // memsz larger than filesz means that extra zeroed space should be @@ -118,24 +116,24 @@ func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, erro // zeroes. if phdr.Memsz != phdr.Filesz { log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } start := hostarch.Addr(memoryOffset) end, ok := start.AddLength(phdr.Memsz) if !ok { log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if uint64(end) > size { log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if prev != nil { if start < prevEnd { log.Warningf("PT_LOAD segments out of order") - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // We mprotect entire pages, so each segment must be in @@ -144,7 +142,7 @@ func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, erro startPage := start.RoundDown() if prevEndPage >= startPage { log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } } prev = &info.phdrs[i] @@ -178,27 +176,6 @@ type VDSO struct { phdrs []elf.ProgHeader `state:".([]elfProgHeader)"` } -// getSymbolValueFromVDSO returns the specific symbol value in vdso.so. -func getSymbolValueFromVDSO(symbol string) (uint64, error) { - f, err := elf.NewFile(bytes.NewReader(vdsodata.Binary)) - if err != nil { - return 0, err - } - syms, err := f.Symbols() - if err != nil { - return 0, err - } - - for _, sym := range syms { - if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF { - if strings.Contains(sym.Name, symbol) { - return sym.Value, nil - } - } - } - return 0, fmt.Errorf("no %v in vdso.so", symbol) -} - // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { @@ -271,11 +248,11 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (hostarch.Addr, error) { if v.os != bin.os { ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } if v.arch != bin.arch { ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } // Reserve address space for the VDSO and its parameter page, which is @@ -348,35 +325,35 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) segAddr, ok := vdsoAddr.AddLength(memoryOffset) if !ok { ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } segPage := segAddr.RoundDown() segSize := hostarch.Addr(phdr.Memsz) segSize, ok = segSize.AddLength(segAddr.PageOffset()) if !ok { ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset()) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } segSize, ok = segSize.RoundUp() if !ok { ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset()) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } segEnd, ok := segPage.AddLength(uint64(segSize)) if !ok { ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } if segEnd > vdsoEnd { ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } perms := progFlagsAsPerms(phdr.Flags) if perms != hostarch.Read { if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil { ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } } } @@ -389,3 +366,21 @@ func (v *VDSO) Release(ctx context.Context) { v.ParamPage.DecRef(ctx) v.vdso.DecRef(ctx) } + +var vdsoSigreturnOffset = func() uint64 { + f, err := elf.NewFile(bytes.NewReader(vdsodata.Binary)) + if err != nil { + panic(fmt.Sprintf("failed to parse vdso.so as ELF file: %v", err)) + } + syms, err := f.Symbols() + if err != nil { + panic(fmt.Sprintf("failed to read symbols from vdso.so: %v", err)) + } + const sigreturnSymbol = "__kernel_rt_sigreturn" + for _, sym := range syms { + if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF && sym.Name == sigreturnSymbol { + return sym.Value + } + } + panic(fmt.Sprintf("no symbol %q in vdso.so", sigreturnSymbol)) +}() diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index c30e88725..a89bfa680 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -54,7 +54,6 @@ go_library( "//pkg/hostarch", "//pkg/log", "//pkg/safemem", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 69aff21b6..b7d782b7f 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -144,7 +144,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/buffer", "//pkg/usermem", ], diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index b7f765cd7..d71d64580 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -77,15 +77,6 @@ func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) return nil } - // Only unmaps after it assured that the address is a valid aio context to - // prevent random memory from been unmapped. - // - // Note: It's possible to unmap this address and map something else into - // the same address. Then it would be unmapping memory that it doesn't own. - // This is, however, the way Linux implements AIO. Keeps the same [weird] - // semantics in case anyone relies on it. - mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize) - delete(mm.aioManager.contexts, id) aioCtx.destroy() return aioCtx @@ -411,6 +402,15 @@ func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOC return nil } + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize) + mm.aioManager.mu.Lock() defer mm.aioManager.mu.Unlock() return mm.destroyAIOContextLocked(ctx, id) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 57969b26c..0fca59b64 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -28,6 +28,7 @@ // memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu +// kernel.TaskSet.mu // // Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in // multiple mm.MemoryManagers, as it does so in a well-defined order (forked diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 9f4cc238f..05cdcd8ae 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -324,20 +324,37 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) } } - // The majority of copy-on-write breaks on executable pages - // come from: - // - // - The ELF loader, which must zero out bytes on the last - // page of each segment after the end of the segment. - // - // - gdb's use of ptrace to insert breakpoints. - // - // Neither of these cases has enough spatial locality to - // benefit from copying nearby pages, so if the vma is - // executable, only copy the pages required. var copyAR hostarch.AddrRange - if vseg.ValuePtr().effectivePerms.Execute { + if vma := vseg.ValuePtr(); vma.effectivePerms.Execute { + // The majority of copy-on-write breaks on executable + // pages come from: + // + // - The ELF loader, which must zero out bytes on the + // last page of each segment after the end of the + // segment. + // + // - gdb's use of ptrace to insert breakpoints. + // + // Neither of these cases has enough spatial locality + // to benefit from copying nearby pages, so if the vma + // is executable, only copy the pages required. copyAR = pseg.Range().Intersect(ar) + } else if vma.growsDown { + // In most cases, the new process will not use most of + // its stack before exiting or invoking execve(); it is + // especially unlikely to return very far down its call + // stack, since async-signal-safety concerns in + // multithreaded programs prevent the new process from + // being able to do much. So only copy up to one page + // before and after the pages required. + stackMaskAR := ar + if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start { + stackMaskAR.Start = newStart + } + if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End { + stackMaskAR.End = newEnd + } + copyAR = pseg.Range().Intersect(stackMaskAR) } else { copyAR = pseg.Range().Intersect(maskAR) } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 256eb4afb..dc12ad357 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" ) // HandleUserFault handles an application page fault. sp is the faulting @@ -79,7 +78,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostar } length, ok := hostarch.Addr(opts.Length).RoundUp() if !ok { - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } opts.Length = uint64(length) @@ -90,7 +89,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostar } // Offset + length must not overflow. if end := opts.Offset + opts.Length; end < opts.Offset { - return 0, syserror.ENOMEM + return 0, linuxerr.EOVERFLOW } } else { opts.Offset = 0 @@ -253,7 +252,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, erro ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize) sz = maxStackSize } else if sz == 0 { - return hostarch.AddrRange{}, syserror.ENOMEM + return hostarch.AddrRange{}, linuxerr.ENOMEM } szaddr := hostarch.Addr(sz) ctx.Debugf("Allocating stack with size of %v bytes", sz) @@ -262,7 +261,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, erro // randomization can't be disabled. stackEnd := mm.layout.MaxAddr - hostarch.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown() if stackEnd < szaddr { - return hostarch.AddrRange{}, syserror.ENOMEM + return hostarch.AddrRange{}, linuxerr.ENOMEM } stackStart := stackEnd - szaddr mm.mappingMu.Lock() @@ -500,7 +499,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // Check against RLIMIT_AS. newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } if vma := vseg.ValuePtr(); vma.mappable != nil { @@ -599,11 +598,11 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h } rlength, ok := hostarch.Addr(length).RoundUp() if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(rlength)) if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } effectivePerms := realPerms.Effective() @@ -616,19 +615,19 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h // the non-growsDown case. vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { - return syserror.ENOMEM + return linuxerr.ENOMEM } if growsDown { if !vseg.ValuePtr().growsDown { return linuxerr.EINVAL } if ar.End <= vseg.Start() { - return syserror.ENOMEM + return linuxerr.ENOMEM } ar.Start = vseg.Start() } else { if ar.Start < vseg.Start() { - return syserror.ENOMEM + return linuxerr.ENOMEM } } @@ -688,7 +687,7 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h } vseg, _ = vseg.NextNonEmpty() if !vseg.Ok() { - return syserror.ENOMEM + return linuxerr.ENOMEM } } } @@ -724,7 +723,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { addr = mm.brk.End mm.mappingMu.Unlock() - return addr, syserror.ENOMEM + return addr, linuxerr.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() @@ -798,7 +797,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u } if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { mm.mappingMu.Unlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } } } @@ -835,7 +834,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u mm.vmas.MergeAdjacent(ar) if unmapped { mm.mappingMu.Unlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } if mode == memmap.MLockEager { @@ -850,7 +849,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u // case, which is converted to ENOMEM by mlock. mm.activeMu.Unlock() mm.mappingMu.RUnlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), hostarch.NoAccess) if err != nil { @@ -858,7 +857,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u mm.mappingMu.RUnlock() // Linux: mm/mlock.c:__mlock_posix_error_return() if linuxerr.Equals(linuxerr.EFAULT, err) { - return syserror.ENOMEM + return linuxerr.ENOMEM } if linuxerr.Equals(linuxerr.ENOMEM, err) { return linuxerr.EAGAIN @@ -917,7 +916,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error } if uint64(mm.vmas.Span()) > mlockLimit { mm.mappingMu.Unlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } } } @@ -1040,7 +1039,7 @@ func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork } if mm.vmas.SpanRange(ar) != ar.Length() { - return syserror.ENOMEM + return linuxerr.ENOMEM } return nil } @@ -1099,7 +1098,7 @@ func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { // to the rest (but returns ENOMEM from the system call, as it should)." - // madvise(2) if mm.vmas.SpanRange(ar) != ar.Length() { - return syserror.ENOMEM + return linuxerr.ENOMEM } return nil } @@ -1123,11 +1122,11 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u } la, ok := hostarch.Addr(length).RoundUp() if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } mm.mappingMu.RLock() @@ -1135,7 +1134,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { mm.mappingMu.RUnlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } var unmapped bool lastEnd := ar.Start @@ -1184,7 +1183,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u } if unmapped { - return syserror.ENOMEM + return linuxerr.ENOMEM } return nil } diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 5f8ab7ca3..e34b7a2f7 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" ) // Preconditions: @@ -59,7 +58,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp newUsageAS -= uint64(mm.vmas.SpanRange(ar)) } if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { - return vmaIterator{}, hostarch.AddrRange{}, syserror.ENOMEM + return vmaIterator{}, hostarch.AddrRange{}, linuxerr.ENOMEM } if opts.MLockMode != memmap.MLockNone { @@ -178,7 +177,7 @@ func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOp // Fixed mappings accept only the requested address. if opts.Fixed { - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } // Prefer hugepage alignment if a hugepage or more is requested. @@ -216,7 +215,7 @@ func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bou return gr.Start, nil } } - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } // Preconditions: mm.mappingMu must be locked. @@ -236,7 +235,7 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return start, nil } } - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } // Preconditions: mm.mappingMu must be locked. diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index d351869ef..496a9fd97 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -97,7 +97,6 @@ go_library( "//pkg/state", "//pkg/state/wire", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 0c8542485..68e17d343 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -39,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // MemoryFile is a memmap.File whose pages may be allocated to arbitrary @@ -404,7 +403,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.File // Find a range in the underlying file. fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment) if !ok { - return memmap.FileRange{}, syserror.ENOMEM + return memmap.FileRange{}, linuxerr.ENOMEM } // Expand the file if needed. diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 8a490b3de..834d72408 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,13 +1,26 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "atomicptr_machine", + out = "atomicptr_machine_unsafe.go", + package = "kvm", + prefix = "machine", + template = "//pkg/sync/atomicptr:generic_atomicptr", + types = { + "Value": "machine", + }, +) + go_library( name = "kvm", srcs = [ "address_space.go", "address_space_amd64.go", "address_space_arm64.go", + "atomicptr_machine_unsafe.go", "bluepill.go", "bluepill_allocator.go", "bluepill_amd64.go", @@ -50,7 +63,6 @@ go_library( "//pkg/procid", "//pkg/ring0", "//pkg/ring0/pagetables", - "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/arch/fpu", @@ -58,6 +70,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/time", + "//pkg/sighandling", "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], @@ -69,10 +82,17 @@ go_test( "kvm_amd64_test.go", "kvm_amd64_test.s", "kvm_arm64_test.go", + "kvm_safecopy_test.go", "kvm_test.go", "virtual_map_test.go", ], library = ":kvm", + # FIXME(gvisor.dev/issue/3374): Not working with all build systems. + nogo = False, + # cgo has to be disabled. We have seen libc that blocks all signals and + # calls mmap from pthread_create, but we use SIGSYS to trap mmap system + # calls. + pure = True, tags = [ "manual", "nogotsan", @@ -81,8 +101,10 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/hostarch", + "//pkg/memutil", "//pkg/ring0", "//pkg/ring0/pagetables", + "//pkg/safecopy", "//pkg/sentry/arch", "//pkg/sentry/arch/fpu", "//pkg/sentry/platform", diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index bb9967b9f..5be2215ed 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -19,8 +19,8 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" - "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sighandling" ) // bluepill enters guest mode. @@ -61,6 +61,9 @@ var ( // This is called by bluepillHandler. savedHandler uintptr + // savedSigsysHandler is a pointer to the previos handler of the SIGSYS signals. + savedSigsysHandler uintptr + // dieTrampolineAddr is the address of dieTrampoline. dieTrampolineAddr uintptr ) @@ -94,7 +97,7 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { func init() { // Install the handler. - if err := safecopy.ReplaceSignalHandler(bluepillSignal, addrOfSighandler(), &savedHandler); err != nil { + if err := sighandling.ReplaceSignalHandler(bluepillSignal, addrOfSighandler(), &savedHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 0567c8d32..b2db2bb9f 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -71,10 +71,6 @@ func (c *vCPU) KernelSyscall() { if regs.Rax != ^uint64(0) { regs.Rip -= 2 // Rewind. } - // We only trigger a bluepill entry in the bluepill function, and can - // therefore be guaranteed that there is no floating point state to be - // loaded on resuming from halt. We only worry about saving on exit. - ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. // N.B. Since KernelSyscall is called when the kernel makes a syscall, // FS_BASE is already set for correct execution of this function. // @@ -112,8 +108,6 @@ func (c *vCPU) KernelException(vector ring0.Vector) { regs.Rip = 0 } // See above. - ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - // See above. ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } @@ -144,5 +138,5 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { // Set the context pointer to the saved floating point state. This is // where the guest data has been serialized, the kernel will restore // from this new pointer value. - context.Fpstate = uint64(uintptrValue(c.floatingPointState.BytePointer())) + context.Fpstate = uint64(uintptrValue(c.FloatingPointState().BytePointer())) // escapes: no. } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index c2a1dca11..5d8358f64 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -32,6 +32,8 @@ // This is checked as the source of the fault. #define CLI $0xfa +#define SYS_MMAP 9 + // See bluepill.go. TEXT ·bluepill(SB),NOSPLIT,$0 begin: @@ -95,6 +97,31 @@ TEXT ·addrOfSighandler(SB), $0-8 MOVQ AX, ret+0(FP) RET +TEXT ·sigsysHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $1, CX + CMPL CX, 0x8(SI) + JNE fallback + + MOVL CONTEXT_RAX(DX), CX + CMPL CX, $SYS_MMAP + JNE fallback + PUSHQ DX // First argument (context). + CALL ·seccompMmapHandler(SB) // Call the handler. + POPQ DX // Discard the argument. + RET +fallback: + // Jump to the previous signal handler. + XORQ CX, CX + MOVQ ·savedSigsysHandler(SB), AX + JMP AX + +// func addrOfSighandler() uintptr +TEXT ·addrOfSigsysHandler(SB), $0-8 + MOVQ $·sigsysHandler(SB), AX + MOVQ AX, ret+0(FP) + RET + // dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation. TEXT ·dieTrampoline(SB),NOSPLIT,$0 PUSHQ BX // First argument (vCPU). diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index acb0cb05f..df772d620 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -70,7 +70,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { - fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) + fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no context.Fpsimd64.Fpsr = fpsimd.Fpsr context.Fpsimd64.Fpcr = fpsimd.Fpcr context.Fpsimd64.Vregs = fpsimd.Vregs @@ -90,12 +90,12 @@ func (c *vCPU) KernelSyscall() { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) + fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs(c.floatingPointState.BytePointer()) + ring0.SaveVRegs(c.FloatingPointState().BytePointer()) // escapes: no } ring0.Halt() @@ -114,12 +114,12 @@ func (c *vCPU) KernelException(vector ring0.Vector) { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) + fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs(c.floatingPointState.BytePointer()) + ring0.SaveVRegs(c.FloatingPointState().BytePointer()) // escapes: no } ring0.Halt() diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s index 308f2a951..9690e3772 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.s +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -29,9 +29,12 @@ // Only limited use of the context is done in the assembly stub below, most is // done in the Go handlers. #define SIGINFO_SIGNO 0x0 +#define SIGINFO_CODE 0x8 #define CONTEXT_PC 0x1B8 #define CONTEXT_R0 0xB8 +#define SYS_MMAP 222 + // getTLS returns the value of TPIDR_EL0 register. TEXT ·getTLS(SB),NOSPLIT,$0-8 MRS TPIDR_EL0, R1 @@ -98,6 +101,37 @@ TEXT ·addrOfSighandler(SB), $0-8 MOVD R0, ret+0(FP) RET +// The arguments are the following: +// +// R0 - The signal number. +// R1 - Pointer to siginfo_t structure. +// R2 - Pointer to ucontext structure. +// +TEXT ·sigsysHandler(SB),NOSPLIT,$0 + // si_code should be SYS_SECCOMP. + MOVD SIGINFO_CODE(R1), R7 + CMPW $1, R7 + BNE fallback + + CMPW $SYS_MMAP, R8 + BNE fallback + + MOVD R2, 8(RSP) + BL ·seccompMmapHandler(SB) // Call the handler. + + RET + +fallback: + // Jump to the previous signal handler. + MOVD ·savedHandler(SB), R7 + B (R7) + +// func addrOfSighandler() uintptr +TEXT ·addrOfSigsysHandler(SB), $0-8 + MOVD $·sigsysHandler(SB), R0 + MOVD R0, ret+0(FP) + RET + // dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation. TEXT ·dieTrampoline(SB),NOSPLIT,$0 // R0: Fake the old PC as caller diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 8fd8287b3..7a3c97c5a 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -55,11 +55,7 @@ func calculateBluepillFault(physical uintptr, phyRegions []physicalRegion) (virt } // Adjust the block to match our size. - physicalStart = alignedPhysical & faultBlockMask - if physicalStart < pr.physical { - // Bound the starting point to the start of the region. - physicalStart = pr.physical - } + physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask virtualStart = pr.virtual + (physicalStart - pr.physical) physicalEnd := physicalStart + faultBlockSize if physicalEnd > end { diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 0f0c1e73b..e38ca05c0 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -193,36 +193,8 @@ func bluepillHandler(context unsafe.Pointer) { return } - // Increment the fault count. - atomic.AddUint32(&c.faults, 1) - - // For MMIO, the physical address is the first data item. - physical = uintptr(c.runData.data[0]) - virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE) - if !ok { - c.die(bluepillArchContext(context), "invalid physical address") - return - } - - // We now need to fill in the data appropriately. KVM - // expects us to provide the result of the given MMIO - // operation in the runData struct. This is safe - // because, if a fault occurs here, the same fault - // would have occurred in guest mode. The kernel should - // not create invalid page table mappings. - data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1])) - length := (uintptr)((uint32)(c.runData.data[2])) - write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0 - for i := uintptr(0); i < length; i++ { - b := bytePtr(uintptr(virtual) + i) - if write { - // Write to the given address. - *b = data[i] - } else { - // Read from the given address. - data[i] = *b - } - } + c.die(bluepillArchContext(context), "exit_mmio") + return case _KVM_EXIT_IRQ_WINDOW_OPEN: bluepillStopGuest(c) case _KVM_EXIT_SHUTDOWN: diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index aac0fdffe..ad6863646 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -77,7 +77,11 @@ var ( // OpenDevice opens the KVM device at /dev/kvm and returns the File. func OpenDevice() (*os.File, error) { - f, err := os.OpenFile("/dev/kvm", unix.O_RDWR, 0) + dev, ok := os.LookupEnv("GVISOR_KVM_DEV") + if !ok { + dev = "/dev/kvm" + } + f, err := os.OpenFile(dev, unix.O_RDWR, 0) if err != nil { return nil, fmt.Errorf("error opening /dev/kvm: %v", err) } diff --git a/pkg/sentry/platform/kvm/kvm_safecopy_test.go b/pkg/sentry/platform/kvm/kvm_safecopy_test.go new file mode 100644 index 000000000..9a87c9e6f --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_safecopy_test.go @@ -0,0 +1,104 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// FIXME(gvisor.dev/issue//6629): These tests don't pass on ARM64. +// +//go:build amd64 +// +build amd64 + +package kvm + +import ( + "fmt" + "os" + "testing" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/safecopy" +) + +func testSafecopy(t *testing.T, mapSize uintptr, fileSize uintptr, testFunc func(t *testing.T, c *vCPU, addr uintptr)) { + memfd, err := memutil.CreateMemFD(fmt.Sprintf("kvm_test_%d", os.Getpid()), 0) + if err != nil { + t.Errorf("error creating memfd: %v", err) + } + + memfile := os.NewFile(uintptr(memfd), "kvm_test") + memfile.Truncate(int64(fileSize)) + kvmTest(t, nil, func(c *vCPU) bool { + const n = 10 + mappings := make([]uintptr, n) + defer func() { + for i := 0; i < n && mappings[i] != 0; i++ { + unix.RawSyscall( + unix.SYS_MUNMAP, + mappings[i], mapSize, 0) + } + }() + for i := 0; i < n; i++ { + addr, _, errno := unix.RawSyscall6( + unix.SYS_MMAP, + 0, + mapSize, + unix.PROT_READ|unix.PROT_WRITE, + unix.MAP_SHARED|unix.MAP_FILE, + uintptr(memfile.Fd()), + 0) + if errno != 0 { + t.Errorf("error mapping file: %v", errno) + } + mappings[i] = addr + testFunc(t, c, addr) + } + return false + }) +} + +func TestSafecopySigbus(t *testing.T) { + mapSize := uintptr(faultBlockSize) + fileSize := mapSize - hostarch.PageSize + buf := make([]byte, hostarch.PageSize) + testSafecopy(t, mapSize, fileSize, func(t *testing.T, c *vCPU, addr uintptr) { + want := safecopy.BusError{addr + fileSize} + bluepill(c) + _, err := safecopy.CopyIn(buf, unsafe.Pointer(addr+fileSize)) + if err != want { + t.Errorf("expected error: got %v, want %v", err, want) + } + }) +} + +func TestSafecopy(t *testing.T) { + mapSize := uintptr(faultBlockSize) + fileSize := mapSize + testSafecopy(t, mapSize, fileSize, func(t *testing.T, c *vCPU, addr uintptr) { + want := uint32(0x12345678) + bluepill(c) + _, err := safecopy.SwapUint32(unsafe.Pointer(addr+fileSize-8), want) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + bluepill(c) + val, err := safecopy.LoadUint32(unsafe.Pointer(addr + fileSize - 8)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if val != want { + t.Errorf("incorrect value: got %x, want %x", val, want) + } + }) +} diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index e7092a756..f1f7e4ea4 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -17,16 +17,20 @@ package kvm import ( "fmt" "runtime" + gosync "sync" "sync/atomic" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" + "gvisor.dev/gvisor/pkg/seccomp" ktime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" ) @@ -35,6 +39,9 @@ type machine struct { // fd is the vm fd. fd int + // machinePoolIndex is the index in the machinePool array. + machinePoolIndex uint32 + // nextSlot is the next slot for setMemoryRegion. // // This must be accessed atomically. If nextSlot is ^uint32(0), then @@ -192,6 +199,10 @@ func (m *machine) newVCPU() *vCPU { return c // Done. } +// readOnlyGuestRegions contains regions that have to be mapped read-only into +// the guest physical address space. Right now, it is used on arm64 only. +var readOnlyGuestRegions []region + // newMachine returns a new VM context. func newMachine(vm int) (*machine, error) { // Create the machine. @@ -227,6 +238,10 @@ func newMachine(vm int) (*machine, error) { m.upperSharedPageTables.MarkReadOnlyShared() m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) + // Install seccomp rules to trap runtime mmap system calls. They will + // be handled by seccompMmapHandler. + seccompMmapRules(m) + // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These // physical pages are mapped on demand, see kernel_unsafe.go. @@ -241,32 +256,11 @@ func newMachine(vm int) (*machine, error) { return true // Keep iterating. }) - var physicalRegionsReadOnly []physicalRegion - var physicalRegionsAvailable []physicalRegion - - physicalRegionsReadOnly = rdonlyRegionsForSetMem() - physicalRegionsAvailable = availableRegionsForSetMem() - - // Map all read-only regions. - for _, r := range physicalRegionsReadOnly { - m.mapPhysical(r.physical, r.length, physicalRegionsReadOnly, _KVM_MEM_READONLY) - } - // Ensure that the currently mapped virtual regions are actually // available in the VM. Note that this doesn't guarantee no future // faults, however it should guarantee that everything is available to // ensure successful vCPU entry. - applyVirtualRegions(func(vr virtualRegion) { - if excludeVirtualRegion(vr) { - return // skip region. - } - - for _, r := range physicalRegionsReadOnly { - if vr.virtual == r.virtual { - return - } - } - + mapRegion := func(vr region, flags uint32) { for virtual := vr.virtual; virtual < vr.virtual+vr.length; { physical, length, ok := translateToPhysical(virtual) if !ok { @@ -280,9 +274,32 @@ func newMachine(vm int) (*machine, error) { } // Ensure the physical range is mapped. - m.mapPhysical(physical, length, physicalRegionsAvailable, _KVM_MEM_FLAGS_NONE) + m.mapPhysical(physical, length, physicalRegions, flags) virtual += length } + } + + for _, vr := range readOnlyGuestRegions { + mapRegion(vr, _KVM_MEM_READONLY) + } + + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return // skip region. + } + for _, r := range readOnlyGuestRegions { + if vr.virtual == r.virtual { + return + } + } + // Take into account that the stack can grow down. + if vr.filename == "[stack]" { + vr.virtual -= 1 << 20 + vr.length += 1 << 20 + } + + mapRegion(vr.region, 0) + }) // Initialize architecture state. @@ -352,6 +369,10 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg func (m *machine) Destroy() { runtime.SetFinalizer(m, nil) + machinePoolMu.Lock() + machinePool[m.machinePoolIndex].Store(nil) + machinePoolMu.Unlock() + // Destroy vCPUs. for _, c := range m.vCPUsByID { if c == nil { @@ -519,15 +540,21 @@ func (c *vCPU) lock() { // //go:nosplit func (c *vCPU) unlock() { - if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) { + origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) + if origState == vCPUUser|vCPUGuest { // Happy path: no exits are forced, and we can continue // executing on our merry way with a single atomic access. return } // Clear the lock. - origState := atomic.LoadUint32(&c.state) - atomicbitops.AndUint32(&c.state, ^vCPUUser) + for { + state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) + if state == origState { + break + } + origState = state + } switch origState { case vCPUUser: // Normal state. @@ -677,3 +704,72 @@ func (c *vCPU) setSystemTimeLegacy() error { } } } + +const machinePoolSize = 16 + +// machinePool is enumerated from the seccompMmapHandler signal handler +var ( + machinePool [machinePoolSize]machineAtomicPtr + machinePoolLen uint32 + machinePoolMu sync.Mutex + seccompMmapRulesOnce gosync.Once +) + +func sigsysHandler() +func addrOfSigsysHandler() uintptr + +// seccompMmapRules adds seccomp rules to trap mmap system calls that will be +// handled in seccompMmapHandler. +func seccompMmapRules(m *machine) { + seccompMmapRulesOnce.Do(func() { + // Install the handler. + if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) + } + rules := []seccomp.RuleSet{} + rules = append(rules, []seccomp.RuleSet{ + // Trap mmap system calls and handle them in sigsysGoHandler + { + Rules: seccomp.SyscallRules{ + unix.SYS_MMAP: { + { + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + /* MAP_DENYWRITE is ignored and used only for filtering. */ + seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0), + }, + }, + }, + Action: linux.SECCOMP_RET_TRAP, + }, + }...) + instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW) + if err != nil { + panic(fmt.Sprintf("failed to build rules: %v", err)) + } + // Perform the actual installation. + if err := seccomp.SetFilter(instrs); err != nil { + panic(fmt.Sprintf("failed to set filter: %v", err)) + } + }) + + machinePoolMu.Lock() + n := atomic.LoadUint32(&machinePoolLen) + i := uint32(0) + for ; i < n; i++ { + if machinePool[i].Load() == nil { + break + } + } + if i == n { + if i == machinePoolSize { + machinePoolMu.Unlock() + panic("machinePool is full") + } + atomic.AddUint32(&machinePoolLen, 1) + } + machinePool[i].Store(m) + m.machinePoolIndex = i + machinePoolMu.Unlock() +} diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index a96634381..5bc023899 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" ) @@ -72,10 +71,6 @@ type vCPUArchState struct { // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs - - // floatingPointState is the floating point state buffer used in guest - // to host transitions. See usage in bluepill_amd64.go. - floatingPointState fpu.State } const ( @@ -152,12 +147,6 @@ func (c *vCPU) initArchState() error { return fmt.Errorf("error setting user registers: %v", errno) } - // Allocate some floating point state save area for the local vCPU. - // This will be saved prior to leaving the guest, and we restore from - // this always. We cannot use the pointer in the context alone because - // we don't know how large the area there is in reality. - c.floatingPointState = fpu.NewState() - // Set the time offset to the host native time. return c.setSystemTime() } @@ -309,22 +298,6 @@ func loadByte(ptr *byte) byte { return *ptr } -// prefaultFloatingPointState touches each page of the floating point state to -// be sure that its physical pages are mapped. -// -// Otherwise the kernel can trigger KVM_EXIT_MMIO and an instruction that -// triggered a fault will be emulated by the kvm kernel code, but it can't -// emulate instructions like xsave and xrstor. -// -//go:nosplit -func prefaultFloatingPointState(data *fpu.State) { - size := len(*data) - for i := 0; i < size; i += hostarch.PageSize { - loadByte(&(*data)[i]) - } - loadByte(&(*data)[size-1]) -} - // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) { // Check for canonical addresses. @@ -355,11 +328,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) // allocations occur. entersyscall() bluepill(c) - // The root table physical page has to be mapped to not fault in iret - // or sysret after switching into a user address space. sysret and - // iret are in the upper half that is global and already mapped. - switchOpts.PageTables.PrefaultRootTable() - prefaultFloatingPointState(switchOpts.FloatingPointState) vector = c.CPU.SwitchToUser(switchOpts) exitsyscall() @@ -522,3 +490,7 @@ func (m *machine) getNewVCPU() *vCPU { } return nil } + +func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion { + return physicalRegions +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index de798bb2c..fbacea9ad 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -161,3 +161,15 @@ func (c *vCPU) getSystemRegisters(sregs *systemRegs) unix.Errno { } return 0 } + +//go:nosplit +func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) { + ctx := bluepillArchContext(context) + + // MAP_DENYWRITE is deprecated and ignored by kernel. We use it only for seccomp filters. + addr, _, e := unix.RawSyscall6(uintptr(ctx.Rax), uintptr(ctx.Rdi), uintptr(ctx.Rsi), + uintptr(ctx.Rdx), uintptr(ctx.R10)|unix.MAP_DENYWRITE, uintptr(ctx.R8), uintptr(ctx.R9)) + ctx.Rax = uint64(addr) + + return addr, uintptr(ctx.Rsi), e +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 7937a8481..31998a600 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ) @@ -40,10 +39,6 @@ type vCPUArchState struct { // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs - - // floatingPointState is the floating point state buffer used in guest - // to host transitions. See usage in bluepill_arm64.go. - floatingPointState fpu.State } const ( @@ -110,18 +105,128 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { return phyRegions } +// archPhysicalRegions fills readOnlyGuestRegions and allocates separate +// physical regions form them. +func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion { + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return // skip region. + } + if !vr.accessType.Write { + readOnlyGuestRegions = append(readOnlyGuestRegions, vr.region) + } + }) + + rdRegions := readOnlyGuestRegions[:] + + // Add an unreachable region. + rdRegions = append(rdRegions, region{ + virtual: 0xffffffffffffffff, + length: 0, + }) + + var regions []physicalRegion + addValidRegion := func(r *physicalRegion, virtual, length uintptr) { + if length == 0 { + return + } + regions = append(regions, physicalRegion{ + region: region{ + virtual: virtual, + length: length, + }, + physical: r.physical + (virtual - r.virtual), + }) + } + i := 0 + for _, pr := range physicalRegions { + start := pr.virtual + end := pr.virtual + pr.length + for start < end { + rdRegion := rdRegions[i] + rdStart := rdRegion.virtual + rdEnd := rdRegion.virtual + rdRegion.length + if rdEnd <= start { + i++ + continue + } + if rdStart > start { + newEnd := rdStart + if end < rdStart { + newEnd = end + } + addValidRegion(&pr, start, newEnd-start) + start = rdStart + continue + } + if rdEnd < end { + addValidRegion(&pr, start, rdEnd-start) + start = rdEnd + continue + } + addValidRegion(&pr, start, end-start) + start = end + } + } + + return regions +} + // Get all available physicalRegions. -func availableRegionsForSetMem() (phyRegions []physicalRegion) { - var excludeRegions []region +func availableRegionsForSetMem() []physicalRegion { + var excludedRegions []region applyVirtualRegions(func(vr virtualRegion) { if !vr.accessType.Write { - excludeRegions = append(excludeRegions, vr.region) + excludedRegions = append(excludedRegions, vr.region) } }) - phyRegions = computePhysicalRegions(excludeRegions) + // Add an unreachable region. + excludedRegions = append(excludedRegions, region{ + virtual: 0xffffffffffffffff, + length: 0, + }) - return phyRegions + var regions []physicalRegion + addValidRegion := func(r *physicalRegion, virtual, length uintptr) { + if length == 0 { + return + } + regions = append(regions, physicalRegion{ + region: region{ + virtual: virtual, + length: length, + }, + physical: r.physical + (virtual - r.virtual), + }) + } + i := 0 + for _, pr := range physicalRegions { + start := pr.virtual + end := pr.virtual + pr.length + for start < end { + er := excludedRegions[i] + excludeEnd := er.virtual + er.length + excludeStart := er.virtual + if excludeEnd < start { + i++ + continue + } + if excludeStart < start { + start = excludeEnd + i++ + continue + } + rend := excludeStart + if rend > end { + rend = end + } + addValidRegion(&pr, start, rend-start) + start = excludeEnd + } + } + + return regions } // nonCanonical generates a canonical address return. diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 1a4a9ce7d..e73d5c544 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" ) @@ -159,8 +158,6 @@ func (c *vCPU) initArchState() error { c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) } - c.floatingPointState = fpu.NewState() - return c.setSystemTime() } @@ -333,3 +330,15 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) } } + +//go:nosplit +func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) { + ctx := bluepillArchContext(context) + + // MAP_DENYWRITE is deprecated and ignored by kernel. We use it only for seccomp filters. + addr, _, e := unix.RawSyscall6(uintptr(ctx.Regs[8]), uintptr(ctx.Regs[0]), uintptr(ctx.Regs[1]), + uintptr(ctx.Regs[2]), uintptr(ctx.Regs[3])|unix.MAP_DENYWRITE, uintptr(ctx.Regs[4]), uintptr(ctx.Regs[5])) + ctx.Regs[0] = uint64(addr) + + return addr, uintptr(ctx.Regs[1]), e +} diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index cc3a1253b..cf3a4e7c9 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -171,3 +171,46 @@ func (c *vCPU) setSignalMask() error { return nil } + +// seccompMmapHandler is a signal handler for runtime mmap system calls +// that are trapped by seccomp. +// +// It executes the mmap syscall with specified arguments and maps a new region +// to the guest. +// +//go:nosplit +func seccompMmapHandler(context unsafe.Pointer) { + addr, length, errno := seccompMmapSyscall(context) + if errno != 0 { + return + } + + for i := uint32(0); i < atomic.LoadUint32(&machinePoolLen); i++ { + m := machinePool[i].Load() + if m == nil { + continue + } + + // Map the new region to the guest. + vr := region{ + virtual: addr, + length: length, + } + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { + physical, length, ok := translateToPhysical(virtual) + if !ok { + // This must be an invalid region that was + // knocked out by creation of the physical map. + return + } + if virtual+length > vr.virtual+vr.length { + // Cap the length to the end of the area. + length = vr.virtual + vr.length - virtual + } + + // Ensure the physical range is mapped. + m.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE) + virtual += length + } + } +} diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index d812e6c26..9864d1258 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -168,6 +168,9 @@ func computePhysicalRegions(excludedRegions []region) (physicalRegions []physica } addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd) + // Do arch-specific actions on physical regions. + physicalRegions = archPhysicalRegions(physicalRegions) + // Dump our all physical regions. for _, r := range physicalRegions { log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)", diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index 6d0ba8252..346a10043 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -30,8 +30,8 @@ import ( func TLSWorks() bool // SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *arch.Registers, fn func()) { - regs.Pc = uint64(reflect.ValueOf(fn).Pointer()) +func SetTestTarget(regs *arch.Registers, fn uintptr) { + regs.Pc = uint64(fn) } // SetTouchTarget sets rax appropriately. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s index 7348c29a5..42876245a 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -28,6 +28,11 @@ TEXT ·Getpid(SB),NOSPLIT,$0 SVC RET +TEXT ·AddrOfGetpid(SB),NOSPLIT,$0-8 + MOVD $·Getpid(SB), R0 + MOVD R0, ret+0(FP) + RET + TEXT ·Touch(SB),NOSPLIT,$0 start: MOVD 0(R8), R1 @@ -35,21 +40,41 @@ start: SVC B start +TEXT ·AddrOfTouch(SB),NOSPLIT,$0-8 + MOVD $·Touch(SB), R0 + MOVD R0, ret+0(FP) + RET + TEXT ·HaltLoop(SB),NOSPLIT,$0 start: HLT B start +TEXT ·AddOfHaltLoop(SB),NOSPLIT,$0-8 + MOVD $·HaltLoop(SB), R0 + MOVD R0, ret+0(FP) + RET + // This function simulates a loop of syscall. TEXT ·SyscallLoop(SB),NOSPLIT,$0 start: SVC B start +TEXT ·AddrOfSyscallLoop(SB),NOSPLIT,$0-8 + MOVD $·SyscallLoop(SB), R0 + MOVD R0, ret+0(FP) + RET + TEXT ·SpinLoop(SB),NOSPLIT,$0 start: B start +TEXT ·AddrOfSpinLoop(SB),NOSPLIT,$0-8 + MOVD $·SpinLoop(SB), R0 + MOVD R0, ret+0(FP) + RET + TEXT ·TLSWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS MOVD $0x6789, R5 @@ -125,6 +150,11 @@ TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 SVC RET // never reached +TEXT ·AddrOfTwiddleRegsSyscall(SB),NOSPLIT,$0-8 + MOVD $·TwiddleRegsSyscall(SB), R0 + MOVD R0, ret+0(FP) + RET + TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 TWIDDLE_REGS() MSR R10, TPIDR_EL0 @@ -132,3 +162,8 @@ TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 // Branch to Register branches unconditionally to an address in <Rn>. JMP (R6) // <=> br x6, must fault RET // never reached + +TEXT ·AddrOfTwiddleRegsFault(SB),NOSPLIT,$0-8 + MOVD $·TwiddleRegsFault(SB), R0 + MOVD R0, ret+0(FP) + RET diff --git a/pkg/sentry/seccheck/BUILD b/pkg/sentry/seccheck/BUILD new file mode 100644 index 000000000..35feb969f --- /dev/null +++ b/pkg/sentry/seccheck/BUILD @@ -0,0 +1,58 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_fieldenum:defs.bzl", "go_fieldenum") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_fieldenum( + name = "seccheck_fieldenum", + srcs = [ + "clone.go", + "execve.go", + "exit.go", + "task.go", + ], + out = "seccheck_fieldenum.go", + package = "seccheck", +) + +go_template_instance( + name = "seqatomic_checkerslice", + out = "seqatomic_checkerslice_unsafe.go", + package = "seccheck", + suffix = "CheckerSlice", + template = "//pkg/sync/seqatomic:generic_seqatomic", + types = { + "Value": "[]Checker", + }, +) + +go_library( + name = "seccheck", + srcs = [ + "clone.go", + "execve.go", + "exit.go", + "seccheck.go", + "seccheck_fieldenum.go", + "seqatomic_checkerslice_unsafe.go", + "task.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/gohacks", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sync", + ], +) + +go_test( + name = "seccheck_test", + size = "small", + srcs = ["seccheck_test.go"], + library = ":seccheck", + deps = ["//pkg/context"], +) diff --git a/pkg/sentry/seccheck/clone.go b/pkg/sentry/seccheck/clone.go new file mode 100644 index 000000000..7546fa021 --- /dev/null +++ b/pkg/sentry/seccheck/clone.go @@ -0,0 +1,53 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// CloneInfo contains information used by the Clone checkpoint. +// +// +fieldenum Clone +type CloneInfo struct { + // Invoker identifies the invoking thread. + Invoker TaskInfo + + // Credentials are the invoking thread's credentials. + Credentials *auth.Credentials + + // Args contains the arguments to kernel.Task.Clone(). + Args linux.CloneArgs + + // Created identifies the created thread. + Created TaskInfo +} + +// CloneReq returns fields required by the Clone checkpoint. +func (s *state) CloneReq() CloneFieldSet { + return s.cloneReq.Load() +} + +// Clone is called at the Clone checkpoint. +func (s *state) Clone(ctx context.Context, mask CloneFieldSet, info *CloneInfo) error { + for _, c := range s.getCheckers() { + if err := c.Clone(ctx, mask, *info); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/seccheck/execve.go b/pkg/sentry/seccheck/execve.go new file mode 100644 index 000000000..f36e0730e --- /dev/null +++ b/pkg/sentry/seccheck/execve.go @@ -0,0 +1,65 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// ExecveInfo contains information used by the Execve checkpoint. +// +// +fieldenum Execve +type ExecveInfo struct { + // Invoker identifies the invoking thread. + Invoker TaskInfo + + // Credentials are the invoking thread's credentials. + Credentials *auth.Credentials + + // BinaryPath is a path to the executable binary file being switched to in + // the mount namespace in which it was opened. + BinaryPath string + + // Argv is the new process image's argument vector. + Argv []string + + // Env is the new process image's environment variables. + Env []string + + // BinaryMode is the executable binary file's mode. + BinaryMode uint16 + + // BinarySHA256 is the SHA-256 hash of the executable binary file. + // + // Note that this requires reading the entire file into memory, which is + // likely to be extremely slow. + BinarySHA256 [32]byte +} + +// ExecveReq returns fields required by the Execve checkpoint. +func (s *state) ExecveReq() ExecveFieldSet { + return s.execveReq.Load() +} + +// Execve is called at the Execve checkpoint. +func (s *state) Execve(ctx context.Context, mask ExecveFieldSet, info *ExecveInfo) error { + for _, c := range s.getCheckers() { + if err := c.Execve(ctx, mask, *info); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/seccheck/exit.go b/pkg/sentry/seccheck/exit.go new file mode 100644 index 000000000..69cb6911c --- /dev/null +++ b/pkg/sentry/seccheck/exit.go @@ -0,0 +1,57 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" +) + +// ExitNotifyParentInfo contains information used by the ExitNotifyParent +// checkpoint. +// +// +fieldenum ExitNotifyParent +type ExitNotifyParentInfo struct { + // Exiter identifies the exiting thread. Note that by the checkpoint's + // definition, Exiter.ThreadID == Exiter.ThreadGroupID and + // Exiter.ThreadStartTime == Exiter.ThreadGroupStartTime, so requesting + // ThreadGroup* fields is redundant. + Exiter TaskInfo + + // ExitStatus is the exiting thread group's exit status, as reported + // by wait*(). + ExitStatus linux.WaitStatus +} + +// ExitNotifyParentReq returns fields required by the ExitNotifyParent +// checkpoint. +func (s *state) ExitNotifyParentReq() ExitNotifyParentFieldSet { + return s.exitNotifyParentReq.Load() +} + +// ExitNotifyParent is called at the ExitNotifyParent checkpoint. +// +// The ExitNotifyParent checkpoint occurs when a zombied thread group leader, +// not waiting for exit acknowledgement from a non-parent ptracer, becomes the +// last non-dead thread in its thread group and notifies its parent of its +// exiting. +func (s *state) ExitNotifyParent(ctx context.Context, mask ExitNotifyParentFieldSet, info *ExitNotifyParentInfo) error { + for _, c := range s.getCheckers() { + if err := c.ExitNotifyParent(ctx, mask, *info); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/seccheck/seccheck.go b/pkg/sentry/seccheck/seccheck.go new file mode 100644 index 000000000..e13274096 --- /dev/null +++ b/pkg/sentry/seccheck/seccheck.go @@ -0,0 +1,158 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package seccheck defines a structure for dynamically-configured security +// checks in the sentry. +package seccheck + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sync" +) + +// A Point represents a checkpoint, a point at which a security check occurs. +type Point uint + +// PointX represents the checkpoint X. +const ( + PointClone Point = iota + PointExecve + PointExitNotifyParent + // Add new Points above this line. + pointLength + + numPointBitmaskUint32s = (int(pointLength)-1)/32 + 1 +) + +// A Checker performs security checks at checkpoints. +// +// Each Checker method X is called at checkpoint X; if the method may return a +// non-nil error and does so, it causes the checked operation to fail +// immediately (without calling subsequent Checkers) and return the error. The +// info argument contains information relevant to the check. The mask argument +// indicates what fields in info are valid; the mask should usually be a +// superset of fields requested by the Checker's corresponding CheckerReq, but +// may be missing requested fields in some cases (e.g. if the Checker is +// registered concurrently with invocations of checkpoints). +type Checker interface { + Clone(ctx context.Context, mask CloneFieldSet, info CloneInfo) error + Execve(ctx context.Context, mask ExecveFieldSet, info ExecveInfo) error + ExitNotifyParent(ctx context.Context, mask ExitNotifyParentFieldSet, info ExitNotifyParentInfo) error +} + +// CheckerDefaults may be embedded by implementations of Checker to obtain +// no-op implementations of Checker methods that may be explicitly overridden. +type CheckerDefaults struct{} + +// Clone implements Checker.Clone. +func (CheckerDefaults) Clone(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + return nil +} + +// Execve implements Checker.Execve. +func (CheckerDefaults) Execve(ctx context.Context, mask ExecveFieldSet, info ExecveInfo) error { + return nil +} + +// ExitNotifyParent implements Checker.ExitNotifyParent. +func (CheckerDefaults) ExitNotifyParent(ctx context.Context, mask ExitNotifyParentFieldSet, info ExitNotifyParentInfo) error { + return nil +} + +// CheckerReq indicates what checkpoints a corresponding Checker runs at, and +// what information it requires at those checkpoints. +type CheckerReq struct { + // Points are the set of checkpoints for which the corresponding Checker + // must be called. Note that methods not specified in Points may still be + // called; implementations of Checker may embed CheckerDefaults to obtain + // no-op implementations of Checker methods. + Points []Point + + // All of the following fields indicate what fields in the corresponding + // XInfo struct will be requested at the corresponding checkpoint. + Clone CloneFields + Execve ExecveFields + ExitNotifyParent ExitNotifyParentFields +} + +// Global is the method receiver of all seccheck functions. +var Global state + +// state is the type of global, and is separated out for testing. +type state struct { + // registrationMu serializes all changes to the set of registered Checkers + // for all checkpoints. + registrationMu sync.Mutex + + // enabledPoints is a bitmask of checkpoints for which at least one Checker + // is registered. + // + // enabledPoints is accessed using atomic memory operations. Mutation of + // enabledPoints is serialized by registrationMu. + enabledPoints [numPointBitmaskUint32s]uint32 + + // registrationSeq supports store-free atomic reads of registeredCheckers. + registrationSeq sync.SeqCount + + // checkers is the set of all registered Checkers in order of execution. + // + // checkers is accessed using instantiations of SeqAtomic functions. + // Mutation of checkers is serialized by registrationMu. + checkers []Checker + + // All of the following xReq variables indicate what fields in the + // corresponding XInfo struct have been requested by any registered + // checker, are accessed using atomic memory operations, and are mutated + // with registrationMu locked. + cloneReq CloneFieldSet + execveReq ExecveFieldSet + exitNotifyParentReq ExitNotifyParentFieldSet +} + +// AppendChecker registers the given Checker to execute at checkpoints. The +// Checker will execute after all previously-registered Checkers, and only if +// those Checkers return a nil error. +func (s *state) AppendChecker(c Checker, req *CheckerReq) { + s.registrationMu.Lock() + defer s.registrationMu.Unlock() + + s.cloneReq.AddFieldsLoadable(req.Clone) + s.execveReq.AddFieldsLoadable(req.Execve) + s.exitNotifyParentReq.AddFieldsLoadable(req.ExitNotifyParent) + + s.appendCheckerLocked(c) + for _, p := range req.Points { + word, bit := p/32, p%32 + atomic.StoreUint32(&s.enabledPoints[word], s.enabledPoints[word]|(uint32(1)<<bit)) + } +} + +// Enabled returns true if any Checker is registered for the given checkpoint. +func (s *state) Enabled(p Point) bool { + word, bit := p/32, p%32 + return atomic.LoadUint32(&s.enabledPoints[word])&(uint32(1)<<bit) != 0 +} + +func (s *state) getCheckers() []Checker { + return SeqAtomicLoadCheckerSlice(&s.registrationSeq, &s.checkers) +} + +// Preconditions: s.registrationMu must be locked. +func (s *state) appendCheckerLocked(c Checker) { + s.registrationSeq.BeginWrite() + s.checkers = append(s.checkers, c) + s.registrationSeq.EndWrite() +} diff --git a/pkg/sentry/seccheck/seccheck_test.go b/pkg/sentry/seccheck/seccheck_test.go new file mode 100644 index 000000000..687810d18 --- /dev/null +++ b/pkg/sentry/seccheck/seccheck_test.go @@ -0,0 +1,157 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + "errors" + "testing" + + "gvisor.dev/gvisor/pkg/context" +) + +type testChecker struct { + CheckerDefaults + + onClone func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error +} + +// Clone implements Checker.Clone. +func (c *testChecker) Clone(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + if c.onClone == nil { + return nil + } + return c.onClone(ctx, mask, info) +} + +func TestNoChecker(t *testing.T) { + var s state + if s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got true, wanted false") + } +} + +func TestCheckerNotRegisteredForPoint(t *testing.T) { + var s state + s.AppendChecker(&testChecker{}, &CheckerReq{}) + if s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got true, wanted false") + } +} + +func TestCheckerRegistered(t *testing.T) { + var s state + checkerCalled := false + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkerCalled = true + return nil + }}, &CheckerReq{ + Points: []Point{PointClone}, + Clone: CloneFields{ + Credentials: true, + }, + }) + + if !s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got false, wanted true") + } + if !s.CloneReq().Contains(CloneFieldCredentials) { + t.Errorf("CloneReq().Contains(CloneFieldCredentials): got false, wanted true") + } + if err := s.Clone(context.Background(), CloneFieldSet{}, &CloneInfo{}); err != nil { + t.Errorf("Clone(): got %v, wanted nil", err) + } + if !checkerCalled { + t.Errorf("Clone() did not call Checker.Clone()") + } +} + +func TestMultipleCheckersRegistered(t *testing.T) { + var s state + checkersCalled := [2]bool{} + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[0] = true + return nil + }}, &CheckerReq{ + Points: []Point{PointClone}, + Clone: CloneFields{ + Args: true, + }, + }) + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[1] = true + return nil + }}, &CheckerReq{ + Points: []Point{PointClone}, + Clone: CloneFields{ + Created: TaskFields{ + ThreadID: true, + }, + }, + }) + + if !s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got false, wanted true") + } + // CloneReq() should return the union of requested fields from all calls to + // AppendChecker. + req := s.CloneReq() + if !req.Contains(CloneFieldArgs) { + t.Errorf("req.Contains(CloneFieldArgs): got false, wanted true") + } + if !req.Created.Contains(TaskFieldThreadID) { + t.Errorf("req.Created.Contains(TaskFieldThreadID): got false, wanted true") + } + if err := s.Clone(context.Background(), CloneFieldSet{}, &CloneInfo{}); err != nil { + t.Errorf("Clone(): got %v, wanted nil", err) + } + for i := range checkersCalled { + if !checkersCalled[i] { + t.Errorf("Clone() did not call Checker.Clone() index %d", i) + } + } +} + +func TestCheckpointReturnsFirstCheckerError(t *testing.T) { + errFirstChecker := errors.New("first Checker error") + errSecondChecker := errors.New("second Checker error") + + var s state + checkersCalled := [2]bool{} + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[0] = true + return errFirstChecker + }}, &CheckerReq{ + Points: []Point{PointClone}, + }) + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[1] = true + return errSecondChecker + }}, &CheckerReq{ + Points: []Point{PointClone}, + }) + + if !s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got false, wanted true") + } + if err := s.Clone(context.Background(), CloneFieldSet{}, &CloneInfo{}); err != errFirstChecker { + t.Errorf("Clone(): got %v, wanted %v", err, errFirstChecker) + } + if !checkersCalled[0] { + t.Errorf("Clone() did not call first Checker") + } + if checkersCalled[1] { + t.Errorf("Clone() called second Checker") + } +} diff --git a/pkg/sentry/seccheck/task.go b/pkg/sentry/seccheck/task.go new file mode 100644 index 000000000..1dee33203 --- /dev/null +++ b/pkg/sentry/seccheck/task.go @@ -0,0 +1,39 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TaskInfo contains information unambiguously identifying a single thread +// and/or its containing process. +// +// +fieldenum Task +type TaskInfo struct { + // ThreadID is the thread's ID in the root PID namespace. + ThreadID int32 + + // ThreadStartTime is the thread's CLOCK_REALTIME start time. + ThreadStartTime ktime.Time + + // ThreadGroupID is the thread's group leader's ID in the root PID + // namespace. + ThreadGroupID int32 + + // ThreadGroupStartTime is the thread's group leader's CLOCK_REALTIME start + // time. + ThreadGroupStartTime ktime.Time +} diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD deleted file mode 100644 index 1790d57c9..000000000 --- a/pkg/sentry/sighandling/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "sighandling", - srcs = [ - "sighandling.go", - "sighandling_unsafe.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "@org_golang_x_sys//unix:go_default_library", - ], -) diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go deleted file mode 100644 index bdaf8af29..000000000 --- a/pkg/sentry/sighandling/sighandling.go +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package sighandling contains helpers for handling signals to applications. -package sighandling - -import ( - "os" - "os/signal" - "reflect" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" -) - -// numSignals is the number of normal (non-realtime) signals on Linux. -const numSignals = 32 - -// handleSignals listens for incoming signals and calls the given handler -// function. -// -// It stops when the stop channel is closed. The done channel is closed once it -// will no longer deliver signals to k. -func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) { - // Build a select case. - sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}} - for _, sigchan := range sigchans { - sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)}) - } - - for { - // Wait for a notification. - index, _, ok := reflect.Select(sc) - - // Was it the stop channel? - if index == 0 { - if !ok { - // Stop forwarding and notify that it's done. - close(done) - return - } - continue - } - - // How about a different close? - if !ok { - panic("signal channel closed unexpectedly") - } - - // Otherwise, it was a signal on channel N. Index 0 represents the stop - // channel, so index N represents the channel for signal N. - handler(linux.Signal(index)) - } -} - -// StartSignalForwarding ensures that synchronous signals are passed to the -// given handler function and returns a callback that stops signal delivery. -// -// Note that this function permanently takes over signal handling. After the -// stop callback, signals revert to the default Go runtime behavior, which -// cannot be overridden with external calls to signal.Notify. -func StartSignalForwarding(handler func(linux.Signal)) func() { - stop := make(chan struct{}) - done := make(chan struct{}) - - // Register individual channels. One channel per standard signal is - // required as os.Notify() is non-blocking and may drop signals. To avoid - // this, standard signals have to be queued separately. Channel size 1 is - // enough for standard signals as their semantics allow de-duplication. - // - // External real-time signals are not supported. We rely on the go-runtime - // for their handling. - var sigchans []chan os.Signal - for sig := 1; sig <= numSignals+1; sig++ { - sigchan := make(chan os.Signal, 1) - sigchans = append(sigchans, sigchan) - - // SIGURG is used by Go's runtime scheduler. - if sig == int(linux.SIGURG) { - continue - } - signal.Notify(sigchan, unix.Signal(sig)) - } - // Start up our listener. - go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. - - return func() { - close(stop) - <-done - } -} diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go deleted file mode 100644 index 3fe5c6770..000000000 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sighandling - -import ( - "unsafe" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" -) - -// IgnoreChildStop sets the SA_NOCLDSTOP flag, causing child processes to not -// generate SIGCHLD when they stop. -func IgnoreChildStop() error { - var sa linux.SigAction - - // Get the existing signal handler information, and set the flag. - if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(unix.SIGCHLD), 0, uintptr(unsafe.Pointer(&sa)), linux.SignalSetSize, 0, 0); e != 0 { - return e - } - sa.Flags |= linux.SA_NOCLDSTOP - if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(unix.SIGCHLD), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 { - return e - } - - return nil -} diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 7ee89a735..00f925166 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -4,7 +4,10 @@ package(licenses = ["notice"]) go_library( name = "socket", - srcs = ["socket.go"], + srcs = [ + "socket.go", + "socket_state.go", + ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 00a5e729a..6077b2150 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -29,10 +29,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "time" ) -const maxInt = int(^uint(0) >> 1) - // SCMCredentials represents a SCM_CREDENTIALS socket control message. type SCMCredentials interface { transport.CredentialsControlMessage @@ -78,7 +77,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { } // Files implements SCMRights.Files. -func (fs *RightsFiles) Files(ctx context.Context, max int) (RightsFiles, bool) { +func (fs *RightsFiles) Files(_ context.Context, max int) (RightsFiles, bool) { n := max var trunc bool if l := len(*fs); n > l { @@ -124,7 +123,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 break } - fds = append(fds, int32(fd)) + fds = append(fds, fd) } return fds, trunc } @@ -300,8 +299,8 @@ func alignSlice(buf []byte, align uint) []byte { } // PackTimestamp packs a SO_TIMESTAMP socket control message. -func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { - timestampP := linux.NsecToTimeval(timestamp) +func PackTimestamp(t *kernel.Task, timestamp time.Time, buf []byte) []byte { + timestampP := linux.NsecToTimeval(timestamp.UnixNano()) return putCmsgStruct( buf, linux.SOL_SOCKET, @@ -355,6 +354,17 @@ func PackIPPacketInfo(t *kernel.Task, packetInfo *linux.ControlMessageIPPacketIn ) } +// PackIPv6PacketInfo packs an IPV6_PKTINFO socket control message. +func PackIPv6PacketInfo(t *kernel.Task, packetInfo *linux.ControlMessageIPv6PacketInfo, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IPV6, + linux.IPV6_PKTINFO, + t.Arch().Width(), + packetInfo, + ) +} + // PackOriginalDstAddress packs an IP_RECVORIGINALDSTADDR socket control message. func PackOriginalDstAddress(t *kernel.Task, originalDstAddress linux.SockAddr, buf []byte) []byte { var level uint32 @@ -412,6 +422,10 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt buf = PackIPPacketInfo(t, &cmsgs.IP.PacketInfo, buf) } + if cmsgs.IP.HasIPv6PacketInfo { + buf = PackIPv6PacketInfo(t, &cmsgs.IP.IPv6PacketInfo, buf) + } + if cmsgs.IP.OriginalDstAddress != nil { buf = PackOriginalDstAddress(t, cmsgs.IP.OriginalDstAddress, buf) } @@ -453,6 +467,10 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { space += cmsgSpace(t, linux.SizeOfControlMessageIPPacketInfo) } + if cmsgs.IP.HasIPv6PacketInfo { + space += cmsgSpace(t, linux.SizeOfControlMessageIPv6PacketInfo) + } + if cmsgs.IP.OriginalDstAddress != nil { space += cmsgSpace(t, cmsgs.IP.OriginalDstAddress.SizeBytes()) } @@ -526,7 +544,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) } var ts linux.Timeval ts.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval]) - cmsgs.IP.Timestamp = ts.ToNsecCapped() + cmsgs.IP.Timestamp = ts.ToTime() cmsgs.IP.HasTimestamp = true i += bits.AlignUp(length, width) diff --git a/pkg/sentry/socket/control/control_test.go b/pkg/sentry/socket/control/control_test.go index 7e28a0cef..1b04e1bbc 100644 --- a/pkg/sentry/socket/control/control_test.go +++ b/pkg/sentry/socket/control/control_test.go @@ -50,7 +50,7 @@ func TestParse(t *testing.T) { want := socket.ControlMessages{ IP: socket.IPControlMessages{ HasTimestamp: true, - Timestamp: ts.ToNsecCapped(), + Timestamp: ts.ToTime(), }, } if diff := cmp.Diff(want, cmsg); diff != "" { diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 3950caa0f..4ea89f9d0 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -38,7 +38,6 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/vfs", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/usermem", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 38cb2c99c..6e2318f75 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -35,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -112,7 +111,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) })) - return int64(n), err + return n, err } // Write implements fs.FileOperations.Write. @@ -135,7 +134,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) })) - return int64(n), err + return n, err } // Socket implements socket.Provider.Socket. @@ -181,7 +180,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, proto } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } @@ -208,7 +207,7 @@ type socketOpsCommon struct { // Release implements fs.FileOperations.Release. func (s *socketOpsCommon) Release(context.Context) { fdnotifier.RemoveFD(int32(s.fd)) - unix.Close(s.fd) + _ = unix.Close(s.fd) } // Readiness implements waiter.Waitable.Readiness. @@ -219,13 +218,13 @@ func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { // EventRegister implements waiter.Waitable.EventRegister. func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.queue.EventRegister(e, mask) - fdnotifier.UpdateFD(int32(s.fd)) + _ = fdnotifier.UpdateFD(int32(s.fd)) } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.queue.EventUnregister(e) - fdnotifier.UpdateFD(int32(s.fd)) + _ = fdnotifier.UpdateFD(int32(s.fd)) } // Connect implements socket.Socket.Connect. @@ -288,7 +287,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) if blocking { var ch chan struct{} - for syscallErr == syserror.ErrWouldBlock { + for syscallErr == linuxerr.ErrWouldBlock { if ch != nil { if syscallErr = t.Block(ch); syscallErr != nil { break @@ -317,7 +316,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, if kernel.VFS2Enabled { f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&unix.SOCK_NONBLOCK)) if err != nil { - unix.Close(fd) + _ = unix.Close(fd) return 0, nil, 0, err } defer f.DecRef(t) @@ -329,7 +328,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, } else { f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&unix.SOCK_NONBLOCK != 0) if err != nil { - unix.Close(fd) + _ = unix.Close(fd) return 0, nil, 0, err } defer f.DecRef(t) @@ -344,7 +343,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, } // Bind implements socket.Socket.Bind. -func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -357,12 +356,12 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Listen implements socket.Socket.Listen. -func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(_ *kernel.Task, backlog int) *syserr.Error { return syserr.FromError(unix.Listen(s.fd, backlog)) } // Shutdown implements socket.Socket.Shutdown. -func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(_ *kernel.Task, how int) *syserr.Error { switch how { case unix.SHUT_RD, unix.SHUT_WR, unix.SHUT_RDWR: return syserr.FromError(unix.Shutdown(s.fd, how)) @@ -372,7 +371,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, _ hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } @@ -402,7 +401,7 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr case linux.TCP_NODELAY: optlen = sizeofInt32 case linux.TCP_INFO: - optlen = int(linux.SizeOfTCPInfo) + optlen = linux.SizeOfTCPInfo } } @@ -535,7 +534,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags n, err := copyToDst() // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. if flags&(unix.MSG_DONTWAIT|unix.MSG_ERRQUEUE) == 0 { - for err == syserror.ErrWouldBlock { + for err == linuxerr.ErrWouldBlock { // We only expect blocking to come from the actual syscall, in which // case it can't have returned any data. if n != 0 { @@ -580,7 +579,7 @@ func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) s controlMessages.IP.HasTimestamp = true ts := linux.Timeval{} ts.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfTimeval]) - controlMessages.IP.Timestamp = ts.ToNsecCapped() + controlMessages.IP.Timestamp = ts.ToTime() } case linux.SOL_IP: @@ -707,7 +706,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b var ch chan struct{} n, err := src.CopyInTo(t, sendmsgFromBlocks) if flags&unix.MSG_DONTWAIT == 0 { - for err == syserror.ErrWouldBlock { + for err == linuxerr.ErrWouldBlock { // We only expect blocking to come from the actual syscall, in which // case it can't have returned any data. if n != 0 { @@ -716,7 +715,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if ch != nil { if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -735,7 +734,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b func translateIOSyscallError(err error) error { if err == unix.EAGAIN || err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index ea56f39c1..b9c15daab 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -647,7 +647,7 @@ func (jt *JumpTarget) id() targetID { } // Action implements stack.Target.Action. -func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { +func (jt *JumpTarget) Action(*stack.PacketBuffer, stack.Hook, *stack.Route, stack.AddressableEndpoint) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index ed85404da..9710a15ee 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 5c3ae26f8..ed5fa9c38 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -39,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -530,7 +529,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } - if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if n, err := doRead(); err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { var mflags int if n < int64(r.MsgSize) { mflags |= linux.MSG_TRUNC @@ -548,7 +547,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - if n, err := doRead(); err != syserror.ErrWouldBlock { + if n, err := doRead(); err != linuxerr.ErrWouldBlock { var mflags int if n < int64(r.MsgSize) { mflags |= linux.MSG_TRUNC diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index e828982eb..075f61cda 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "device.go", "netstack.go", + "netstack_state.go", "netstack_vfs2.go", "provider.go", "provider_vfs2.go", @@ -42,13 +43,13 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/header", "//pkg/tcpip/link/tun", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/usermem", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 9b844b0c0..030c6c8e4 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -56,12 +56,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" - "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -275,6 +274,7 @@ var Metrics = tcpip.Stats{ ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), + SpuriousRecovery: mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), @@ -379,9 +379,9 @@ type socketOpsCommon struct { // timestampValid indicates whether timestamp for SIOCGSTAMP has been // set. It is protected by readMu. timestampValid bool - // timestampNS holds the timestamp to use with SIOCTSTAMP. It is only + // timestamp holds the timestamp to use with SIOCTSTAMP. It is only // valid when timestampValid is true. It is protected by readMu. - timestampNS int64 + timestamp time.Time `state:".(int64)"` // TODO(b/153685824): Move this to SocketOptions. // sockOptInq corresponds to TCP_INQ. @@ -411,13 +411,25 @@ var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() -// bytesToIPAddress converts an IPv4 or IPv6 address from the user to the -// netstack representation taking any addresses into account. -func bytesToIPAddress(addr []byte) tcpip.Address { - if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { - return "" +// minSockAddrLen returns the minimum length in bytes of a socket address for +// the socket's family. +func (s *socketOpsCommon) minSockAddrLen() int { + const addressFamilySize = 2 + + switch s.family { + case linux.AF_UNIX: + return addressFamilySize + case linux.AF_INET: + return sockAddrInetSize + case linux.AF_INET6: + return sockAddrInet6Size + case linux.AF_PACKET: + return sockAddrLinkSize + case linux.AF_UNSPEC: + return addressFamilySize + default: + panic(fmt.Sprintf("s.family unrecognized = %d", s.family)) } - return tcpip.Address(addr) } func (s *socketOpsCommon) isPacketBased() bool { @@ -448,7 +460,7 @@ func (s *socketOpsCommon) Release(ctx context.Context) { t := kernel.TaskFromContext(ctx) start := t.Kernel().MonotonicClock().Now() deadline := start.Add(v.Timeout) - t.BlockWithDeadline(ch, true, deadline) + _ = t.BlockWithDeadline(ch, true, deadline) } } @@ -459,7 +471,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { - return int64(n), syserror.ErrWouldBlock + return int64(n), linuxerr.ErrWouldBlock } if err != nil { return 0, err.ToError() @@ -468,7 +480,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } // WriteTo implements fs.FileOperations.WriteTo. -func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { +func (s *SocketOperations) WriteTo(_ context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { s.readMu.Lock() defer s.readMu.Unlock() @@ -492,14 +504,14 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if _, ok := err.(*tcpip.ErrWouldBlock); ok { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } if n < src.NumBytes() { - return n, syserror.ErrWouldBlock + return n, linuxerr.ErrWouldBlock } return n, nil @@ -523,7 +535,7 @@ func (l *limitedPayloader) Len() int { } // ReadFrom implements fs.FileOperations.ReadFrom. -func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { +func (s *SocketOperations) ReadFrom(_ context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { f := limitedPayloader{ inner: io.LimitedReader{ R: r, @@ -546,16 +558,21 @@ func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.Endpoint.Readiness(mask) } -func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { +// checkFamily returns true iff the specified address family may be used with +// the socket. +// +// If exact is true, then the specified address family must be an exact match +// with the socket's family. +func (s *socketOpsCommon) checkFamily(family uint16, exact bool) bool { if family == uint16(s.family) { - return nil + return true } if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { if !s.Endpoint.SocketOptions().GetV6Only() { - return nil + return true } } - return syserr.ErrInvalidArgument + return false } // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the @@ -588,8 +605,8 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool return syserr.TranslateNetstackError(err) } - if err := s.checkFamily(family, false /* exact */); err != nil { - return err + if !s.checkFamily(family, false /* exact */) { + return syserr.ErrInvalidArgument } addr = s.mapFamily(addr, family) @@ -629,7 +646,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) < 2 { return syserr.ErrInvalidArgument } @@ -647,23 +664,24 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } a.UnmarshalBytes(sockaddr[:sockAddrLinkSize]) - if a.Protocol != uint16(s.protocol) { - return syserr.ErrInvalidArgument - } - addr = tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + Port: socket.Ntohs(a.Protocol), } } else { + if s.minSockAddrLen() > len(sockaddr) { + return syserr.ErrInvalidArgument + } + var err *syserr.Error addr, family, err = socket.AddressAndFamily(sockaddr) if err != nil { return err } - if err = s.checkFamily(family, true /* exact */); err != nil { - return err + if !s.checkFamily(family, true /* exact */) { + return syserr.ErrAddressFamilyNotSupported } addr = s.mapFamily(addr, family) @@ -688,7 +706,7 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(_ *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) } @@ -779,7 +797,7 @@ func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(_ *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err @@ -860,7 +878,7 @@ func boolToInt32(v bool) int32 { } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: @@ -1345,6 +1363,14 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) return &v, nil + case linux.IPV6_RECVPKTINFO: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo())) + return &v, nil + case linux.IP6T_ORIGINAL_DST: if outLen < sockAddrInet6Size { return nil, syserr.ErrInvalidArgument @@ -1368,11 +1394,11 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true) + info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true) if err != nil { return nil, err } @@ -1388,11 +1414,11 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen) + entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } @@ -1408,8 +1434,8 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) @@ -1425,7 +1451,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name } // getSockOptIP implements GetSockOpt when level is SOL_IP. -func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) { +func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return nil, syserr.ErrUnknownProtocolOption @@ -1565,11 +1591,11 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false) + info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false) if err != nil { return nil, err } @@ -1585,11 +1611,11 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen) + entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } @@ -1605,8 +1631,8 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) @@ -2046,7 +2072,7 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name if isTCPSocket(skType, skProto) && tcp.EndpointState(ep.State()) != tcp.StateInitial { return syserr.ErrInvalidEndpointState - } else if isUDPSocket(skType, skProto) && udp.EndpointState(ep.State()) != udp.StateInitial { + } else if isUDPSocket(skType, skProto) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial { return syserr.ErrInvalidEndpointState } @@ -2101,6 +2127,15 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) return nil + case linux.IPV6_RECVPKTINFO: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(hostarch.ByteOrder.Uint32(optVal)) + + ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0) + return nil + case linux.IPV6_TCLASS: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument @@ -2143,12 +2178,12 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, true) + return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, true) case linux.IP6T_SO_SET_ADD_COUNTERS: log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") @@ -2386,12 +2421,12 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, false) + return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, false) case linux.IPT_SO_SET_ADD_COUNTERS: log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") @@ -2490,7 +2525,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) { linux.IPV6_RECVHOPLIMIT, linux.IPV6_RECVHOPOPTS, linux.IPV6_RECVPATHMTU, - linux.IPV6_RECVPKTINFO, linux.IPV6_RECVRTHDR, linux.IPV6_RTHDR, linux.IPV6_RTHDRDSTOPTS, @@ -2559,7 +2593,7 @@ func emitUnimplementedEventIP(t *kernel.Task, name int) { // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2571,7 +2605,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, * // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2716,6 +2750,8 @@ func (s *socketOpsCommon) controlMessages(cm tcpip.ControlMessages) socket.Contr TClass: readCM.TClass, HasIPPacketInfo: readCM.HasIPPacketInfo, PacketInfo: readCM.PacketInfo, + HasIPv6PacketInfo: readCM.HasIPv6PacketInfo, + IPv6PacketInfo: readCM.IPv6PacketInfo, OriginalDstAddress: readCM.OriginalDstAddress, SockErr: readCM.SockErr, }, @@ -2730,7 +2766,7 @@ func (s *socketOpsCommon) updateTimestamp(cm tcpip.ControlMessages) { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true - s.timestampNS = cm.Timestamp + s.timestamp = cm.Timestamp } } @@ -2789,7 +2825,7 @@ func (s *socketOpsCommon) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { if flags&linux.MSG_ERRQUEUE != 0 { return s.recvErr(t, dst) } @@ -2873,8 +2909,8 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if err != nil { return 0, err } - if err := s.checkFamily(family, false /* exact */); err != nil { - return 0, err + if !s.checkFamily(family, false /* exact */) { + return 0, syserr.ErrInvalidArgument } addrBuf = s.mapFamily(addrBuf, family) @@ -2951,10 +2987,10 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy s.readMu.Lock() defer s.readMu.Unlock() if !s.timestampValid { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } - tv := linux.NsecToTimeval(s.timestampNS) + tv := linux.NsecToTimeval(s.timestamp.UnixNano()) _, err := tv.CopyOut(t, args[2].Pointer()) return 0, err @@ -3061,7 +3097,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc } // interfaceIoctl implements interface requests. -func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { +func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { var ( iface inet.Interface index int32 @@ -3069,8 +3105,8 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe ) // Find the relevant device. - stack := inet.StackFromContext(ctx) - if stack == nil { + stk := inet.StackFromContext(ctx) + if stk == nil { return syserr.ErrNoDevice } @@ -3080,7 +3116,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe // Gets the name of the interface given the interface index // stored in ifr_ifindex. index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) - if iface, ok := stack.Interfaces()[index]; ok { + if iface, ok := stk.Interfaces()[index]; ok { ifr.SetName(iface.Name) return nil } @@ -3088,7 +3124,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } // Find the relevant device. - for index, iface = range stack.Interfaces() { + for index, iface = range stk.Interfaces() { if iface.Name == ifr.Name() { found = true break @@ -3121,7 +3157,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } case linux.SIOCGIFFLAGS: - f, err := interfaceStatusFlags(stack, iface.Name) + f, err := interfaceStatusFlags(stk, iface.Name) if err != nil { return err } @@ -3131,7 +3167,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe case linux.SIOCGIFADDR: // Copy the IPv4 address out. - for _, addr := range stack.InterfaceAddrs()[index] { + for _, addr := range stk.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue @@ -3167,7 +3203,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe case linux.SIOCGIFNETMASK: // Gets the network mask of a device. - for _, addr := range stack.InterfaceAddrs()[index] { + for _, addr := range stk.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue @@ -3199,24 +3235,24 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. -func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error { +func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. - stack := inet.StackFromContext(ctx) - if stack == nil { + stk := inet.StackFromContext(ctx) + if stk == nil { return syserr.ErrNoDevice.ToError() } if ifc.Ptr == 0 { - ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq) + ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq) return nil } max := ifc.Len ifc.Len = 0 - for key, ifaceAddrs := range stack.InterfaceAddrs() { - iface := stack.Interfaces()[key] + for key, ifaceAddrs := range stk.InterfaceAddrs() { + iface := stk.Interfaces()[key] for _, ifaceAddr := range ifaceAddrs { // Don't write past the end of the buffer. if ifc.Len+int32(linux.SizeOfIFReq) > max { @@ -3332,10 +3368,10 @@ func (s *socketOpsCommon) State() uint32 { } case isUDPSocket(s.skType, s.protocol): // UDP socket. - switch udp.EndpointState(s.Endpoint.State()) { - case udp.StateInitial, udp.StateBound, udp.StateClosed: + switch transport.DatagramEndpointState(s.Endpoint.State()) { + case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed: return linux.TCP_CLOSE - case udp.StateConnected: + case transport.DatagramEndpointStateConnected: return linux.TCP_ESTABLISHED default: return 0 diff --git a/pkg/sentry/socket/netstack/netstack_state.go b/pkg/sentry/socket/netstack/netstack_state.go new file mode 100644 index 000000000..591e00d42 --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack_state.go @@ -0,0 +1,31 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "time" +) + +func (s *socketOpsCommon) saveTimestamp() int64 { + s.readMu.Lock() + defer s.readMu.Unlock() + return s.timestamp.UnixNano() +} + +func (s *socketOpsCommon) loadTimestamp(nsec int64) { + s.readMu.Lock() + defer s.readMu.Unlock() + s.timestamp = time.Unix(0, nsec) +} diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index edc160b1b..3cdf29b80 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -113,7 +112,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. } n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { - return int64(n), syserror.ErrWouldBlock + return int64(n), linuxerr.ErrWouldBlock } if err != nil { return 0, err.ToError() @@ -132,14 +131,14 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if _, ok := err.(*tcpip.ErrWouldBlock); ok { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } if n < src.NumBytes() { - return n, syserror.ErrWouldBlock + return n, linuxerr.ErrWouldBlock } return n, nil diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 208ab9909..ea199f223 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -155,7 +155,7 @@ func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { // Attach address to interface. nicID := tcpip.NICID(idx) - if err := s.Stack.AddProtocolAddressWithOptions(nicID, protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + if err := s.Stack.AddProtocolAddress(nicID, protocolAddress, stack.AddressProperties{}); err != nil { return syserr.TranslateNetstackError(err).ToError() } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 658e90bb9..d4b80a39d 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -21,6 +21,7 @@ import ( "bytes" "fmt" "sync/atomic" + "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" @@ -51,8 +52,19 @@ type ControlMessages struct { func packetInfoToLinux(packetInfo tcpip.IPPacketInfo) linux.ControlMessageIPPacketInfo { var p linux.ControlMessageIPPacketInfo p.NIC = int32(packetInfo.NIC) - copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr)) - copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr)) + copy(p.LocalAddr[:], packetInfo.LocalAddr) + copy(p.DestinationAddr[:], packetInfo.DestinationAddr) + return p +} + +// ipv6PacketInfoToLinux converts IPv6PacketInfo from tcpip format to Linux +// format. +func ipv6PacketInfoToLinux(packetInfo tcpip.IPv6PacketInfo) linux.ControlMessageIPv6PacketInfo { + var p linux.ControlMessageIPv6PacketInfo + if n := copy(p.Addr[:], packetInfo.Addr); n != len(p.Addr) { + panic(fmt.Sprintf("got copy(%x, %x) = %d, want = %d", p.Addr, packetInfo.Addr, n, len(p.Addr))) + } + p.NIC = uint32(packetInfo.NIC) return p } @@ -114,7 +126,7 @@ func NewIPControlMessages(family int, cmgs tcpip.ControlMessages) IPControlMessa if cmgs.HasOriginalDstAddress { orgDstAddr, _ = ConvertAddress(family, cmgs.OriginalDstAddress) } - return IPControlMessages{ + cm := IPControlMessages{ HasTimestamp: cmgs.HasTimestamp, Timestamp: cmgs.Timestamp, HasInq: cmgs.HasInq, @@ -125,9 +137,16 @@ func NewIPControlMessages(family int, cmgs tcpip.ControlMessages) IPControlMessa TClass: cmgs.TClass, HasIPPacketInfo: cmgs.HasIPPacketInfo, PacketInfo: packetInfoToLinux(cmgs.PacketInfo), + HasIPv6PacketInfo: cmgs.HasIPv6PacketInfo, OriginalDstAddress: orgDstAddr, SockErr: sockErrCmsgToLinux(cmgs.SockErr), } + + if cm.HasIPv6PacketInfo { + cm.IPv6PacketInfo = ipv6PacketInfoToLinux(cmgs.IPv6PacketInfo) + } + + return cm } // IPControlMessages contains socket control messages for IP sockets. @@ -138,9 +157,9 @@ type IPControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool - // Timestamp is the time (in ns) that the last packet used to create - // the read data was received. - Timestamp int64 + // Timestamp is the time that the last packet used to create the read data + // was received. + Timestamp time.Time `state:".(int64)"` // HasInq indicates whether Inq is valid/set. HasInq bool @@ -166,6 +185,12 @@ type IPControlMessages struct { // PacketInfo holds interface and address data on an incoming packet. PacketInfo linux.ControlMessageIPPacketInfo + // HasIPv6PacketInfo indicates whether IPv6PacketInfo is set. + HasIPv6PacketInfo bool + + // PacketInfo holds interface and address data on an incoming packet. + IPv6PacketInfo linux.ControlMessageIPv6PacketInfo + // OriginalDestinationAddress holds the original destination address // and port of the incoming packet. OriginalDstAddress linux.SockAddr @@ -743,6 +768,8 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr[:sockAddrLinkSize]) + // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have + // an ethernet address. if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } @@ -750,6 +777,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + Port: Ntohs(a.Protocol), }, family, nil case linux.AF_UNSPEC: diff --git a/pkg/sentry/socket/socket_state.go b/pkg/sentry/socket/socket_state.go new file mode 100644 index 000000000..32e12b238 --- /dev/null +++ b/pkg/sentry/socket/socket_state.go @@ -0,0 +1,27 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package socket + +import ( + "time" +) + +func (i *IPControlMessages) saveTimestamp() int64 { + return i.Timestamp.UnixNano() +} + +func (i *IPControlMessages) loadTimestamp(nsec int64) { + i.Timestamp = time.Unix(0, nsec) +} diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 5c3cdef6a..7b546c04d 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -62,7 +62,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 33f9aeb06..b3f0cf563 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -129,9 +129,9 @@ func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProv stype: stype, } + ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) - ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) return ep } @@ -406,14 +406,15 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error { // Accept accepts a new connection. func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) { e.Lock() - defer e.Unlock() if !e.Listening() { + e.Unlock() return nil, syserr.ErrInvalidEndpointState } select { case ne := <-e.acceptedChan: + e.Unlock() if peerAddr != nil { ne.Lock() c := ne.connected @@ -429,6 +430,7 @@ func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *s return ne, nil default: + e.Unlock() // Nothing left. return nil, syserr.ErrWouldBlock } @@ -517,3 +519,6 @@ func (e *connectionedEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { } return v } + +// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. +func (e *connectionedEndpoint) WakeupWriters() {} diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 61338728a..61311718e 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -44,9 +44,9 @@ func NewConnectionless(ctx context.Context) Endpoint { q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: defaultBufferSize} q.InitRefs() ep.receiver = &queueReceiver{readQueue: &q} + ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) - ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) return ep } @@ -227,3 +227,6 @@ func (e *connectionlessEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { } return v } + +// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. +func (e *connectionlessEndpoint) WakeupWriters() {} diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index e4de44498..188ad3bd9 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -59,12 +59,14 @@ func (q *queue) Close() { // q.WriterQueue.Notify(waiter.WritableEvents) func (q *queue) Reset(ctx context.Context) { q.mu.Lock() - for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.Release(ctx) - } + dataList := q.dataList q.dataList.Reset() q.used = 0 q.mu.Unlock() + + for cur := dataList.Front(); cur != nil; cur = cur.Next() { + cur.Release(ctx) + } } // DecRef implements RefCounter.DecRef. @@ -133,7 +135,7 @@ func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, f free := q.limit - q.used if l > free && truncate { - if free == 0 { + if free <= 0 { // Message can't fit right now. q.mu.Unlock() return 0, false, syserr.ErrWouldBlock diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 8ccdadae9..e9e482017 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -38,7 +38,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -494,7 +493,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b } n, err := src.CopyInTo(t, &w) - if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.FromError(err) } @@ -514,13 +513,13 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b n, err = src.CopyInTo(t, &w) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -648,7 +647,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } var total int64 - if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait { + if n, err := doRead(); err != linuxerr.ErrWouldBlock || dontWait { var from linux.SockAddr var fromLen uint32 if r.From != nil && len([]byte(r.From.Addr)) != 0 { @@ -683,7 +682,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - if n, err := doRead(); err != syserror.ErrWouldBlock { + if n, err := doRead(); err != linuxerr.ErrWouldBlock { var from linux.SockAddr var fromLen uint32 if r.From != nil { diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 757ff2a40..4d3f4d556 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -610,9 +610,9 @@ func (i *SyscallInfo) printExit(t *kernel.Task, elapsed time.Duration, output [] if err == nil { // Fill in the output after successful execution. i.post(t, args, retval, output, LogMaximumSize) - rval = fmt.Sprintf("%#x (%v)", retval, elapsed) + rval = fmt.Sprintf("%d (%#x) (%v)", retval, retval, elapsed) } else { - rval = fmt.Sprintf("%#x errno=%d (%s) (%v)", retval, errno, err, elapsed) + rval = fmt.Sprintf("%d (%#x) errno=%d (%s) (%v)", retval, retval, errno, err, elapsed) } switch len(output) { diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index f2c55588f..7a7c80ac6 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -16,7 +16,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/time", - "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index b5a371d9a..394396cde 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -104,7 +104,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 76389fbe3..f4d549a3f 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) var ( @@ -90,9 +89,9 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er } // Translate error, if possible, to consolidate errors from other packages - // into a smaller set of errors from syserror package. + // into a smaller set of errors from linuxerr package. translatedErr := errOrig - if errno, ok := syserror.TranslateError(errOrig); ok { + if errno, ok := linuxerr.TranslateError(errOrig); ok { translatedErr = errno } switch { @@ -167,10 +166,7 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er // files. Since we have a partial read/write, we consume // ErrWouldBlock, returning the partial result. return true, nil - } - - switch errOrig.(type) { - case syserror.SyscallRestartErrno: + case linuxerr.IsRestartError(translatedErr): // Identical to the EINTR case. return true, nil } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 1ead3c7e8..2046a48b9 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -124,7 +123,7 @@ var AMD64 = &kernel.SyscallTable{ 68: syscalls.Supported("msgget", Msgget), 69: syscalls.Supported("msgsnd", Msgsnd), 70: syscalls.Supported("msgrcv", Msgrcv), - 71: syscalls.PartiallySupported("msgctl", Msgctl, "Only supports IPC_RMID option.", []string{"gvisor.dev/issue/135"}), + 71: syscalls.Supported("msgctl", Msgctl), 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), @@ -175,8 +174,8 @@ var AMD64 = &kernel.SyscallTable{ 119: syscalls.Supported("setresgid", Setresgid), 120: syscalls.Supported("getresgid", Getresgid), 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 122: syscalls.ErrorWithEvent("setfsuid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 124: syscalls.Supported("getsid", Getsid), 125: syscalls.Supported("capget", Capget), 126: syscalls.Supported("capset", Capset), @@ -187,12 +186,12 @@ var AMD64 = &kernel.SyscallTable{ 131: syscalls.Supported("sigaltstack", Sigaltstack), 132: syscalls.Supported("utime", Utime), 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 134: syscalls.Error("uselib", linuxerr.ENOSYS, "Obsolete", nil), 135: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 136: syscalls.ErrorWithEvent("ustat", linuxerr.ENOSYS, "Needs filesystem support.", nil), 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 139: syscalls.ErrorWithEvent("sysfs", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/165"}), 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), @@ -230,15 +229,15 @@ var AMD64 = &kernel.SyscallTable{ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 177: syscalls.Error("get_kernel_syms", linuxerr.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", linuxerr.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 180: syscalls.Error("nfsservctl", linuxerr.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", linuxerr.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), @@ -258,18 +257,18 @@ var AMD64 = &kernel.SyscallTable{ 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 205: syscalls.Error("set_thread_area", linuxerr.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 211: syscalls.Error("get_thread_area", linuxerr.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", linuxerr.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", linuxerr.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", linuxerr.ENOSYS, "Deprecated since Linux 3.16.", nil), 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), @@ -289,16 +288,16 @@ var AMD64 = &kernel.SyscallTable{ 233: syscalls.Supported("epoll_ctl", EpollCtl), 234: syscalls.Supported("tgkill", Tgkill), 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), + 236: syscalls.Error("vserver", linuxerr.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 240: syscalls.ErrorWithEvent("mq_open", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 247: syscalls.Supported("waitid", Waitid), 248: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), @@ -331,7 +330,7 @@ var AMD64 = &kernel.SyscallTable{ 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 278: syscalls.ErrorWithEvent("vmsplice", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 280: syscalls.Supported("utimensat", Utimensat), 281: syscalls.Supported("epoll_pwait", EpollPwait), @@ -353,8 +352,8 @@ var AMD64 = &kernel.SyscallTable{ 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), 298: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.Supported("prlimit64", Prlimit64), 303: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 304: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), @@ -363,48 +362,48 @@ var AMD64 = &kernel.SyscallTable{ 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), 308: syscalls.ErrorWithEvent("setns", linuxerr.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 310: syscalls.ErrorWithEvent("process_vm_readv", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 314: syscalls.ErrorWithEvent("sched_setattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) 317: syscalls.Supported("seccomp", Seccomp), 318: syscalls.Supported("getrandom", GetRandom), 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 322: syscalls.Supported("execveat", Execveat), - 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 323: syscalls.ErrorWithEvent("userfaultfd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls implemented after 325 are "backports" from versions // of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 326: syscalls.ErrorWithEvent("copy_file_range", linuxerr.ENOSYS, "", nil), 327: syscalls.Supported("preadv2", Preadv2), 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", linuxerr.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", linuxerr.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", linuxerr.ENOSYS, "", nil), 332: syscalls.Supported("statx", Statx), - 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 333: syscalls.ErrorWithEvent("io_pgetevents", linuxerr.ENOSYS, "", nil), 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + 424: syscalls.ErrorWithEvent("pidfd_send_signal", linuxerr.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", linuxerr.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", linuxerr.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", linuxerr.ENOSYS, "", nil), 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{ @@ -414,7 +413,7 @@ var AMD64 = &kernel.SyscallTable{ }, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS }, } @@ -472,7 +471,7 @@ var ARM64 = &kernel.SyscallTable{ 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), 41: syscalls.Error("pivot_root", linuxerr.EPERM, "", nil), - 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 42: syscalls.Error("nfsservctl", linuxerr.ENOSYS, "Removed after Linux 3.1.", nil), 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), 45: syscalls.Supported("truncate", Truncate), @@ -505,7 +504,7 @@ var ARM64 = &kernel.SyscallTable{ 72: syscalls.Supported("pselect", Pselect), 73: syscalls.Supported("ppoll", Ppoll), 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 75: syscalls.ErrorWithEvent("vmsplice", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 76: syscalls.Supported("splice", Splice), 77: syscalls.Supported("tee", Tee), 78: syscalls.Supported("readlinkat", Readlinkat), @@ -581,8 +580,8 @@ var ARM64 = &kernel.SyscallTable{ 148: syscalls.Supported("getresuid", Getresuid), 149: syscalls.Supported("setresgid", Setresgid), 150: syscalls.Supported("getresgid", Getresgid), - 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 151: syscalls.ErrorWithEvent("setfsuid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 152: syscalls.ErrorWithEvent("setfsgid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 153: syscalls.Supported("times", Times), 154: syscalls.Supported("setpgid", Setpgid), 155: syscalls.Supported("getpgid", Getpgid), @@ -610,14 +609,14 @@ var ARM64 = &kernel.SyscallTable{ 177: syscalls.Supported("getegid", Getegid), 178: syscalls.Supported("gettid", Gettid), 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 180: syscalls.ErrorWithEvent("mq_open", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 181: syscalls.ErrorWithEvent("mq_unlink", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 182: syscalls.ErrorWithEvent("mq_timedsend", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 183: syscalls.ErrorWithEvent("mq_timedreceive", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 184: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 185: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 186: syscalls.Supported("msgget", Msgget), - 187: syscalls.PartiallySupported("msgctl", Msgctl, "Only supports IPC_RMID option.", []string{"gvisor.dev/issue/135"}), + 187: syscalls.Supported("msgctl", Msgctl), 188: syscalls.Supported("msgrcv", Msgrcv), 189: syscalls.Supported("msgsnd", Msgsnd), 190: syscalls.Supported("semget", Semget), @@ -664,7 +663,7 @@ var ARM64 = &kernel.SyscallTable{ 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 234: syscalls.ErrorWithEvent("remap_file_pages", linuxerr.ENOSYS, "Deprecated since Linux 3.16.", nil), 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), @@ -676,60 +675,60 @@ var ARM64 = &kernel.SyscallTable{ 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), 260: syscalls.Supported("wait4", Wait4), 261: syscalls.Supported("prlimit64", Prlimit64), - 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 262: syscalls.ErrorWithEvent("fanotify_init", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 263: syscalls.ErrorWithEvent("fanotify_mark", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 264: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 265: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), 268: syscalls.ErrorWithEvent("setns", linuxerr.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 270: syscalls.ErrorWithEvent("process_vm_readv", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 271: syscalls.ErrorWithEvent("process_vm_writev", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 274: syscalls.ErrorWithEvent("sched_setattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 275: syscalls.ErrorWithEvent("sched_getattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 276: syscalls.ErrorWithEvent("renameat2", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) 277: syscalls.Supported("seccomp", Seccomp), 278: syscalls.Supported("getrandom", GetRandom), 279: syscalls.Supported("memfd_create", MemfdCreate), 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 281: syscalls.Supported("execveat", Execveat), - 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 282: syscalls.ErrorWithEvent("userfaultfd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 284 are "backports" from versions of Linux after 4.4. - 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 285: syscalls.ErrorWithEvent("copy_file_range", linuxerr.ENOSYS, "", nil), 286: syscalls.Supported("preadv2", Preadv2), 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", linuxerr.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", linuxerr.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", linuxerr.ENOSYS, "", nil), 291: syscalls.Supported("statx", Statx), - 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 292: syscalls.ErrorWithEvent("io_pgetevents", linuxerr.ENOSYS, "", nil), 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + 424: syscalls.ErrorWithEvent("pidfd_send_signal", linuxerr.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", linuxerr.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", linuxerr.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", linuxerr.ENOSYS, "", nil), 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{}, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS }, } diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 9dea78085..373948991 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and @@ -67,6 +66,6 @@ func copyInSigSetWithSize(t *kernel.Task, addr hostarch.Addr) (hostarch.Addr, ui maskSize := uint(hostarch.ByteOrder.Uint64(in[8:])) return maskAddr, maskSize, nil default: - return 0, 0, syserror.ENOSYS + return 0, 0, linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 4ce3430e2..2f00c3783 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" ) @@ -138,7 +138,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if count > 0 || linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return uintptr(count), nil, nil } - return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + return 0, nil, syserr.ConvertIntr(err, linuxerr.EINTR) } } @@ -216,7 +216,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) // It is not presently supported (ENOSYS indicates no support on this // architecture). func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // LINT.IfChange @@ -355,7 +355,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } cbAddr = hostarch.Addr(cbAddrP) default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // Copy in this callback. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index daa151bb4..6c807124c 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -22,7 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) @@ -109,7 +109,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func waitEpoll(t *kernel.Task, fd int32, eventsAddr hostarch.Addr, max int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { r, err := syscalls.WaitEpoll(t, fd, max, timeoutInNanos) if err != nil { - return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + return 0, nil, syserr.ConvertIntr(err, linuxerr.EINTR) } if len(r) != 0 { diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 3528d325f..e79b92fb6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -30,7 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // fileOpAt performs an operation on the second last component in the path. @@ -122,7 +122,7 @@ func copyInPath(t *kernel.Task, addr hostarch.Addr, allowEmpty bool) (path strin return "", false, err } if path == "" && !allowEmpty { - return "", false, syserror.ENOENT + return "", false, linuxerr.ENOENT } // If the path ends with a /, then checks must be enforced in various @@ -162,7 +162,7 @@ func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uin if fs.IsDir(d.Inode.StableAttr) { // Don't allow directories to be opened writable. if fileFlags.Write { - return syserror.EISDIR + return linuxerr.EISDIR } } else { // If O_DIRECTORY is set, but the file is not a directory, then fail. @@ -177,7 +177,7 @@ func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uin file, err := d.Inode.GetFile(t, d, fileFlags) if err != nil { - return syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } defer file.DecRef(t) @@ -215,7 +215,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod return err } if dirPath { - return syserror.ENOENT + return linuxerr.ENOENT } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { @@ -308,7 +308,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode return 0, err } if dirPath { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } fileFlags := linuxToFlags(flags) @@ -416,7 +416,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode // Create a new fs.File. newFile, err = found.Inode.GetFile(t, found, fileFlags) if err != nil { - return syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } defer newFile.DecRef(t) case linuxerr.Equals(linuxerr.ENOENT, err): @@ -795,7 +795,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall defer file.DecRef(t) err := file.Flush(t) - return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) + return 0, nil, handleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file) } // Dup implements linux syscall dup(2). @@ -1020,7 +1020,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } return 0, nil, nil @@ -1036,7 +1036,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } return 0, nil, nil @@ -1263,7 +1263,7 @@ func symlinkAt(t *kernel.Task, dirFD int32, newAddr hostarch.Addr, oldAddr hosta return err } if dirPath { - return syserror.ENOENT + return linuxerr.ENOENT } // The oldPath is copied in verbatim. This is because the symlink @@ -1273,7 +1273,7 @@ func symlinkAt(t *kernel.Task, dirFD int32, newAddr hostarch.Addr, oldAddr hosta return err } if oldPath == "" { - return syserror.ENOENT + return linuxerr.ENOENT } return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { @@ -1352,7 +1352,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 return err } if dirPath { - return syserror.ENOENT + return linuxerr.ENOENT } if allowEmpty && oldPath == "" { @@ -1439,7 +1439,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) @@ -1455,7 +1455,7 @@ func readlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, bufAddr hostarc return 0, err } if dirPath { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -1579,7 +1579,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { if fs.IsDir(d.Inode.StableAttr) { - return syserror.EISDIR + return linuxerr.EISDIR } // In contrast to open(O_TRUNC), truncate(2) is only valid for file // types. @@ -2131,7 +2131,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, linuxerr.ESPIPE } if fs.IsDir(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EISDIR + return 0, nil, linuxerr.EISDIR } if !fs.IsRegular(file.Dirent.Inode.StableAttr) { return 0, nil, linuxerr.ENODEV @@ -2189,7 +2189,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } case linux.LOCK_SH: @@ -2201,7 +2201,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } case linux.LOCK_UN: diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 717cec04d..bcdd7b633 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -23,7 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // futexWaitRestartBlock encapsulates the state required to restart futex(2) @@ -75,7 +75,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo } t.Futex().WaitComplete(w, t) - return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is @@ -103,7 +103,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add // The wait was unsuccessful for some reason other than interruption. Simply // forward the error. - if err != syserror.ErrInterrupted { + if err != linuxerr.ErrInterrupted { return 0, err } @@ -111,7 +111,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add // The wait duration was absolute, restart with the original arguments. if forever { - return 0, syserror.ERESTARTSYS + return 0, linuxerr.ERESTARTSYS } // The wait duration was relative, restart with the remaining duration. @@ -122,7 +122,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add val: val, mask: mask, }) - return 0, syserror.ERESTART_RESTARTBLOCK + return 0, linuxerr.ERESTART_RESTARTBLOCK } func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error { @@ -150,7 +150,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch. } t.Futex().WaitComplete(w, t) - return syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { @@ -280,11 +280,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS default: // We don't even know about this command. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 917717e31..9f7a5ae8a 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -83,7 +82,7 @@ func getdents(t *kernel.Task, fd int32, addr hostarch.Addr, size int, f func(*di ds := newDirentSerializer(f, w, t.Arch(), size) rerr := dir.Readdir(t, ds) - switch err := handleIOError(t, ds.Written() > 0, rerr, syserror.ERESTARTSYS, "getdents", dir); err { + switch err := handleIOError(t, ds.Written() > 0, rerr, linuxerr.ERESTARTSYS, "getdents", dir); err { case nil: dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) return uintptr(ds.Written()), nil diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index bf71a9af3..4a5712a29 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -49,7 +48,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } offset, serr := file.Seek(t, sw, offset) - err := handleIOError(t, false /* partialResult */, serr, syserror.ERESTARTSYS, "lseek", file) + err := handleIOError(t, false /* partialResult */, serr, linuxerr.ERESTARTSYS, "lseek", file) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index cee621791..7efd17d40 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // Brk implements linux syscall brk(2). @@ -211,7 +211,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca case linux.MADV_REMOVE: // These "suggestions" have application-visible side effects, so we // have to indicate that we don't support them. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS case linux.MADV_HWPOISON: // Only privileged processes are allowed to poison pages. return 0, nil, linuxerr.EPERM @@ -235,18 +235,18 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // rounded up to the next multiple of the page size." - mincore(2) la, ok := hostarch.Addr(length).RoundUp() if !ok { - return 0, nil, syserror.ENOMEM + return 0, nil, linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { - return 0, nil, syserror.ENOMEM + return 0, nil, linuxerr.ENOMEM } // Pretend that all mapped pages are "resident in core". mapped := t.MemoryManager().VirtualMemorySizeRange(ar) // "ENOMEM: addr to addr + length contained unmapped memory." if mapped != uint64(la) { - return 0, nil, syserror.ENOMEM + return 0, nil, linuxerr.ENOMEM } resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize)) _, err := t.CopyOutBytes(vec, resident) @@ -277,7 +277,7 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }) // MSync calls fsync, the same interrupt conversion rules apply, see // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // Mlock implements linux syscall mlock(2). diff --git a/pkg/sentry/syscalls/linux/sys_msgqueue.go b/pkg/sentry/syscalls/linux/sys_msgqueue.go index 5259ade90..60b989ee7 100644 --- a/pkg/sentry/syscalls/linux/sys_msgqueue.go +++ b/pkg/sentry/syscalls/linux/sys_msgqueue.go @@ -130,12 +130,63 @@ func receive(t *kernel.Task, id ipc.ID, mType int64, maxSize int64, msgCopy, wai func Msgctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) cmd := args[1].Int() + buf := args[2].Pointer() creds := auth.CredentialsFromContext(t) + r := t.IPCNamespace().MsgqueueRegistry() + switch cmd { + case linux.IPC_INFO: + info := r.IPCInfo(t) + _, err := info.CopyOut(t, buf) + return 0, nil, err + case linux.MSG_INFO: + msgInfo := r.MsgInfo(t) + _, err := msgInfo.CopyOut(t, buf) + return 0, nil, err case linux.IPC_RMID: - return 0, nil, t.IPCNamespace().MsgqueueRegistry().Remove(id, creds) + return 0, nil, r.Remove(id, creds) + } + + // Remaining commands use a queue. + queue, err := r.FindByID(id) + if err != nil { + return 0, nil, err + } + + switch cmd { + case linux.MSG_STAT: + // Technically, we should be treating id as "an index into the kernel's + // internal array that maintains information about all shared memory + // segments on the system". Since we don't track segments in an array, + // we'll just pretend the msqid is the index and do the same thing as + // IPC_STAT. Linux also uses the index as the msqid. + fallthrough + case linux.IPC_STAT: + stat, err := queue.Stat(t) + if err != nil { + return 0, nil, err + } + _, err = stat.CopyOut(t, buf) + return 0, nil, err + + case linux.MSG_STAT_ANY: + stat, err := queue.StatAny(t) + if err != nil { + return 0, nil, err + } + _, err = stat.CopyOut(t, buf) + return 0, nil, err + + case linux.IPC_SET: + var ds linux.MsqidDS + if _, err := ds.CopyIn(t, buf); err != nil { + return 0, nil, linuxerr.EINVAL + } + err := queue.Set(t, &ds) + return 0, nil, err + default: return 0, nil, linuxerr.EINVAL } diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index a80c84fcd..ee4dbbc64 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -25,7 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) @@ -185,7 +185,7 @@ func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration pfd[i].Events |= linux.POLLHUP | linux.POLLERR } remainingTimeout, n, err := pollBlock(t, pfd, timeout) - err = syserror.ConvertIntr(err, syserror.EINTR) + err = syserr.ConvertIntr(err, linuxerr.EINTR) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. @@ -295,7 +295,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // Do the syscall, then count the number of bits set. if _, _, err = pollBlock(t, pfd, timeout); err != nil { - return 0, syserror.ConvertIntr(err, syserror.EINTR) + return 0, syserr.ConvertIntr(err, linuxerr.EINTR) } // r, w, and e are currently event mask bitsets; unset bits corresponding @@ -411,7 +411,7 @@ func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duratio nfds: nfds, timeout: remainingTimeout, }) - return 0, syserror.ERESTART_RESTARTBLOCK + return 0, linuxerr.ERESTART_RESTARTBLOCK } return n, err } @@ -465,7 +465,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { - err = syserror.ERESTARTNOHAND + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -495,7 +495,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { - err = syserror.ERESTARTNOHAND + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -540,7 +540,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { - err = syserror.ERESTARTNOHAND + err = linuxerr.ERESTARTNOHAND } return n, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index b54a3a11f..18ea23913 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -72,7 +71,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "read", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "read", file) } // Readahead implements readahead(2). @@ -152,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pread64", file) } // Readv implements linux syscall readv(2). @@ -182,7 +181,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "readv", file) } // Preadv implements linux syscall preadv(2). @@ -223,7 +222,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv", file) } // Preadv2 implements linux syscall preadv2(2). @@ -281,17 +280,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if offset == -1 { n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { n, err := f.Readv(t, dst) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we read anything. f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) @@ -304,7 +303,7 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { var deadline ktime.Time if s, ok := f.FileOperations.(socket.Socket); ok { dl := s.RecvTimeout() - if dl < 0 && err == syserror.ErrWouldBlock { + if dl < 0 && err == linuxerr.ErrWouldBlock { return n, err } if dl > 0 { @@ -326,14 +325,14 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { // other than "would block". n, err = f.Readv(t, dst) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -351,7 +350,7 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { n, err := f.Preadv(t, dst, offset) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we read anything. f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) @@ -372,7 +371,7 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i // other than "would block". n, err = f.Preadv(t, dst, offset+total) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index a12e1c915..7210333d2 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" ) // rlimit describes an implementation of 'struct rlimit', which may vary from @@ -44,7 +43,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) { // On 64-bit system, struct rlimit and struct rlimit64 are identical. return &rlimit64{}, nil default: - return nil, syserror.ENOSYS + return nil, linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go index 5fe196647..8328a3742 100644 --- a/pkg/sentry/syscalls/linux/sys_rseq.go +++ b/pkg/sentry/syscalls/linux/sys_rseq.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // RSeq implements syscall rseq(2). @@ -33,7 +32,7 @@ func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Event for applications that want rseq on a configuration // that doesn't support them. t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } switch flags { diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index f61cc466c..5a119b21c 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" @@ -166,8 +165,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) - return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + return 0, nil, ipcSet(t, id, &s) case linux.GETPID: v, err := getPID(t, id, num) @@ -243,24 +241,13 @@ func remove(t *kernel.Task, id ipc.ID) error { return r.Remove(id, creds) } -func ipcSet(t *kernel.Task, id ipc.ID, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { +func ipcSet(t *kernel.Task, id ipc.ID, ds *linux.SemidDS) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return linuxerr.EINVAL } - - creds := auth.CredentialsFromContext(t) - kuid := creds.UserNamespace.MapToKUID(uid) - if !kuid.Ok() { - return linuxerr.EINVAL - } - kgid := creds.UserNamespace.MapToKGID(gid) - if !kgid.Ok() { - return linuxerr.EINVAL - } - owner := fs.FileOwner{UID: kuid, GID: kgid} - return set.Change(t, creds, owner, perms) + return set.Set(t, ds) } func ipcStat(t *kernel.Task, id ipc.ID) (*linux.SemidDS, error) { diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 45608f3fa..03871d713 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -25,7 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // "For a process to have permission to send a signal it must @@ -348,7 +348,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Pause implements linux syscall pause(2). func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) + return 0, nil, syserr.ConvertIntr(t.Block(nil), linuxerr.ERESTARTNOHAND) } // RtSigpending implements linux syscall rt_sigpending(2). @@ -496,7 +496,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. t.SetSavedSignalMask(oldmask) // Perform the wait. - return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) + return 0, nil, syserr.ConvertIntr(t.Block(nil), linuxerr.ERESTARTNOHAND) } // RestartSyscall implements the linux syscall restart_syscall(2). @@ -512,7 +512,7 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // function is never null by (re)initializing it with one that translates // the restart into EINTR. We'll emulate that behaviour. t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } // sharedSignalfd is shared between the two calls. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 06eb8f319..50ddbc142 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -30,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -260,7 +259,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -270,7 +269,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } blocking := !file.Flags().NonBlocking - return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(s.Connect(t, a, blocking).ToError(), linuxerr.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept @@ -291,7 +290,7 @@ func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -301,7 +300,7 @@ func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't @@ -350,7 +349,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -377,7 +376,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if backlog > maxListenBacklog { @@ -415,7 +414,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Validate how, then call syscall implementation. @@ -446,7 +445,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Read the length. Reject negative values. @@ -527,7 +526,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if optLen < 0 { @@ -565,7 +564,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -593,7 +592,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -626,7 +625,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. @@ -683,7 +682,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if file.Flags().NonBlocking { @@ -763,7 +762,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { - return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(err.ToError(), linuxerr.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC @@ -785,7 +784,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } defer cms.Release(t) @@ -848,7 +847,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, fla // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if file.Flags().NonBlocking { @@ -874,7 +873,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, fla n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Release(t) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } // Copy the address to the caller. @@ -921,7 +920,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. @@ -963,7 +962,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. @@ -1060,7 +1059,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) - err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) + err = handleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendmsg", file) // Control messages should be released on error as well as for zero-length // messages, which are discarded by the receiver. if n == 0 || err != nil { @@ -1087,7 +1086,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if file.Flags().NonBlocking { @@ -1122,7 +1121,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) - return uintptr(n), handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) + return uintptr(n), handleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 34d87ac1f..8c8847efa 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -46,9 +45,9 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB for { n, err = fs.Splice(t, outFile, inFile, opts) - if n != 0 || err != syserror.ErrWouldBlock { + if n != 0 || err != linuxerr.ErrWouldBlock { break - } else if err == syserror.ErrWouldBlock && nonBlocking { + } else if err == linuxerr.ErrWouldBlock && nonBlocking { break } @@ -177,7 +176,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We can only pass a single file to handleIOError, so pick inFile // arbitrarily. This is used only for debugging purposes. - return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "sendfile", inFile) + return uintptr(n), nil, handleIOError(t, false, err, linuxerr.ERESTARTSYS, "sendfile", inFile) } // Splice implements splice(2). @@ -287,7 +286,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "splice", inFile) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", inFile) } // Tee imlements tee(2). @@ -340,5 +339,5 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "tee", inFile) + return uintptr(n), nil, handleIOError(t, false, err, linuxerr.ERESTARTSYS, "tee", inFile) } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 6278bef21..0c22599bf 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // LINT.IfChange @@ -58,7 +58,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // Fdatasync implements linux syscall fdatasync(2). @@ -74,7 +74,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // SyncFileRange implements linux syscall sync_file_rage(2) @@ -112,7 +112,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if uflags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // SYNC_FILE_RANGE_WRITE initiates write-out of all dirty pages in the @@ -137,7 +137,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData) } - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index ba372f9e3..15acb2b8b 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -57,6 +56,6 @@ func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case _SYSLOG_ACTION_SIZE_BUFFER: return logBufLen, nil, nil default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 981cdd985..d74173c56 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -111,7 +110,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host } atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 if !atEmptyPath && len(pathname) == 0 { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 @@ -244,7 +243,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.Events |= kernel.EventGroupContinue } if options&linux.WNOHANG == 0 { - wopts.BlockInterruptErr = syserror.ERESTARTSYS + wopts.BlockInterruptErr = linuxerr.ERESTARTSYS } if options&linux.WNOTHREAD == 0 { wopts.SiblingChildren = true diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 674e74f82..4adc8b8a4 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // The most significant 29 bits hold either a pid or a file descriptor. @@ -214,7 +213,7 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem host case linuxerr.Equals(linuxerr.ETIMEDOUT, err): // Slept for entire timeout. return nil - case err == syserror.ErrInterrupted: + case err == linuxerr.ErrInterrupted: // Interrupted. remaining := end.Sub(c.Now()) if remaining <= 0 { @@ -235,9 +234,9 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem host end: end, rem: rem, }) - return syserror.ERESTART_RESTARTBLOCK + return linuxerr.ERESTART_RESTARTBLOCK } - return syserror.ERESTARTNOHAND + return linuxerr.ERESTARTNOHAND default: panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 45eef4feb..d39a0a6f5 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -18,9 +18,9 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const nsecPerSec = int64(time.Second) @@ -29,7 +29,7 @@ const nsecPerSec = int64(time.Second) func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if t.Arch().Width() != 8 { // Definition of linux.ItimerVal assumes 64-bit architecture. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } timerID := args[0].Int() @@ -51,7 +51,7 @@ func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if t.Arch().Width() != 8 { // Definition of linux.ItimerVal assumes 64-bit architecture. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } timerID := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index 8c6cd7511..bde672d67 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl implements linux syscall arch_prctl(2). @@ -39,7 +38,7 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } case linux.ARCH_SET_FS: fsbase := args[1].Uint64() diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go index ff4ac4d6d..dfa684387 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go @@ -18,12 +18,12 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl is not defined for ARM64. func ArchPrctl(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 872168606..4a4ef5046 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -72,7 +71,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "write", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "write", file) } // Pwrite64 implements linux syscall pwrite64(2). @@ -119,7 +118,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwrite64", file) } // Writev implements linux syscall writev(2). @@ -149,7 +148,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "writev", file) } // Pwritev implements linux syscall pwritev(2). @@ -190,7 +189,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements linux syscall pwritev2(2). @@ -251,17 +250,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if offset == -1 { n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { n, err := f.Writev(t, src) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we wrote anything. f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) @@ -274,7 +273,7 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { var deadline ktime.Time if s, ok := f.FileOperations.(socket.Socket); ok { dl := s.SendTimeout() - if dl < 0 && err == syserror.ErrWouldBlock { + if dl < 0 && err == linuxerr.ErrWouldBlock { return n, err } if dl > 0 { @@ -296,14 +295,14 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { // anything other than "would block". n, err = f.Writev(t, src) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -321,7 +320,7 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) { n, err := f.Pwritev(t, src, offset) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we wrote anything. f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) @@ -342,7 +341,7 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) ( // anything other than "would block". n, err = f.Pwritev(t, src, offset+total) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index b327e27d6..d90652a3f 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // copyTimespecIn copies a Timespec from the untrusted app range to the kernel. @@ -38,7 +37,7 @@ func copyTimespecIn(t *kernel.Task, addr hostarch.Addr) (linux.Timespec, error) ts.Nsec = int64(hostarch.ByteOrder.Uint64(in[8:])) return ts, nil default: - return linux.Timespec{}, syserror.ENOSYS + return linux.Timespec{}, linuxerr.ENOSYS } } @@ -52,7 +51,7 @@ func copyTimespecOut(t *kernel.Task, addr hostarch.Addr, ts *linux.Timespec) err _, err := t.CopyOutBytes(addr, out) return err default: - return syserror.ENOSYS + return linuxerr.ENOSYS } } @@ -70,7 +69,7 @@ func copyTimevalIn(t *kernel.Task, addr hostarch.Addr) (linux.Timeval, error) { tv.Usec = int64(hostarch.ByteOrder.Uint64(in[8:])) return tv, nil default: - return linux.Timeval{}, syserror.ENOSYS + return linux.Timeval{}, linuxerr.ENOSYS } } @@ -84,7 +83,7 @@ func copyTimevalOut(t *kernel.Task, addr hostarch.Addr, tv *linux.Timeval) error _, err := t.CopyOutBytes(addr, out) return err default: - return syserror.ENOSYS + return linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index a73f096ff..1e3bd2a50 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -73,7 +73,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index a8fa86cdc..0b57c0f7c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -56,7 +55,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } cbAddr = hostarch.Addr(cbAddrP) default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // Copy in this callback. diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index 38818c175..fcf2e25de 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/loader" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Execve implements linux syscall execve(2). @@ -83,7 +82,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr host // do_open_execat(fd=AT_FDCWD)), and the loader package is currently // incapable of handling this correctly. if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) if dirfile == nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 2cfb12cad..2198aa065 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Close implements Linux syscall close(2). @@ -42,7 +41,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall defer file.DecRef(t) err := file.OnClose(t) - return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) + return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, linuxerr.EINTR, "close", file) } // Dup implements Linux syscall dup(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 534355237..f19f0fd41 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Link implements Linux syscall link(2). @@ -46,7 +45,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd return linuxerr.EINVAL } if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { - return syserror.ENOENT + return linuxerr.ENOENT } oldpath, err := copyInPath(t, oldpathAddr) @@ -320,7 +319,7 @@ func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpat return err } if len(target) == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } linkpath, err := copyInPath(t, linkpathAddr) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 2bb783a85..38796d4db 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func copyInPath(t *kernel.Task, addr hostarch.Addr) (fspath.Path, error) { @@ -44,7 +43,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { root.DecRef(t) - return taskPathOperation{}, syserror.ENOENT + return taskPathOperation{}, linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index 042aa4c97..204051cd0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -20,15 +20,14 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" - - "gvisor.dev/gvisor/pkg/hostarch" ) // fileCap is the maximum allowable files for poll & select. This has no @@ -189,7 +188,7 @@ func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration pfd[i].Events |= linux.POLLHUP | linux.POLLERR } remainingTimeout, n, err := pollBlock(t, pfd, timeout) - err = syserror.ConvertIntr(err, syserror.EINTR) + err = syserr.ConvertIntr(err, linuxerr.EINTR) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. @@ -299,7 +298,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // Do the syscall, then count the number of bits set. if _, _, err = pollBlock(t, pfd, timeout); err != nil { - return 0, syserror.ConvertIntr(err, syserror.EINTR) + return 0, syserr.ConvertIntr(err, linuxerr.EINTR) } // r, w, and e are currently event mask bitsets; unset bits corresponding @@ -417,7 +416,7 @@ func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duratio nfds: nfds, timeout: remainingTimeout, }) - return 0, syserror.ERESTART_RESTARTBLOCK + return 0, linuxerr.ERESTART_RESTARTBLOCK } return n, err } @@ -464,7 +463,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { - err = syserror.ERESTARTNOHAND + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -494,7 +493,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { - err = syserror.ERESTARTNOHAND + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -541,7 +540,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { - err = syserror.ERESTARTNOHAND + err = linuxerr.ERESTARTNOHAND } return n, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index fe8aa06da..4e7dc5080 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -63,7 +62,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "read", file) } // Readv implements Linux syscall readv(2). @@ -88,12 +87,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "readv", file) } func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -115,14 +114,14 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt // "would block". n, err = file.Read(t, dst, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -166,7 +165,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pread64", file) } // Preadv implements Linux syscall preadv(2). @@ -197,7 +196,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv", file) } // Preadv2 implements Linux syscall preadv2(2). @@ -243,12 +242,12 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err = pread(t, file, dst, offset, opts) } t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -270,14 +269,14 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of // "would block". n, err = file.PRead(t, dst, offset+total, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -314,7 +313,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "write", file) } // Writev implements Linux syscall writev(2). @@ -339,12 +338,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "writev", file) } func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -366,14 +365,14 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op // "would block". n, err = file.Write(t, src, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } @@ -416,7 +415,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pwrite64", file) } // Pwritev implements Linux syscall pwritev(2). @@ -447,7 +446,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements Linux syscall pwritev2(2). @@ -493,12 +492,12 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err = pwrite(t, file, src, offset, opts) } t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -520,14 +519,14 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o // "would block". n, err = file.PWrite(t, src, offset+total, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } break } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index b5a3b92c5..e608572b4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX @@ -432,7 +431,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { - return syserror.ENOENT + return linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() @@ -465,7 +464,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } func handleSetSizeError(t *kernel.Task, err error) error { - if err == syserror.ErrExceedsFileSizeLimit { + if err == linuxerr.ErrExceedsFileSizeLimit { // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) return linuxerr.EFBIG diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 0c2e0720b..48be5a88d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -30,10 +31,7 @@ import ( slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) // maxAddrLen is the maximum socket address length we're willing to accept. @@ -264,7 +262,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -274,7 +272,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 - return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(s.Connect(t, a, blocking).ToError(), linuxerr.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept @@ -295,7 +293,7 @@ func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -305,7 +303,7 @@ func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't @@ -354,7 +352,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -381,7 +379,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if backlog > maxListenBacklog { @@ -419,7 +417,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Validate how, then call syscall implementation. @@ -450,7 +448,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Read the length. Reject negative values. @@ -531,7 +529,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if optLen < 0 { @@ -569,7 +567,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -597,7 +595,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -630,7 +628,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. @@ -687,7 +685,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -767,7 +765,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { - return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(err.ToError(), linuxerr.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC @@ -789,7 +787,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } defer cms.Release(t) @@ -852,7 +850,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, fla // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -878,7 +876,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, fla n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Release(t) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } // Copy the address to the caller. @@ -925,7 +923,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. @@ -967,7 +965,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. @@ -1064,7 +1062,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) - err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) + err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendmsg", file) // Control messages should be released on error as well as for zero-length // messages, which are discarded by the receiver. if n == 0 || err != nil { @@ -1091,7 +1089,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -1126,7 +1124,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) - return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) + return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index d8009123f..0205f09e0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -151,7 +150,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal panic("at least one end of splice must be a pipe") } - if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + if n != 0 || err != linuxerr.ErrWouldBlock || nonBlock { break } if err = dw.waitForBoth(t); err != nil { @@ -173,7 +172,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", outFile) } // Tee implements Linux syscall tee(2). @@ -241,7 +240,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo defer dw.destroy() for { n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) - if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + if n != 0 || err != linuxerr.ErrWouldBlock || nonBlock { break } if err = dw.waitForBoth(t); err != nil { @@ -251,7 +250,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if n != 0 { // If a partial write is completed, the error is dropped. Log it here. - if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + if err != nil && err != io.EOF && err != linuxerr.ErrWouldBlock { log.Debugf("tee completed a partial write with error: %v", err) err = nil } @@ -259,7 +258,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "tee", inFile) } // Sendfile implements linux system call sendfile(2). @@ -360,10 +359,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc break } if err == nil && t.Interrupted() { - err = syserror.ErrInterrupted + err = linuxerr.ErrInterrupted break } - if err == syserror.ErrWouldBlock && !nonBlock { + if err == linuxerr.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } if err != nil { @@ -389,7 +388,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc var writeN int64 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) wbuf = wbuf[writeN:] - if err == syserror.ErrWouldBlock && !nonBlock { + if err == linuxerr.ErrWouldBlock && !nonBlock { err = dw.waitForOut(t) } if err != nil { @@ -420,10 +419,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc break } if err == nil && t.Interrupted() { - err = syserror.ErrInterrupted + err = linuxerr.ErrInterrupted break } - if err == syserror.ErrWouldBlock && !nonBlock { + if err == linuxerr.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } if err != nil { @@ -441,7 +440,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } if total != 0 { - if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + if err != nil && err != io.EOF && err != linuxerr.ErrWouldBlock { // If a partial write is completed, the error is dropped. Log it here. log.Debugf("sendfile completed a partial write with error: %v", err) err = nil @@ -450,7 +449,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) + return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, linuxerr.ERESTARTSYS, "sendfile", inFile) } // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index ba1d30823..adaf8db3f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Stat implements Linux syscall stat(2). @@ -70,7 +69,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flag start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() @@ -182,7 +181,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index d0ffc7c32..cfc693422 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // Sync implements Linux syscall sync(2). @@ -108,12 +108,12 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { if err := file.Sync(t); err != nil { - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } } return 0, nil, nil diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 511fb8b28..cfcc21271 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -31,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Supported returns a syscall that is fully supported. @@ -103,10 +102,10 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne return 0, nil, linuxerr.EPERM } t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, linuxerr.EPERM, c.String(), syserror.ENOSYS), + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, linuxerr.EPERM, c.String(), linuxerr.ENOSYS), URLs: urls, } } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 36d999c47..c21971322 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -39,7 +39,6 @@ go_library( "//pkg/log", "//pkg/metric", "//pkg/sync", - "//pkg/syserror", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/time/sampler_arm64.go b/pkg/sentry/time/sampler_arm64.go index 3560e66ae..9b8c9a480 100644 --- a/pkg/sentry/time/sampler_arm64.go +++ b/pkg/sentry/time/sampler_arm64.go @@ -30,9 +30,9 @@ func getDefaultArchOverheadCycles() TSCValue { // frqRatio. defaultOverheadCycles of ARM equals to that on // x86 devided by frqRatio cntfrq := getCNTFRQ() - frqRatio := 1000000000 / cntfrq + frqRatio := 1000000000 / float64(cntfrq) overheadCycles := (1 * 1000) / frqRatio - return overheadCycles + return TSCValue(overheadCycles) } // defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index a2032162d..914574543 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -116,7 +116,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -137,7 +136,6 @@ go_test( "//pkg/errors/linuxerr", "//pkg/sentry/contexttest", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 5aad31b78..82ee2c521 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -1,9 +1,5 @@ # The gVisor Virtual Filesystem -THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION -USE. For the filesystem implementation currently used by gVisor, see the `fs` -package. - ## Implementation Notes ### Reference Counting diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 23ccc6b66..fefd0fc9c 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -263,7 +262,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event num: num, }] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } // Update epi for the next call to ep.ReadEvents(). @@ -299,7 +298,7 @@ func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error num: num, }] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } // Unregister from the file so that epi will no longer be readied. diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index a875fdeca..452f5f1f9 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -17,6 +17,7 @@ package vfs import ( "bytes" "io" + "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -25,7 +26,6 @@ import ( fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -56,7 +56,7 @@ func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { // StatFS implements FileDescriptionImpl.StatFS analogously to // super_operations::statfs == NULL in Linux. func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { - return linux.Statfs{}, syserror.ENOSYS + return linux.Statfs{}, linuxerr.ENOSYS } // Allocate implements FileDescriptionImpl.Allocate analogously to @@ -175,27 +175,27 @@ type DirectoryFileDescriptionDefaultImpl struct{} // Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Read implements FileDescriptionImpl.Read. func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // PWrite implements FileDescriptionImpl.PWrite. func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements FileDescriptionImpl.Write. func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // DentryMetadataFileDescriptionImpl may be embedded by implementations of @@ -368,7 +368,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { - return 0, syserror.EIO + return 0, linuxerr.EIO } n, err := writable.Write(ctx, src, offset) if err != nil { @@ -400,6 +400,9 @@ func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src userme // GenericConfigureMMap may be used by most implementations of // FileDescriptionImpl.ConfigureMMap. func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { + if opts.Offset+opts.Length > math.MaxInt64 { + return linuxerr.EOVERFLOW + } opts.Mappable = m opts.MappingIdentity = fd fd.IncRef() diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3423dede1..e34a8c11b 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -157,10 +156,10 @@ func TestGenCountFD(t *testing.T) { // Write and PWrite fails. if _, err := fd.Write(ctx, ioseq, WriteOptions{}); !linuxerr.Equals(linuxerr.EIO, err) { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) + t.Errorf("Write: got err %v, wanted %v", err, linuxerr.EIO) } if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); !linuxerr.Equals(linuxerr.EIO, err) { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) + t.Errorf("Write: got err %v, wanted %v", err, linuxerr.EIO) } } diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 088beb8e2..17d94b341 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -209,7 +208,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt if i.events.Empty() { // Nothing to read yet, tell caller to block. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } var writeLen int64 diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go index cbe4d8c2d..1853cdca0 100644 --- a/pkg/sentry/vfs/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -17,8 +17,8 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/syserror" ) // FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) @@ -47,9 +47,9 @@ func (fl *FileLocks) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerID i // Return an appropriate error for the unsuccessful lock attempt, depending on // whether this is a blocking or non-blocking operation. if block == nil { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } // UnlockBSD releases a BSD-style lock on the entire file. @@ -69,9 +69,9 @@ func (fl *FileLocks) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPI // Return an appropriate error for the unsuccessful lock attempt, depending on // whether this is a blocking or non-blocking operation. if block == nil { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } // UnlockPOSIX releases a POSIX-style lock on a file region. diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 4d6b59a26..05a416775 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem @@ -225,7 +224,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr vdDentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef(ctx) - return syserror.ENOENT + return linuxerr.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and // vfs.mountMu.Lock(). diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index e4da15009..7cc68a157 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -16,9 +16,9 @@ package vfs import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) var fspathBuilderPool = sync.Pool{ @@ -137,7 +137,7 @@ loop: // Linux's sys_getcwd(). func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { if vd.dentry.IsDead() { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } b := getFSPathBuilder() diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 4744514bd..953d31876 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" ) // AccessTypes is a bitmask of Unix file permissions. @@ -195,7 +194,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt return err } if limit < int64(stat.Size) { - return syserror.ErrExceedsFileSizeLimit + return linuxerr.ErrExceedsFileSizeLimit } } if stat.Mask&linux.STATX_MODE != 0 { @@ -282,7 +281,7 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { return size, nil } if offset >= int64(fileSizeLimit) { - return 0, syserror.ErrExceedsFileSizeLimit + return 0, linuxerr.ErrExceedsFileSizeLimit } remaining := int64(fileSizeLimit) - offset if remaining < size { diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 6f58f33ce..40aff2927 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // ResolvingPath represents the state of an in-progress path resolution, shared @@ -224,6 +223,12 @@ func (rp *ResolvingPath) Final() bool { return rp.curPart == 0 && !rp.pit.NextOk() } +// Pit returns a copy of rp's current path iterator. Modifying the iterator +// does not change rp. +func (rp *ResolvingPath) Pit() fspath.Iterator { + return rp.pit +} + // Component returns the current path component in the stream represented by // rp. // @@ -331,7 +336,7 @@ func (rp *ResolvingPath) HandleSymlink(target string) error { return linuxerr.ELOOP } if len(target) == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } rp.symlinks++ targetPath := fspath.Parse(target) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index eb3c60610..1b2a668c0 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -48,7 +48,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. @@ -281,7 +280,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential if newpop.Path.Absolute { return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if newpop.FollowFinalSymlink { oldVD.DecRef(ctx) @@ -318,7 +317,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia if pop.Path.Absolute { return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") @@ -348,7 +347,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia } // MknodAt creates a file of the given mode at the given path. It returns an -// error from the syserror package. +// error from the linuxerr package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. @@ -356,7 +355,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia if pop.Path.Absolute { return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") @@ -494,7 +493,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti if oldpop.Path.Absolute { return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if oldpop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") @@ -515,7 +514,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti if newpop.Path.Absolute { return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if newpop.FollowFinalSymlink { oldParentVD.DecRef(ctx) @@ -556,7 +555,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia if pop.Path.Absolute { return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") @@ -639,7 +638,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent if pop.Path.Absolute { return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") @@ -673,7 +672,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti if pop.Path.Absolute { return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") |