Diffstat (limited to 'pkg/sentry')
-rw-r--r--  pkg/sentry/devices/quotedev/BUILD | 16
-rw-r--r--  pkg/sentry/devices/quotedev/quotedev.go | 52
-rw-r--r--  pkg/sentry/fs/gofer/BUILD | 1
-rw-r--r--  pkg/sentry/fs/gofer/file.go | 3
-rw-r--r--  pkg/sentry/fs/proc/sys_net.go | 16
-rw-r--r--  pkg/sentry/fs/proc/sys_net_state.go | 6
-rw-r--r--  pkg/sentry/fsimpl/cgroupfs/base.go | 6
-rw-r--r--  pkg/sentry/fsimpl/cgroupfs/cgroupfs.go | 11
-rw-r--r--  pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/BUILD | 1
-rw-r--r--  pkg/sentry/fsimpl/gofer/BUILD | 2
-rw-r--r--  pkg/sentry/fsimpl/gofer/filesystem.go | 331
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer.go | 6
-rw-r--r--  pkg/sentry/fsimpl/gofer/p9file.go | 7
-rw-r--r--  pkg/sentry/fsimpl/gofer/regular_file.go | 3
-rw-r--r--  pkg/sentry/fsimpl/gofer/revalidate.go | 386
-rw-r--r--  pkg/sentry/fsimpl/gofer/special_file.go | 3
-rw-r--r--  pkg/sentry/fsimpl/kernfs/filesystem.go | 12
-rw-r--r--  pkg/sentry/fsimpl/kernfs/kernfs.go | 6
-rw-r--r--  pkg/sentry/fsimpl/proc/task_fds.go | 2
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks.go | 1
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_files.go | 39
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_sys.go | 18
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_sys_test.go | 2
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_test.go | 1
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/filesystem.go | 2
-rw-r--r--  pkg/sentry/fsimpl/verity/filesystem.go | 42
-rw-r--r--  pkg/sentry/fsimpl/verity/verity.go | 44
-rw-r--r--  pkg/sentry/fsmetric/fsmetric.go | 1
-rw-r--r--  pkg/sentry/inet/inet.go | 3
-rw-r--r--  pkg/sentry/inet/test_stack.go | 5
-rw-r--r--  pkg/sentry/kernel/cgroup.go | 42
-rw-r--r--  pkg/sentry/kernel/fd_table.go | 8
-rw-r--r--  pkg/sentry/kernel/task_syscall.go | 3
-rw-r--r--  pkg/sentry/loader/BUILD | 3
-rw-r--r--  pkg/sentry/loader/elf.go | 14
-rw-r--r--  pkg/sentry/platform/kvm/machine.go | 15
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64.go | 26
-rw-r--r--  pkg/sentry/platform/kvm/machine_arm64.go | 36
-rw-r--r--  pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 67
-rw-r--r--  pkg/sentry/socket/BUILD | 1
-rw-r--r--  pkg/sentry/socket/control/BUILD | 4
-rw-r--r--  pkg/sentry/socket/control/control.go | 66
-rw-r--r--  pkg/sentry/socket/hostinet/BUILD | 3
-rw-r--r--  pkg/sentry/socket/hostinet/socket.go | 23
-rw-r--r--  pkg/sentry/socket/hostinet/stack.go | 53
-rw-r--r--  pkg/sentry/socket/netfilter/BUILD | 4
-rw-r--r--  pkg/sentry/socket/netfilter/extensions.go | 13
-rw-r--r--  pkg/sentry/socket/netfilter/ipv4.go | 7
-rw-r--r--  pkg/sentry/socket/netfilter/ipv6.go | 7
-rw-r--r--  pkg/sentry/socket/netfilter/netfilter.go | 11
-rw-r--r--  pkg/sentry/socket/netfilter/owner_matcher.go | 9
-rw-r--r--  pkg/sentry/socket/netfilter/targets.go | 40
-rw-r--r--  pkg/sentry/socket/netfilter/tcp_matcher.go | 8
-rw-r--r--  pkg/sentry/socket/netfilter/udp_matcher.go | 8
-rw-r--r--  pkg/sentry/socket/netlink/BUILD | 4
-rw-r--r--  pkg/sentry/socket/netlink/message.go | 40
-rw-r--r--  pkg/sentry/socket/netlink/message_test.go | 18
-rw-r--r--  pkg/sentry/socket/netlink/route/BUILD | 1
-rw-r--r--  pkg/sentry/socket/netlink/route/protocol.go | 30
-rw-r--r--  pkg/sentry/socket/netlink/socket.go | 23
-rw-r--r--  pkg/sentry/socket/netstack/BUILD | 1
-rw-r--r--  pkg/sentry/socket/netstack/netstack.go | 51
-rw-r--r--  pkg/sentry/socket/netstack/stack.go | 17
-rw-r--r--  pkg/sentry/socket/socket.go | 17
-rw-r--r--  pkg/sentry/strace/BUILD | 1
-rw-r--r--  pkg/sentry/strace/linux64_amd64.go | 1
-rw-r--r--  pkg/sentry/strace/linux64_arm64.go | 1
-rw-r--r--  pkg/sentry/strace/socket.go | 32
-rw-r--r--  pkg/sentry/syscalls/epoll.go | 8
-rw-r--r--  pkg/sentry/syscalls/linux/error.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/linux64.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/sys_epoll.go | 56
-rw-r--r--  pkg/sentry/syscalls/linux/sys_socket.go | 3
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/epoll.go | 52
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/socket.go | 3
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2
-rw-r--r--  pkg/sentry/time/calibrated_clock.go | 8
-rw-r--r--  pkg/sentry/vfs/file_description.go | 22
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util.go | 66
-rw-r--r--  pkg/sentry/vfs/opath.go | 42
-rw-r--r--  pkg/sentry/vfs/resolving_path.go | 84
-rw-r--r--  pkg/sentry/vfs/vfs.go | 83
-rw-r--r--  pkg/sentry/watchdog/watchdog.go | 4
84 files changed, 1412 insertions, 765 deletions
diff --git a/pkg/sentry/devices/quotedev/BUILD b/pkg/sentry/devices/quotedev/BUILD
new file mode 100644
index 000000000..d09214e3e
--- /dev/null
+++ b/pkg/sentry/devices/quotedev/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "quotedev",
+ srcs = ["quotedev.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/devices/quotedev/quotedev.go b/pkg/sentry/devices/quotedev/quotedev.go
new file mode 100644
index 000000000..6114cb724
--- /dev/null
+++ b/pkg/sentry/devices/quotedev/quotedev.go
@@ -0,0 +1,52 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package quotedev implements a vfs.Device for /dev/gvisor_quote.
+package quotedev
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ quoteDevMinor = 0
+)
+
+// quoteDevice implements vfs.Device for /dev/gvisor_quote
+//
+// +stateify savable
+type quoteDevice struct{}
+
+// Open implements vfs.Device.Open.
+// TODO(b/157161182): Add support for attestation ioctls.
+func (quoteDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ return nil, syserror.EIO
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ return vfsObj.RegisterDevice(vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, quoteDevice{}, &vfs.RegisterDeviceOptions{
+ GroupName: "gvisor_quote",
+ })
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+ return dev.CreateDeviceFile(ctx, "gvisor_quote", vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, 0666 /* mode */)
+}
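For context, a minimal sketch of how this package might be wired into sentry startup. The helper name and call site are assumptions; only quotedev.Register and quotedev.CreateDevtmpfsFiles come from the file above.

package boot // hypothetical call site; the real wiring lives elsewhere in runsc

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/devices/quotedev"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// registerQuoteDevice registers /dev/gvisor_quote with the VFS and creates its
// device node in devtmpfs. Opens currently fail with EIO until the attestation
// ioctls are implemented.
func registerQuoteDevice(ctx context.Context, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
	if err := quotedev.Register(vfsObj); err != nil {
		return err
	}
	return quotedev.CreateDevtmpfsFiles(ctx, a)
}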
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index c4a069832..94cb05246 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -29,6 +29,7 @@ go_library(
"//pkg/fd",
"//pkg/hostarch",
"//pkg/log",
+ "//pkg/metric",
"//pkg/p9",
"//pkg/refs",
"//pkg/safemem",
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index 8f5a87120..819e140bc 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -21,6 +21,7 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -91,7 +92,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF
}
if flags.Write {
if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil {
- fsmetric.GoferOpensWX.Increment()
+ metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file")
log.Warningf("Opened a writable executable: %q", name)
}
}
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 1d09afdd7..4893af56b 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -403,7 +403,7 @@ type ipForwarding struct {
// enabled stores the IPv4 forwarding state on save.
// We must save/restore this here, since a netstack instance
// is created on restore.
- enabled *bool
+ enabled bool
}
func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
@@ -461,13 +461,8 @@ func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
return 0, io.EOF
}
- if f.ipf.enabled == nil {
- enabled := f.stack.Forwarding(ipv4.ProtocolNumber)
- f.ipf.enabled = &enabled
- }
-
val := "0\n"
- if *f.ipf.enabled {
+ if f.ipf.enabled {
// Technically, this is not quite compatible with Linux. Linux
// stores these as an integer, so if you write "2" into
// ip_forward, you should get 2 back.
@@ -494,11 +489,8 @@ func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IO
if err != nil {
return n, err
}
- if f.ipf.enabled == nil {
- f.ipf.enabled = new(bool)
- }
- *f.ipf.enabled = v != 0
- return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled)
+ f.ipf.enabled = v != 0
+ return n, f.stack.SetForwarding(ipv4.ProtocolNumber, f.ipf.enabled)
}
// portRangeInode implements fs.InodeOperations. It provides and allows
diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go
index 4cb4741af..51d2be647 100644
--- a/pkg/sentry/fs/proc/sys_net_state.go
+++ b/pkg/sentry/fs/proc/sys_net_state.go
@@ -47,9 +47,7 @@ func (s *tcpSack) afterLoad() {
// afterLoad is invoked by stateify.
func (ipf *ipForwarding) afterLoad() {
- if ipf.enabled != nil {
- if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
- panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err))
- }
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil {
+ panic(fmt.Sprintf("ipf.stack.SetForwarding(%d, %t): %s", ipv4.ProtocolNumber, ipf.enabled, err))
}
}
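With enabled now a plain bool, the live forwarding state has to be captured before a save. A minimal sketch of the matching save-side hook this implies, assuming a stateify beforeSave hook on ipForwarding that mirrors the afterLoad above:

// beforeSave snapshots the current IPv4 forwarding setting from the live
// netstack so that afterLoad can re-apply it on restore.
func (ipf *ipForwarding) beforeSave() {
	ipf.enabled = ipf.stack.Forwarding(ipv4.ProtocolNumber)
}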
diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go
index 0f54888d8..6512e9cdb 100644
--- a/pkg/sentry/fsimpl/cgroupfs/base.go
+++ b/pkg/sentry/fsimpl/cgroupfs/base.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -68,11 +67,6 @@ func (c *controllerCommon) Enabled() bool {
return true
}
-// Filesystem implements kernel.CgroupController.Filesystem.
-func (c *controllerCommon) Filesystem() *vfs.Filesystem {
- return c.fs.VFSFilesystem()
-}
-
// RootCgroup implements kernel.CgroupController.RootCgroup.
func (c *controllerCommon) RootCgroup() kernel.Cgroup {
return c.fs.rootCgroup()
diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
index bd3e69757..54050de3c 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
@@ -109,7 +109,7 @@ type InternalData struct {
DefaultControlValues map[string]int64
}
-// filesystem implements vfs.FilesystemImpl.
+// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
//
// +stateify savable
type filesystem struct {
@@ -139,6 +139,11 @@ type filesystem struct {
tasksMu sync.RWMutex `state:"nosave"`
}
+// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID.
+func (fs *filesystem) InitializeHierarchyID(hid uint32) {
+ fs.hierarchyID = hid
+}
+
// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
return Name
@@ -284,14 +289,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Register controllers. The registry may be modified concurrently, so if we
// get an error, we raced with someone else who registered the same
// controllers first.
- hid, err := r.Register(fs.kcontrollers)
- if err != nil {
+ if err := r.Register(fs.kcontrollers, fs); err != nil {
ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err)
rootD.DecRef(ctx)
fs.VFSFilesystem().DecRef(ctx)
return nil, nil, syserror.EBUSY
}
- fs.hierarchyID = hid
// Move all existing tasks to the root of the new hierarchy.
k.PopulateNewCgroupHierarchy(fs.rootCgroup())
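The new Register signature hands the filesystem itself to the registry so the hierarchy ID can be pushed back through InitializeHierarchyID rather than returned to the caller. Below is a sketch of the kernel-side interface shape this implies; the exact definition in pkg/sentry/kernel is an assumption, and only the method name and signature are confirmed by the hunk above.

package kernel // assumed excerpt

// cgroupFS is implemented by cgroupfs.filesystem. The registry calls
// InitializeHierarchyID once it has allocated an ID for the newly
// registered hierarchy.
type cgroupFS interface {
	InitializeHierarchyID(hid uint32)
}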
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index e6fe0fc0d..daff40cd5 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -36,7 +36,7 @@ const Name = "devtmpfs"
//
// +stateify savable
type FilesystemType struct {
- initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported.
+ initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
initErr error
// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 7b1eec3da..2dbc6bfd5 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -46,7 +46,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/fd",
"//pkg/fspath",
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 6d5258a9b..368272f12 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -38,6 +38,7 @@ go_library(
"host_named_pipe.go",
"p9file.go",
"regular_file.go",
+ "revalidate.go",
"save_restore.go",
"socket.go",
"special_file.go",
@@ -53,6 +54,7 @@ go_library(
"//pkg/fspath",
"//pkg/hostarch",
"//pkg/log",
+ "//pkg/metric",
"//pkg/p9",
"//pkg/refs",
"//pkg/refsvfs2",
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 4b5621043..91ec4a142 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -117,6 +117,17 @@ func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
return ds
}
+// Precondition: !parent.isSynthetic() && !child.isSynthetic().
+func appendNewChildDentry(ds **[]*dentry, parent *dentry, child *dentry) {
+ // The new child was added to parent and took a ref on the parent (hence
+ // parent can be removed from cache). A new child has 0 refs for now. So
+ // checkCachingLocked() should be called on both. Call it first on the parent
+ // as it may create space in the cache for child to be inserted - hence
+ // avoiding a cache eviction.
+ *ds = appendDentry(*ds, parent)
+ *ds = appendDentry(*ds, child)
+}
+
// Preconditions: ds != nil.
func putDentrySlice(ds *[]*dentry) {
// Allow dentries to be GC'd.
@@ -169,167 +180,96 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]
// * fs.renameMu must be locked.
// * d.dirMu must be locked.
// * !rp.Done().
-// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
-// to date.
+// * If !d.cachedMetadataAuthoritative(), then d and all children that are
+// part of rp must have been revalidated.
//
// Postconditions: The returned dentry's cached metadata is up to date.
-func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, bool, error) {
if !d.isDir() {
- return nil, syserror.ENOTDIR
+ return nil, false, syserror.ENOTDIR
}
if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
- return nil, err
+ return nil, false, err
}
+ followedSymlink := false
afterSymlink:
name := rp.Component()
if name == "." {
rp.Advance()
- return d, nil
+ return d, followedSymlink, nil
}
if name == ".." {
if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
- return nil, err
+ return nil, false, err
} else if isRoot || d.parent == nil {
rp.Advance()
- return d, nil
- }
- // We must assume that d.parent is correct, because if d has been moved
- // elsewhere in the remote filesystem so that its parent has changed,
- // we have no way of determining its new parent's location in the
- // filesystem.
- //
- // Call rp.CheckMount() before updating d.parent's metadata, since if
- // we traverse to another mount then d.parent's metadata is irrelevant.
- if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
- return nil, err
+ return d, followedSymlink, nil
}
- if d != d.parent && !d.cachedMetadataAuthoritative() {
- if err := d.parent.updateFromGetattr(ctx); err != nil {
- return nil, err
- }
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, false, err
}
rp.Advance()
- return d.parent, nil
+ return d.parent, followedSymlink, nil
}
- child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds)
+ child, err := fs.getChildLocked(ctx, d, name, ds)
if err != nil {
- return nil, err
- }
- if child == nil {
- return nil, syserror.ENOENT
+ return nil, false, err
}
if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
- return nil, err
+ return nil, false, err
}
if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
target, err := child.readlink(ctx, rp.Mount())
if err != nil {
- return nil, err
+ return nil, false, err
}
if err := rp.HandleSymlink(target); err != nil {
- return nil, err
+ return nil, false, err
}
+ followedSymlink = true
goto afterSymlink // don't check the current directory again
}
rp.Advance()
- return child, nil
+ return child, followedSymlink, nil
}
// getChildLocked returns a dentry representing the child of parent with the
-// given name. If no such child exists, getChildLocked returns (nil, nil).
+// given name. Returns ENOENT if the child doesn't exist.
//
// Preconditions:
// * fs.renameMu must be locked.
// * parent.dirMu must be locked.
// * parent.isDir().
// * name is not "." or "..".
-//
-// Postconditions: If getChildLocked returns a non-nil dentry, its cached
-// metadata is up to date.
-func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+// * The dentry at name has been revalidated.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
if len(name) > maxFilenameLen {
return nil, syserror.ENAMETOOLONG
}
- child, ok := parent.children[name]
- if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
- // Whether child is nil or not, it is cached information that is
- // assumed to be correct.
+ if child, ok := parent.children[name]; ok || parent.isSynthetic() {
+ if child == nil {
+ return nil, syserror.ENOENT
+ }
return child, nil
}
- // We either don't have cached information or need to verify that it's
- // still correct, either of which requires a remote lookup. Check if this
- // name is valid before performing the lookup.
- return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
-}
-// Preconditions: Same as getChildLocked, plus:
-// * !parent.isSynthetic().
-func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
- if child != nil {
- // Need to lock child.metadataMu because we might be updating child
- // metadata. We need to hold the lock *before* getting metadata from the
- // server and release it after updating local metadata.
- child.metadataMu.Lock()
- }
qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
- if err != nil && err != syserror.ENOENT {
- if child != nil {
- child.metadataMu.Unlock()
+ if err != nil {
+ if err == syserror.ENOENT {
+ parent.cacheNegativeLookupLocked(name)
}
return nil, err
}
- if child != nil {
- if !file.isNil() && qid.Path == child.qidPath {
- // The file at this path hasn't changed. Just update cached metadata.
- file.close(ctx)
- child.updateFromP9AttrsLocked(attrMask, &attr)
- child.metadataMu.Unlock()
- return child, nil
- }
- child.metadataMu.Unlock()
- if file.isNil() && child.isSynthetic() {
- // We have a synthetic file, and no remote file has arisen to
- // replace it.
- return child, nil
- }
- // The file at this path has changed or no longer exists. Mark the
- // dentry invalidated, and re-evaluate its caching status (i.e. if it
- // has 0 references, drop it). Wait to update parent.children until we
- // know what to replace the existing dentry with (i.e. one of the
- // returns below), to avoid a redundant map access.
- vfsObj.InvalidateDentry(ctx, &child.vfsd)
- if child.isSynthetic() {
- // Normally we don't mark invalidated dentries as deleted since
- // they may still exist (but at a different path), and also for
- // consistency with Linux. However, synthetic files are guaranteed
- // to become unreachable if their dentries are invalidated, so
- // treat their invalidation as deletion.
- child.setDeleted()
- parent.syntheticChildren--
- child.decRefNoCaching()
- parent.dirents = nil
- }
- *ds = appendDentry(*ds, child)
- }
- if file.isNil() {
- // No file exists at this path now. Cache the negative lookup if
- // allowed.
- parent.cacheNegativeLookupLocked(name)
- return nil, nil
- }
+
// Create a new dentry representing the file.
- child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
+ child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
if err != nil {
file.close(ctx)
delete(parent.children, name)
return nil, err
}
parent.cacheNewChildLocked(child, name)
- // For now, child has 0 references, so our caller should call
- // child.checkCachingLocked(). parent gained a ref so we should also call
- // parent.checkCachingLocked() so it can be removed from the cache if needed.
- *ds = appendDentry(*ds, child)
- *ds = appendDentry(*ds, parent)
+ appendNewChildDentry(ds, parent, child)
return child, nil
}
@@ -344,14 +284,22 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
// to date.
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+ if err := fs.revalidateParentDir(ctx, rp, d, ds); err != nil {
+ return nil, err
+ }
for !rp.Final() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
}
d = next
+ if followedSymlink {
+ if err := fs.revalidateParentDir(ctx, rp, d, ds); err != nil {
+ return nil, err
+ }
+ }
}
if !d.isDir() {
return nil, syserror.ENOTDIR
@@ -364,20 +312,22 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
// Preconditions: fs.renameMu must be locked.
func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
d := rp.Start().Impl().(*dentry)
- if !d.cachedMetadataAuthoritative() {
- // Get updated metadata for rp.Start() as required by fs.stepLocked().
- if err := d.updateFromGetattr(ctx); err != nil {
- return nil, err
- }
+ if err := fs.revalidatePath(ctx, rp, d, ds); err != nil {
+ return nil, err
}
for !rp.Done() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
}
d = next
+ if followedSymlink {
+ if err := fs.revalidatePath(ctx, rp, d, ds); err != nil {
+ return nil, err
+ }
+ }
}
if rp.MustBeDir() && !d.isDir() {
return nil, syserror.ENOTDIR
@@ -397,13 +347,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
- if !start.cachedMetadataAuthoritative() {
- // Get updated metadata for start as required by
- // fs.walkParentDirLocked().
- if err := start.updateFromGetattr(ctx); err != nil {
- return err
- }
- }
parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
if err != nil {
return err
@@ -421,25 +364,47 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if parent.isDeleted() {
return syserror.ENOENT
}
+ if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil {
+ return err
+ }
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
- child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), parent, name, &ds)
- switch {
- case err != nil && err != syserror.ENOENT:
- return err
- case child != nil:
+ if len(name) > maxFilenameLen {
+ return syserror.ENAMETOOLONG
+ }
+ // Check for existence only if caching information is available. Otherwise,
+ // don't check for existence just yet. We will check for existence if the
+ // checks for writability fail below. Existence check is done by the creation
+ // RPCs themselves.
+ if child, ok := parent.children[name]; ok && child != nil {
return syserror.EEXIST
}
+ checkExistence := func() error {
+ if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && err != syserror.ENOENT {
+ return err
+ } else if child != nil {
+ return syserror.EEXIST
+ }
+ return nil
+ }
mnt := rp.Mount()
if err := mnt.CheckBeginWrite(); err != nil {
+ // Existence check takes precedence.
+ if existenceErr := checkExistence(); existenceErr != nil {
+ return existenceErr
+ }
return err
}
defer mnt.EndWrite()
if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ // Existence check takes precedence.
+ if existenceErr := checkExistence(); existenceErr != nil {
+ return existenceErr
+ }
return err
}
if !dir && rp.MustBeDir() {
@@ -489,13 +454,6 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
- if !start.cachedMetadataAuthoritative() {
- // Get updated metadata for start as required by
- // fs.walkParentDirLocked().
- if err := start.updateFromGetattr(ctx); err != nil {
- return err
- }
- }
parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
if err != nil {
return err
@@ -521,33 +479,32 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
return syserror.EISDIR
}
}
+
vfsObj := rp.VirtualFilesystem()
+ if err := fs.revalidateOne(ctx, vfsObj, parent, rp.Component(), &ds); err != nil {
+ return err
+ }
+
mntns := vfs.MountNamespaceFromContext(ctx)
defer mntns.DecRef(ctx)
+
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
- child, ok := parent.children[name]
- if ok && child == nil {
- return syserror.ENOENT
- }
-
- sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
- if sticky {
- if !ok {
- // If the sticky bit is set, we need to retrieve the child to determine
- // whether removing it is allowed.
- child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
- if err != nil {
- return err
- }
- } else if child != nil && !child.cachedMetadataAuthoritative() {
- // Make sure the dentry representing the file at name is up to date
- // before examining its metadata.
- child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
- if err != nil {
- return err
- }
+ // Load child if sticky bit is set because we need to determine whether
+ // deletion is allowed.
+ var child *dentry
+ if atomic.LoadUint32(&parent.mode)&linux.ModeSticky == 0 {
+ var ok bool
+ child, ok = parent.children[name]
+ if ok && child == nil {
+ // Hit a negative cached entry, child doesn't exist.
+ return syserror.ENOENT
+ }
+ } else {
+ child, _, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err != nil {
+ return err
}
if err := parent.mayDelete(rp.Credentials(), child); err != nil {
return err
@@ -556,11 +513,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
// If a child dentry exists, prepare to delete it. This should fail if it is
// a mount point. We detect mount points by speculatively calling
- // PrepareDeleteDentry, which fails if child is a mount point. However, we
- // may need to revalidate the file in this case to make sure that it has not
- // been deleted or replaced on the remote fs, in which case the mount point
- // will have disappeared. If calling PrepareDeleteDentry fails again on the
- // up-to-date dentry, we can be sure that it is a mount point.
+ // PrepareDeleteDentry, which fails if child is a mount point.
//
// Also note that if child is nil, then it can't be a mount point.
if child != nil {
@@ -575,23 +528,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
child.dirMu.Lock()
defer child.dirMu.Unlock()
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
- // We can skip revalidation in several cases:
- // - We are not in InteropModeShared
- // - The parent directory is synthetic, in which case the child must also
- // be synthetic
- // - We already updated the child during the sticky bit check above
- if parent.cachedMetadataAuthoritative() || sticky {
- return err
- }
- child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
- if err != nil {
- return err
- }
- if child != nil {
- if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
- return err
- }
- }
+ return err
}
}
flags := uint32(0)
@@ -723,13 +660,6 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
- if !start.cachedMetadataAuthoritative() {
- // Get updated metadata for start as required by
- // fs.walkParentDirLocked().
- if err := start.updateFromGetattr(ctx); err != nil {
- return nil, err
- }
- }
d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
if err != nil {
return nil, err
@@ -830,7 +760,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// to creating a synthetic one, i.e. one that is kept entirely in memory.
// Check that we're not overriding an existing file with a synthetic one.
- _, err = fs.stepLocked(ctx, rp, parent, true, ds)
+ _, _, err = fs.stepLocked(ctx, rp, parent, true, ds)
switch {
case err == nil:
// Step succeeded, another file exists.
@@ -891,12 +821,6 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
defer unlock()
start := rp.Start().Impl().(*dentry)
- if !start.cachedMetadataAuthoritative() {
- // Get updated metadata for start as required by fs.stepLocked().
- if err := start.updateFromGetattr(ctx); err != nil {
- return nil, err
- }
- }
if rp.Done() {
// Reject attempts to open mount root directory with O_CREAT.
if mayCreate && rp.MustBeDir() {
@@ -905,6 +829,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if mustCreate {
return nil, syserror.EEXIST
}
+ if !start.cachedMetadataAuthoritative() {
+ // Refresh dentry's attributes before opening.
+ if err := start.updateFromGetattr(ctx); err != nil {
+ return nil, err
+ }
+ }
start.IncRef()
defer start.DecRef(ctx)
unlock()
@@ -926,9 +856,12 @@ afterTrailingSymlink:
if mayCreate && rp.MustBeDir() {
return nil, syserror.EISDIR
}
+ if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil {
+ return nil, err
+ }
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
- child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ child, _, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
if err == syserror.ENOENT && mayCreate {
if parent.isSynthetic() {
parent.dirMu.Unlock()
@@ -1028,7 +961,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
}
return &fd.vfsfd, nil
case linux.S_IFLNK:
- // Can't open symlinks without O_PATH (which is unimplemented).
+ // Can't open symlinks without O_PATH, which is handled at the VFS layer.
return nil, syserror.ELOOP
case linux.S_IFSOCK:
if d.isSynthetic() {
@@ -1188,7 +1121,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
return nil, err
}
- *ds = appendDentry(*ds, child)
// Incorporate the fid that was opened by lcreate.
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
if useRegularFileFD {
@@ -1212,7 +1144,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
// Insert the dentry into the tree.
d.cacheNewChildLocked(child, name)
- *ds = appendDentry(*ds, d)
+ appendNewChildDentry(ds, d, child)
if d.cachedMetadataAuthoritative() {
d.touchCMtime()
d.dirents = nil
@@ -1297,18 +1229,23 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
+
vfsObj := rp.VirtualFilesystem()
+ if err := fs.revalidateOne(ctx, vfsObj, newParent, newName, &ds); err != nil {
+ return err
+ }
+ if err := fs.revalidateOne(ctx, vfsObj, oldParent, oldName, &ds); err != nil {
+ return err
+ }
+
// We need a dentry representing the renamed file since, if it's a
// directory, we need to check for write permission on it.
oldParent.dirMu.Lock()
defer oldParent.dirMu.Unlock()
- renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds)
+ renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
if err != nil {
return err
}
- if renamed == nil {
- return syserror.ENOENT
- }
if err := oldParent.mayDelete(creds, renamed); err != nil {
return err
}
@@ -1337,8 +1274,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if newParent.isDeleted() {
return syserror.ENOENT
}
- replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
- if err != nil {
+ replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds)
+ if err != nil && err != syserror.ENOENT {
return err
}
var replacedVFSD *vfs.Dentry
@@ -1402,9 +1339,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// parent isn't actually changing.
if oldParent != newParent {
oldParent.decRefNoCaching()
- ds = appendDentry(ds, oldParent)
newParent.IncRef()
ds = appendDentry(ds, newParent)
+ ds = appendDentry(ds, oldParent)
if renamed.isSynthetic() {
oldParent.syntheticChildren--
newParent.syntheticChildren++
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index fb42c5f62..21692d2ac 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -32,9 +32,9 @@
// specialFileFD.mu
// specialFileFD.bufMu
//
-// Locking dentry.dirMu in multiple dentries requires that either ancestor
-// dentries are locked before descendant dentries, or that filesystem.renameMu
-// is locked for writing.
+// Locking dentry.dirMu and dentry.metadataMu in multiple dentries requires that
+// either ancestor dentries are locked before descendant dentries, or that
+// filesystem.renameMu is locked for writing.
package gofer
import (
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 21b4a96fe..b0a429d42 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -238,3 +238,10 @@ func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, err
ctx.UninterruptibleSleepFinish(false)
return fdobj, err
}
+
+func (f p9file) multiGetAttr(ctx context.Context, names []string) ([]p9.FullStat, error) {
+ ctx.UninterruptibleSleepStart(false)
+ stats, err := f.file.MultiGetAttr(names)
+ ctx.UninterruptibleSleepFinish(false)
+ return stats, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index f0e7bbaf7..eed05e369 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -59,7 +60,7 @@ func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD,
return nil, err
}
if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) {
- fsmetric.GoferOpensWX.Increment()
+ metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file")
}
if atomic.LoadInt32(&d.mmapFD) >= 0 {
fsmetric.GoferOpensHost.Increment()
diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go
new file mode 100644
index 000000000..8f81f0822
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/revalidate.go
@@ -0,0 +1,386 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+)
+
+type errPartialRevalidation struct{}
+
+// Error implements error.Error.
+func (errPartialRevalidation) Error() string {
+ return "partial revalidation"
+}
+
+type errRevalidationStepDone struct{}
+
+// Error implements error.Error.
+func (errRevalidationStepDone) Error() string {
+ return "stop revalidation"
+}
+
+// revalidatePath checks cached dentries for external modification. File
+// attributes are refreshed and the cache is invalidated if the dentry has been
+// deleted or a new file/directory has been created in its place.
+//
+// Revalidation stops at symlinks and mount points. The caller is responsible
+// for revalidating again after symlinks are resolved and after changing to
+// different mounts.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+func (fs *filesystem) revalidatePath(ctx context.Context, rpOrig *vfs.ResolvingPath, start *dentry, ds **[]*dentry) error {
+ // Revalidation is done even if start is synthetic in case the path is
+ // something like: ../non_synthetic_file.
+ if fs.opts.interop != InteropModeShared {
+ return nil
+ }
+
+ // Copy resolving path to walk the path for revalidation.
+ rp := rpOrig.Copy()
+ err := fs.revalidate(ctx, rp, start, rp.Done, ds)
+ rp.Release(ctx)
+ return err
+}
+
+// revalidateParentDir does the same as revalidatePath, but stops at the parent.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+func (fs *filesystem) revalidateParentDir(ctx context.Context, rpOrig *vfs.ResolvingPath, start *dentry, ds **[]*dentry) error {
+ // Revalidation is done even if start is synthetic, in case the path is
+ // something like ../non_synthetic_file and the parent is non-synthetic.
+ if fs.opts.interop != InteropModeShared {
+ return nil
+ }
+
+ // Copy resolving path to walk the path for revalidation.
+ rp := rpOrig.Copy()
+ err := fs.revalidate(ctx, rp, start, rp.Final, ds)
+ rp.Release(ctx)
+ return err
+}
+
+// revalidateOne does the same as revalidatePath, but checks a single dentry.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+func (fs *filesystem) revalidateOne(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) error {
+ // Skip revalidation if the interop mode is not InteropModeShared or if the
+ // parent is synthetic (the child must then be synthetic too, and it cannot
+ // be replaced without first replacing the parent).
+ if parent.cachedMetadataAuthoritative() {
+ return nil
+ }
+
+ parent.dirMu.Lock()
+ child, ok := parent.children[name]
+ parent.dirMu.Unlock()
+ if !ok {
+ return nil
+ }
+
+ state := makeRevalidateState(parent)
+ defer state.release()
+
+ state.add(name, child)
+ return fs.revalidateHelper(ctx, vfsObj, state, ds)
+}
+
+// revalidate revalidates path components in rp until done returns true, or
+// until a mount point or symlink is reached. It may send multiple MultiGetAttr
+// calls to the gofer to handle ".." in the path.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * InteropModeShared is in effect.
+func (fs *filesystem) revalidate(ctx context.Context, rp *vfs.ResolvingPath, start *dentry, done func() bool, ds **[]*dentry) error {
+ state := makeRevalidateState(start)
+ defer state.release()
+
+ // Skip synthetic dentries because the start dentry cannot be replaced in case
+ // it has been created in the remote file system.
+ if !start.isSynthetic() {
+ state.add("", start)
+ }
+
+done:
+ for cur := start; !done(); {
+ var err error
+ cur, err = fs.revalidateStep(ctx, rp, cur, state)
+ if err != nil {
+ switch err.(type) {
+ case errPartialRevalidation:
+ if err := fs.revalidateHelper(ctx, rp.VirtualFilesystem(), state, ds); err != nil {
+ return err
+ }
+
+ // Reset state to release any remaining locks and restart from where
+ // stepping stopped.
+ state.reset()
+ state.start = cur
+
+ // Skip synthetic dentries because the start dentry cannot be replaced in
+ // case it has been created in the remote file system.
+ if !cur.isSynthetic() {
+ state.add("", cur)
+ }
+
+ case errRevalidationStepDone:
+ break done
+
+ default:
+ return err
+ }
+ }
+ }
+ return fs.revalidateHelper(ctx, rp.VirtualFilesystem(), state, ds)
+}
+
+// revalidateStep walks one element of the path and updates the revalidateState
+// with the dentry if needed. It may also stop the stepping or ask for a
+// partial revalidation. Partial revalidation requires the caller to revalidate
+// the current revalidateState, release all locks, and resume stepping.
+// In case a symlink is hit, revalidation stops and the caller is responsible
+// for calling revalidate again after the symlink is resolved. Revalidation may
+// also stop for other reasons, like hitting a child not in the cache.
+//
+// Returns:
+// * (dentry, nil): step worked, continue stepping.
+// * (dentry, errPartialRevalidation): revalidation should be done with the
+// state gathered so far. Then continue stepping with the remainder of the
+// path, starting at `dentry`.
+// * (nil, errRevalidationStepDone): revalidation doesn't need to step any
+// further. It hit a symlink, a mount point, or an uncached dentry.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+// * InteropModeShared is in effect (assumes no negative dentries).
+func (fs *filesystem) revalidateStep(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, state *revalidateState) (*dentry, error) {
+ switch name := rp.Component(); name {
+ case ".":
+ // Do nothing.
+
+ case "..":
+ // Partial revalidation is required when ".." is hit because metadata locks
+ // can only be acquired from parent to child to avoid deadlocks.
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, errRevalidationStepDone{}
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, errPartialRevalidation{}
+ }
+ // We must assume that d.parent is correct, because if d has been moved
+ // elsewhere in the remote filesystem so that its parent has changed,
+ // we have no way of determining its new parent's location in the
+ // filesystem.
+ //
+ // Call rp.CheckMount() before updating d.parent's metadata, since if
+ // we traverse to another mount then d.parent's metadata is irrelevant.
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, errRevalidationStepDone{}
+ }
+ rp.Advance()
+ return d.parent, errPartialRevalidation{}
+
+ default:
+ d.dirMu.Lock()
+ child, ok := d.children[name]
+ d.dirMu.Unlock()
+ if !ok {
+ // child is not cached, no need to validate any further.
+ return nil, errRevalidationStepDone{}
+ }
+
+ state.add(name, child)
+
+ // Symlink must be resolved before continuing with revalidation.
+ if child.isSymlink() {
+ return nil, errRevalidationStepDone{}
+ }
+
+ d = child
+ }
+
+ rp.Advance()
+ return d, nil
+}
+
+// revalidateHelper calls the gofer to stat all dentries in `state`. It will
+// update or invalidate dentries in the cache based on the result.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * InteropModeShared is in effect.
+func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error {
+ if len(state.names) == 0 {
+ return nil
+ }
+ // Lock metadata on all dentries *before* getting attributes for them.
+ state.lockAllMetadata()
+ stats, err := state.start.file.multiGetAttr(ctx, state.names)
+ if err != nil {
+ return err
+ }
+
+ i := -1
+ for d := state.popFront(); d != nil; d = state.popFront() {
+ i++
+ found := i < len(stats)
+ if i == 0 && len(state.names[0]) == 0 {
+ if found && !d.isSynthetic() {
+ // First dentry is where the search is starting, just update attributes
+ // since it cannot be replaced.
+ d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr)
+ }
+ d.metadataMu.Unlock()
+ continue
+ }
+
+ // Note that synthetic dentries will always fail the comparison check
+ // below.
+ if !found || d.qidPath != stats[i].QID.Path {
+ d.metadataMu.Unlock()
+ if !found && d.isSynthetic() {
+ // We have a synthetic file, and no remote file has arisen to replace
+ // it.
+ return nil
+ }
+ // The file at this path has changed or no longer exists. Mark the
+ // dentry invalidated, and re-evaluate its caching status (i.e. if it
+ // has 0 references, drop it). The dentry will be reloaded next time it's
+ // accessed.
+ vfsObj.InvalidateDentry(ctx, &d.vfsd)
+
+ name := state.names[i]
+ d.parent.dirMu.Lock()
+
+ if d.isSynthetic() {
+ // Normally we don't mark invalidated dentries as deleted since
+ // they may still exist (but at a different path), and also for
+ // consistency with Linux. However, synthetic files are guaranteed
+ // to become unreachable if their dentries are invalidated, so
+ // treat their invalidation as deletion.
+ d.setDeleted()
+ d.decRefNoCaching()
+ *ds = appendDentry(*ds, d)
+
+ d.parent.syntheticChildren--
+ d.parent.dirents = nil
+ }
+
+ // Since the dirMu was released and reacquired, re-check that the
+ // parent's child with this name is still the same. Do not touch it if
+ // it has been replaced with a different one.
+ if child := d.parent.children[name]; child == d {
+ // Invalidate dentry so it gets reloaded next time it's accessed.
+ delete(d.parent.children, name)
+ }
+ d.parent.dirMu.Unlock()
+
+ return nil
+ }
+
+ // The file at this path hasn't changed. Just update cached metadata.
+ d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr)
+ d.metadataMu.Unlock()
+ }
+
+ return nil
+}
+
+// revalidateStatePool caches revalidateState instances to save array
+// allocations for dentries and names.
+var revalidateStatePool = sync.Pool{
+ New: func() interface{} {
+ return &revalidateState{}
+ },
+}
+
+// revalidateState keeps state related to a revalidation request. It keeps
+// track of the {name, dentry} list being revalidated, as well as metadata
+// locks on the dentries. The list must be in ancestry order; in other words,
+// entry `n` must be a child of entry `n-1`.
+type revalidateState struct {
+ // start is the dentry where to start the attributes search.
+ start *dentry
+
+ // names is the list of entry names whose attributes are to be refreshed. It
+ // must be the same length as dentries. They are kept in separate slices
+ // because names is passed directly to File.MultiGetAttr().
+ names []string
+
+ // dentries is the list of dentries that correspond to the names above.
+ // dentry.metadataMu is acquired as each dentry is added to this list.
+ dentries []*dentry
+
+ // locked indicates if metadata lock has been acquired on dentries.
+ locked bool
+}
+
+func makeRevalidateState(start *dentry) *revalidateState {
+ r := revalidateStatePool.Get().(*revalidateState)
+ r.start = start
+ return r
+}
+
+// release must be called after the caller is done with this object. It releases
+// all metadata locks and resources.
+func (r *revalidateState) release() {
+ r.reset()
+ revalidateStatePool.Put(r)
+}
+
+// Preconditions:
+// * d is a descendant of all dentries in r.dentries.
+func (r *revalidateState) add(name string, d *dentry) {
+ r.names = append(r.names, name)
+ r.dentries = append(r.dentries, d)
+}
+
+func (r *revalidateState) lockAllMetadata() {
+ for _, d := range r.dentries {
+ d.metadataMu.Lock()
+ }
+ r.locked = true
+}
+
+func (r *revalidateState) popFront() *dentry {
+ if len(r.dentries) == 0 {
+ return nil
+ }
+ d := r.dentries[0]
+ r.dentries = r.dentries[1:]
+ return d
+}
+
+// reset releases all metadata locks and resets all fields to allow this
+// instance to be reused.
+func (r *revalidateState) reset() {
+ if r.locked {
+ // Unlock any remaining dentries.
+ for _, d := range r.dentries {
+ d.metadataMu.Unlock()
+ }
+ r.locked = false
+ }
+ r.start = nil
+ r.names = r.names[:0]
+ r.dentries = r.dentries[:0]
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index ac3b5b621..c12444b7e 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fsmetric"
@@ -100,7 +101,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci
d.fs.specialFileFDs[fd] = struct{}{}
d.fs.syncMu.Unlock()
if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) {
- fsmetric.GoferOpensWX.Increment()
+ metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file")
}
if h.fd >= 0 {
fsmetric.GoferOpensHost.Increment()
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index badca4d9f..f50b0fb08 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -612,16 +612,24 @@ afterTrailingSymlink:
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
- fs.mu.RLock()
defer fs.processDeferredDecRefs(ctx)
- defer fs.mu.RUnlock()
+
+ fs.mu.RLock()
d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return "", err
}
if !d.isSymlink() {
+ fs.mu.RUnlock()
return "", syserror.EINVAL
}
+
+ // Inode.Readlink() cannot be called holding fs locks.
+ d.IncRef()
+ defer d.DecRef(ctx)
+ fs.mu.RUnlock()
+
return d.inode.Readlink(ctx, rp.Mount())
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 16486eeae..6f699c9cd 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -534,6 +534,9 @@ func (d *Dentry) FSLocalPath() string {
// - Checking that dentries passed to methods are of the appropriate file type.
// - Checking permissions.
//
+// Inode functions may be called while holding filesystem-wide locks and are
+// not allowed to call vfs functions that may reenter, unless otherwise noted.
+//
// Specific responsibilities of implementations are documented below.
type Inode interface {
// Methods related to reference counting. A generic implementation is
@@ -680,6 +683,9 @@ type inodeDirectory interface {
type inodeSymlink interface {
// Readlink returns the target of a symbolic link. If an inode is not a
// symlink, the implementation should return EINVAL.
+ //
+ // Readlink is called with no kernfs locks held, so it may reenter if needed
+ // to resolve symlink targets.
Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
// Getlink returns the target of a symbolic link, as used by path
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 02bf74dbc..4718fac7a 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -221,6 +221,8 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error)
defer file.DecRef(ctx)
root := vfs.RootFromContext(ctx)
defer root.DecRef(ctx)
+
+ // Note: it's safe to reenter kernfs from Readlink if needed to resolve path.
return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 7c7543f14..cf905fae4 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -65,6 +65,7 @@ var _ kernfs.Inode = (*tasksInode)(nil)
func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, fakeCgroupControllers map[string]string) *tasksInode {
root := auth.NewRootCredentials(pidns.UserNamespace())
contents := map[string]kernfs.Inode{
+ "cmdline": fs.newInode(ctx, root, 0444, &cmdLineData{}),
"cpuinfo": fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))),
"filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}),
"loadavg": fs.newInode(ctx, root, 0444, &loadavgData{}),
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index e1a8b4409..045ed7a2d 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -336,15 +336,6 @@ var _ dynamicInode = (*versionData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- k := kernel.KernelFromContext(ctx)
- init := k.GlobalInit()
- if init == nil {
- // Attempted to read before the init Task is created. This can
- // only occur during startup, which should never need to read
- // this file.
- panic("Attempted to read version before initial Task is available")
- }
-
// /proc/version takes the form:
//
// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
@@ -364,7 +355,7 @@ func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// FIXME(mpratt): Using Version from the init task SyscallTable
// disregards the different version a task may have (e.g., in a uts
// namespace).
- ver := init.Leader().SyscallTable().Version
+ ver := kernelVersion(ctx)
fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
return nil
}
@@ -400,3 +391,31 @@ func (*cgroupsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
r.GenerateProcCgroups(buf)
return nil
}
+
+// cmdLineData backs /proc/cmdline.
+//
+// +stateify savable
+type cmdLineData struct {
+ dynamicBytesFileSetAttr
+}
+
+var _ dynamicInode = (*cmdLineData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*cmdLineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "BOOT_IMAGE=/vmlinuz-%s-gvisor quiet\n", kernelVersion(ctx).Release)
+ return nil
+}
+
+// kernelVersion returns the kernel version.
+func kernelVersion(ctx context.Context) kernel.Version {
+ k := kernel.KernelFromContext(ctx)
+ init := k.GlobalInit()
+ if init == nil {
+ // Attempted to read before the init Task is created. This can
+ // only occur during startup, which should never need to read
+ // this file.
+ panic("Attempted to read version before initial Task is available")
+ }
+ return init.Leader().SyscallTable().Version
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 9b14dd6b9..88ab49048 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -365,27 +365,22 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
}
// ipForwarding implements vfs.WritableDynamicBytesSource for
-// /proc/sys/net/ipv4/ip_forwarding.
+// /proc/sys/net/ipv4/ip_forward.
//
// +stateify savable
type ipForwarding struct {
kernfs.DynamicBytesFile
stack inet.Stack `state:"wait"`
- enabled *bool
+ enabled bool
}
var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
- if ipf.enabled == nil {
- enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber)
- ipf.enabled = &enabled
- }
-
val := "0\n"
- if *ipf.enabled {
+ if ipf.enabled {
// Technically, this is not quite compatible with Linux. Linux stores these
// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
// Tough luck.
@@ -414,11 +409,8 @@ func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offs
if err != nil {
return 0, err
}
- if ipf.enabled == nil {
- ipf.enabled = new(bool)
- }
- *ipf.enabled = v != 0
- if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+ ipf.enabled = v != 0
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil {
return 0, err
}
return n, nil
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index 6cee22823..19b012f7d 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -132,7 +132,7 @@ func TestConfigureIPForwarding(t *testing.T) {
t.Run(c.comment, func(t *testing.T) {
s.IPForwarding = c.initial
- file := &ipForwarding{stack: s, enabled: &c.initial}
+ file := &ipForwarding{stack: s, enabled: c.initial}
// Write the values.
src := usermem.BytesIOSequence([]byte(c.str))
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index d6f076cd6..e534fbca8 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -47,6 +47,7 @@ var (
var (
tasksStaticFiles = map[string]testutil.DirentType{
+ "cmdline": linux.DT_REG,
"cpuinfo": linux.DT_REG,
"filesystems": linux.DT_REG,
"loadavg": linux.DT_REG,
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 5fdca1d46..766289e60 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -465,7 +465,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
}
return &fd.vfsfd, nil
case *symlink:
- // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
+ // Can't open symlinks without O_PATH, which is handled at the VFS layer.
return nil, syserror.ELOOP
case *namedPipe:
return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index ca8090bbf..3582d14c9 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -168,10 +168,6 @@ afterSymlink:
// Preconditions:
// * fs.renameMu must be locked.
// * d.dirMu must be locked.
-//
-// TODO(b/166474175): Investigate all possible errors returned in this
-// function, and make sure we differentiate all errors that indicate unexpected
-// modifications to the file system from the ones that are not harmful.
func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
vfsObj := fs.vfsfs.VirtualFilesystem()
@@ -278,16 +274,15 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi
var buf bytes.Buffer
parent.hashMu.RLock()
_, err = merkletree.Verify(&merkletree.VerifyParams{
- Out: &buf,
- File: &fdReader,
- Tree: &fdReader,
- Size: int64(parentSize),
- Name: parent.name,
- Mode: uint32(parentStat.Mode),
- UID: parentStat.UID,
- GID: parentStat.GID,
- Children: parent.childrenNames,
- //TODO(b/156980949): Support passing other hash algorithms.
+ Out: &buf,
+ File: &fdReader,
+ Tree: &fdReader,
+ Size: int64(parentSize),
+ Name: parent.name,
+ Mode: uint32(parentStat.Mode),
+ UID: parentStat.UID,
+ GID: parentStat.GID,
+ Children: parent.childrenNames,
HashAlgorithms: fs.alg.toLinuxHashAlg(),
ReadOffset: int64(offset),
ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())),
@@ -409,15 +404,14 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry
var buf bytes.Buffer
d.hashMu.RLock()
params := &merkletree.VerifyParams{
- Out: &buf,
- Tree: &fdReader,
- Size: int64(size),
- Name: d.name,
- Mode: uint32(stat.Mode),
- UID: stat.UID,
- GID: stat.GID,
- Children: d.childrenNames,
- //TODO(b/156980949): Support passing other hash algorithms.
+ Out: &buf,
+ Tree: &fdReader,
+ Size: int64(size),
+ Name: d.name,
+ Mode: uint32(stat.Mode),
+ UID: stat.UID,
+ GID: stat.GID,
+ Children: d.childrenNames,
HashAlgorithms: fs.alg.toLinuxHashAlg(),
ReadOffset: 0,
// Set read size to 0 so only the metadata is verified.
@@ -991,8 +985,6 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
}
// StatAt implements vfs.FilesystemImpl.StatAt.
-// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should
-// be verified.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
var ds *[]*dentry
fs.renameMu.RLock()
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 458c7fcb6..fa7696ad6 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -840,7 +840,6 @@ func (fd *fileDescription) Release(ctx context.Context) {
// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- // TODO(b/162788573): Add integrity check for metadata.
stat, err := fd.lowerFD.Stat(ctx, opts)
if err != nil {
return linux.Statx{}, err
@@ -960,10 +959,9 @@ func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, ui
}
params := &merkletree.GenerateParams{
- TreeReader: &merkleReader,
- TreeWriter: &merkleWriter,
- Children: fd.d.childrenNames,
- //TODO(b/156980949): Support passing other hash algorithms.
+ TreeReader: &merkleReader,
+ TreeWriter: &merkleWriter,
+ Children: fd.d.childrenNames,
HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
Name: fd.d.name,
Mode: uint32(stat.Mode),
@@ -1192,8 +1190,6 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
case linux.FS_IOC_GETFLAGS:
return fd.verityFlags(ctx, args[2].Pointer())
default:
- // TODO(b/169682228): Investigate which ioctl commands should
- // be allowed.
return 0, syserror.ENOSYS
}
}
@@ -1253,16 +1249,15 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
fd.d.hashMu.RLock()
n, err := merkletree.Verify(&merkletree.VerifyParams{
- Out: dst.Writer(ctx),
- File: &dataReader,
- Tree: &merkleReader,
- Size: int64(size),
- Name: fd.d.name,
- Mode: fd.d.mode,
- UID: fd.d.uid,
- GID: fd.d.gid,
- Children: fd.d.childrenNames,
- //TODO(b/156980949): Support passing other hash algorithms.
+ Out: dst.Writer(ctx),
+ File: &dataReader,
+ Tree: &merkleReader,
+ Size: int64(size),
+ Name: fd.d.name,
+ Mode: fd.d.mode,
+ UID: fd.d.uid,
+ GID: fd.d.gid,
+ Children: fd.d.childrenNames,
HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
ReadOffset: offset,
ReadSize: dst.NumBytes(),
@@ -1304,6 +1299,11 @@ func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapO
return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts)
}
+// SupportsLocks implements vfs.FileDescriptionImpl.SupportsLocks.
+func (fd *fileDescription) SupportsLocks() bool {
+ return fd.lowerFD.SupportsLocks()
+}
+
// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error {
return fd.lowerFD.LockBSD(ctx, ownerPID, t, block)
@@ -1333,7 +1333,7 @@ func (fd *fileDescription) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t
func (fd *fileDescription) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
ts, err := fd.lowerMappable.Translate(ctx, required, optional, at)
if err != nil {
- return ts, err
+ return nil, err
}
// dataSize is the size of the whole file.
@@ -1346,17 +1346,17 @@ func (fd *fileDescription) Translate(ctx context.Context, required, optional mem
// contains the expected xattrs. If the xattr does not exist, it
// indicates unexpected modifications to the file system.
if err == syserror.ENODATA {
- return ts, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+ return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
}
if err != nil {
- return ts, err
+ return nil, err
}
// The dataSize xattr should be an integer. If it's not, it indicates
// unexpected modifications to the file system.
size, err := strconv.Atoi(dataSize)
if err != nil {
- return ts, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+ return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
}
merkleReader := FileReadWriteSeeker{
@@ -1389,7 +1389,7 @@ func (fd *fileDescription) Translate(ctx context.Context, required, optional mem
DataAndTreeInSameFile: false,
})
if err != nil {
- return ts, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
+ return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
}
}
return ts, err
diff --git a/pkg/sentry/fsmetric/fsmetric.go b/pkg/sentry/fsmetric/fsmetric.go
index 7e535b527..17d0d5025 100644
--- a/pkg/sentry/fsmetric/fsmetric.go
+++ b/pkg/sentry/fsmetric/fsmetric.go
@@ -42,7 +42,6 @@ var (
// Metrics that only apply to fs/gofer and fsimpl/gofer.
var (
- GoferOpensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a executable file was opened writably from a gofer.")
GoferOpens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a file was opened from a gofer and did not have a host file descriptor.")
GoferOpensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a file was opened from a gofer and did have a host file descriptor.")
GoferReads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.")
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 6b71bd3a9..80dda1559 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -88,9 +88,6 @@ type Stack interface {
// for restoring a stack after a save.
RestoreCleanupEndpoints([]stack.TransportEndpoint)
- // Forwarding returns if packet forwarding between NICs is enabled.
- Forwarding(protocol tcpip.NetworkProtocolNumber) bool
-
// SetForwarding enables or disables packet forwarding between NICs.
SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index 03e2608c2..218d9dafc 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -154,11 +154,6 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint {
// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
-// Forwarding implements inet.Stack.Forwarding.
-func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
- return s.IPForwarding
-}
-
// SetForwarding implements inet.Stack.SetForwarding.
func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
s.IPForwarding = enable
diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go
index 1f1c63f37..0fbf27f64 100644
--- a/pkg/sentry/kernel/cgroup.go
+++ b/pkg/sentry/kernel/cgroup.go
@@ -48,10 +48,6 @@ type CgroupController interface {
// attached to. Returned value is valid for the lifetime of the controller.
HierarchyID() uint32
- // Filesystem returns the filesystem this controller is attached to.
- // Returned value is valid for the lifetime of the controller.
- Filesystem() *vfs.Filesystem
-
// RootCgroup returns the root cgroup for this controller. Returned value is
// valid for the lifetime of the controller.
RootCgroup() Cgroup
@@ -124,6 +120,19 @@ func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
return true
}
+// cgroupFS is the public interface to cgroupfs. This lets the kernel package
+// refer to cgroupfs.filesystem methods without directly depending on the
+// cgroupfs package, which would lead to a circular dependency.
+type cgroupFS interface {
+ // Returns the vfs.Filesystem for the cgroupfs.
+ VFSFilesystem() *vfs.Filesystem
+
+ // InitializeHierarchyID sets the hierarchy ID for this filesystem during
+ // filesystem creation. May only be called before the filesystem is visible
+ // to the vfs layer.
+ InitializeHierarchyID(hid uint32)
+}
+
// CgroupRegistry tracks the active set of cgroup controllers on the system.
//
// +stateify savable
@@ -182,31 +191,35 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files
// Register registers the provided set of controllers with the registry as a new
// hierarchy. If any controller is already registered, the function returns an
-// error without modifying the registry. The hierarchy can be later referenced
-// by the returned id.
-func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
+// error without modifying the registry. Register sets the hierarchy ID for the
+// filesystem on success.
+func (r *CgroupRegistry) Register(cs []CgroupController, fs cgroupFS) error {
r.mu.Lock()
defer r.mu.Unlock()
if len(cs) == 0 {
- return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers")
+ return fmt.Errorf("can't register hierarchy with no controllers")
}
for _, c := range cs {
if _, ok := r.controllers[c.Type()]; ok {
- return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy")
+ return fmt.Errorf("controllers may only be mounted on a single hierarchy")
}
}
hid, err := r.nextHierarchyID()
if err != nil {
- return hid, err
+ return err
}
+ // Must not fail below here, once we publish the hierarchy ID.
+
+ fs.InitializeHierarchyID(hid)
+
h := hierarchy{
id: hid,
controllers: make(map[CgroupControllerType]CgroupController),
- fs: cs[0].Filesystem(),
+ fs: fs.VFSFilesystem(),
}
for _, c := range cs {
n := c.Type()
@@ -214,7 +227,7 @@ func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
h.controllers[n] = c
}
r.hierarchies[hid] = h
- return hid, nil
+ return nil
}
// Unregister removes a previously registered hierarchy from the registry. If
@@ -253,6 +266,11 @@ func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[C
for name, ctl := range r.controllers {
if _, ok := ctlSet[name]; !ok {
cg := ctl.RootCgroup()
+ // Multiple controllers may share the same hierarchy, and so may have
+ // the same root cgroup. Grab a single ref per hierarchy root.
+ if _, ok := cgset[cg]; ok {
+ continue
+ }
cg.IncRef() // Ref transferred to caller.
cgset[cg] = struct{}{}
}
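The cgroupFS interface exists only to break the import cycle between the kernel and cgroupfs packages: the registry needs just two methods, so the concrete filesystem can hand itself to Register without the kernel package importing cgroupfs. A toy, self-contained analogue of this pattern (all names below are illustrative, not gvisor's):

    package main

    import "fmt"

    // The registry side depends only on a narrow interface.
    type registrable interface {
        InitializeHierarchyID(hid uint32)
    }

    type registry struct{ nextID uint32 }

    func (r *registry) Register(fs registrable) error {
        r.nextID++
        // Must not fail past this point; the ID is handed to the filesystem.
        fs.InitializeHierarchyID(r.nextID)
        return nil
    }

    // The filesystem side satisfies the interface with its concrete type.
    type filesystem struct{ hierarchyID uint32 }

    func (fs *filesystem) InitializeHierarchyID(hid uint32) { fs.hierarchyID = hid }

    func main() {
        r := &registry{}
        fs := &filesystem{}
        if err := r.Register(fs); err != nil {
            panic(err)
        }
        fmt.Println("hierarchy id:", fs.hierarchyID) // 1
    }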
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 10885688c..62777faa8 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -154,9 +154,11 @@ func (f *FDTable) drop(ctx context.Context, file *fs.File) {
// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
// Release any POSIX lock possibly held by the FDTable.
- err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF})
- if err != nil && err != syserror.ENOLCK {
- panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+ if file.SupportsLocks() {
+ err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF})
+ if err != nil && err != syserror.ENOLCK {
+ panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+ }
}
// Drop the table's reference.
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 36855e3ec..601fc0d3a 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -30,8 +30,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
-
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
@@ -284,7 +282,6 @@ func (*runSyscallExit) execute(t *Task) taskRunState {
// indicated by an execution fault at address addr. doVsyscall returns the
// task's next run state.
func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
- vsyscallCount.Increment()
metric.WeirdnessMetric.Increment("vsyscall_count")
// Grab the caller up front, to make sure there's a sensible stack.
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index ecb6603a1..4c65215fa 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -11,11 +11,12 @@ go_library(
"vdso.go",
"vdso_state.go",
],
+ marshal = True,
+ marshal_debug = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi",
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/cpuid",
"//pkg/hostarch",
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index e92d9fdc3..8fc3e2a79 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/hostarch"
@@ -47,10 +46,10 @@ const (
var (
// header64Size is the size of elf.Header64.
- header64Size = int(binary.Size(elf.Header64{}))
+ header64Size = (*linux.ElfHeader64)(nil).SizeBytes()
// Prog64Size is the size of elf.Prog64.
- prog64Size = int(binary.Size(elf.Prog64{}))
+ prog64Size = (*linux.ElfProg64)(nil).SizeBytes()
)
func progFlagsAsPerms(f elf.ProgFlag) hostarch.AccessType {
@@ -136,7 +135,6 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
log.Infof("Unsupported ELF endianness: %v", endian)
return elfInfo{}, syserror.ENOEXEC
}
- byteOrder := binary.LittleEndian
if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT {
log.Infof("Unsupported ELF version: %v", version)
@@ -145,7 +143,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
// EI_OSABI is ignored by Linux, which is the only OS supported.
os := abi.Linux
- var hdr elf.Header64
+ var hdr linux.ElfHeader64
hdrBuf := make([]byte, header64Size)
_, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0)
if err != nil {
@@ -156,7 +154,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
}
return elfInfo{}, err
}
- binary.Unmarshal(hdrBuf, byteOrder, &hdr)
+ hdr.UnmarshalUnsafe(hdrBuf)
// We support amd64 and arm64.
var a arch.Arch
@@ -213,8 +211,8 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
phdrs := make([]elf.ProgHeader, hdr.Phnum)
for i := range phdrs {
- var prog64 elf.Prog64
- binary.Unmarshal(phdrBuf[:prog64Size], byteOrder, &prog64)
+ var prog64 linux.ElfProg64
+ prog64.UnmarshalUnsafe(phdrBuf[:prog64Size])
phdrBuf = phdrBuf[prog64Size:]
phdrs[i] = elf.ProgHeader{
Type: elf.ProgType(prog64.Type),
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 99f036bba..1b5d5f66e 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -75,6 +75,9 @@ type machine struct {
// nextID is the next vCPU ID.
nextID uint32
+
+ // machineArchState is the architecture-specific state.
+ machineArchState
}
const (
@@ -196,12 +199,7 @@ func newMachine(vm int) (*machine, error) {
m.available.L = &m.mu
// Pull the maximum vCPUs.
- maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
- if errno != 0 {
- m.maxVCPUs = _KVM_NR_VCPUS
- } else {
- m.maxVCPUs = int(maxVCPUs)
- }
+ m.getMaxVCPU()
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
m.vCPUsByTID = make(map[uint64]*vCPU)
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
@@ -427,9 +425,8 @@ func (m *machine) Get() *vCPU {
}
}
- // Create a new vCPU (maybe).
- if int(m.nextID) < m.maxVCPUs {
- c := m.newVCPU()
+ // Get a new vCPU (maybe).
+ if c := m.getNewVCPU(); c != nil {
c.lock()
m.vCPUsByTID[tid] = c
m.mu.Unlock()
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index d7abfefb4..9a2337654 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -63,6 +63,9 @@ func (m *machine) initArchState() error {
return nil
}
+type machineArchState struct {
+}
+
type vCPUArchState struct {
// PCIDs is the set of PCIDs for this vCPU.
//
@@ -351,6 +354,10 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
// allocations occur.
entersyscall()
bluepill(c)
+ // The root table physical page must be mapped so that iret or sysret do
+ // not fault after switching into a user address space. sysret and iret
+ // themselves live in the upper half, which is global and already mapped.
+ switchOpts.PageTables.PrefaultRootTable()
prefaultFloatingPointState(switchOpts.FloatingPointState)
vector = c.CPU.SwitchToUser(switchOpts)
exitsyscall()
@@ -495,3 +502,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
physical)
}
}
+
+// getMaxVCPU sets m.maxVCPUs to the maximum vCPU count supported by KVM,
+// falling back to _KVM_NR_VCPUS if the capability query fails.
+func (m *machine) getMaxVCPU() {
+ maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ if errno != 0 {
+ m.maxVCPUs = _KVM_NR_VCPUS
+ } else {
+ m.maxVCPUs = int(maxVCPUs)
+ }
+}
+
+// getNewVCPU creates a new vCPU if the maximum vCPU count has not been
+// reached, and returns nil otherwise.
+func (m *machine) getNewVCPU() *vCPU {
+ if int(m.nextID) < m.maxVCPUs {
+ c := m.newVCPU()
+ return c
+ }
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index cd912f922..8926b1d9f 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -17,6 +17,10 @@
package kvm
import (
+ "runtime"
+ "sync/atomic"
+
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/ring0"
"gvisor.dev/gvisor/pkg/ring0/pagetables"
@@ -25,6 +29,11 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
)
+type machineArchState struct {
+ // initialvCPUs is the set of vCPUs that have been initialized but are
+ // not yet in use.
+ initialvCPUs map[int]*vCPU
+}
+
type vCPUArchState struct {
// PCIDs is the set of PCIDs for this vCPU.
//
@@ -182,3 +191,30 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType,
return accessType, platform.ErrContextSignal
}
+
+// getMaxVCPU sets m.maxVCPUs to the maximum vCPU count for this machine.
+func (m *machine) getMaxVCPU() {
+ rmaxVCPUs := runtime.NumCPU()
+ smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ // Compare the maximum vCPU counts reported by the Go runtime and the
+ // KVM capability, and use the smaller one.
+ if errno != 0 {
+ m.maxVCPUs = rmaxVCPUs
+ } else {
+ if rmaxVCPUs < int(smaxVCPUs) {
+ m.maxVCPUs = rmaxVCPUs
+ } else {
+ m.maxVCPUs = int(smaxVCPUs)
+ }
+ }
+}
+
+// getNewVCPU scans initialvCPUs for an available vCPU and returns nil if
+// none can be claimed.
+func (m *machine) getNewVCPU() *vCPU {
+ for CID, c := range m.initialvCPUs {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ delete(m.initialvCPUs, CID)
+ return c
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 634e55ec0..92edc992b 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
)
type kvmVcpuInit struct {
@@ -47,6 +48,19 @@ func (m *machine) initArchState() error {
uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
}
+
+ // Initialize all vCPUs up front on ARM64; this is not done on x86_64.
+ // The two architectures have different KVM timer mechanisms: if vCPUs are
+ // created dynamically on ARM64, the vCPU timer is briefly inconsistent.
+ // For more detail, see https://github.com/google/gvisor/issues/5739.
+ m.initialvCPUs = make(map[int]*vCPU)
+ m.mu.Lock()
+ for int(m.nextID) < m.maxVCPUs-1 {
+ c := m.newVCPU()
+ c.state = 0
+ m.initialvCPUs[c.id] = c
+ }
+ m.mu.Unlock()
return nil
}
@@ -174,9 +188,58 @@ func (c *vCPU) setTSC(value uint64) error {
return nil
}
+// getTSC reads the guest timer counter (the physical counter minus the
+// virtual offset).
+func (c *vCPU) getTSC() error {
+ var (
+ reg kvmOneReg
+ data uint64
+ )
+
+ reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+ reg.id = _KVM_ARM64_REGS_TIMER_CNT
+
+ if err := c.getOneRegister(&reg); err != nil {
+ return err
+ }
+
+ return nil
+}
+
// setSystemTime sets the vCPU to the system time.
func (c *vCPU) setSystemTime() error {
- return c.setSystemTimeLegacy()
+ const minIterations = 10
+ minimum := uint64(0)
+ for iter := 0; ; iter++ {
+ // Use getTSC to get an estimate of where the counter will be
+ // on the host during a "fast" system call iteration.
+ // It is unclear whether replacing getTSC with another setOneRegister
+ // syscall would yield a more accurate value.
+ start := uint64(ktime.Rdtsc())
+ if err := c.getTSC(); err != nil {
+ return err
+ }
+ // See if this is our new minimum call time. Note that this
+ // serves two functions: one, we make sure that we are
+ // accurately predicting the offset we need to set. Second, we
+ // don't want to do the final set on a slow call, which could
+ // produce a really bad result.
+ end := uint64(ktime.Rdtsc())
+ if end < start {
+ continue // Totally bogus: unstable TSC?
+ }
+ current := end - start
+ if current < minimum || iter == 0 {
+ minimum = current // Set our new minimum.
+ }
+ // Is this past minIterations and within ~12.5% (9/8) of the minimum?
+ upperThreshold := (((minimum << 3) + minimum) >> 3)
+ if iter >= minIterations && (current <= upperThreshold || minimum < 50) {
+ // Try to set the TSC
+ if err := c.setTSC(end + (minimum / 2)); err != nil {
+ return err
+ }
+ return nil
+ }
+ }
}
//go:nosplit
@@ -203,7 +266,7 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
uintptr(c.fd),
_KVM_GET_ONE_REG,
uintptr(unsafe.Pointer(reg))); errno != 0 {
- return fmt.Errorf("error setting one register: %v", errno)
+ return fmt.Errorf("error getting one register: %v", errno)
}
return nil
}
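In setSystemTime, ((minimum << 3) + minimum) >> 3 is integer arithmetic for 9*minimum/8, so the offset is only committed on an iteration whose getTSC round trip is within roughly 12.5% of the best one observed, or once that minimum is already under 50 cycles. A standalone illustration of the threshold math, using made-up cycle counts:

    package main

    import "fmt"

    func main() {
        // Same integer arithmetic as the loop above: threshold = 9*minimum/8.
        for _, minimum := range []uint64{40, 400, 4000} {
            upper := ((minimum << 3) + minimum) >> 3
            fmt.Printf("minimum=%d cycles -> upperThreshold=%d cycles\n", minimum, upper)
        }
    }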
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 080859125..7ee89a735 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -8,7 +8,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/hostarch",
"//pkg/marshal",
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 0e0e82365..2029e7cf4 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -14,9 +14,11 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
+ "//pkg/bits",
"//pkg/context",
"//pkg/hostarch",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 45a05cd63..235b9c306 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -18,9 +18,11 @@ package control
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -193,7 +195,7 @@ func putUint32(buf []byte, n uint32) []byte {
// putCmsg writes a control message header and as much data as will fit into
// the unused capacity of a buffer.
func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) {
- space := binary.AlignDown(cap(buf)-len(buf), 4)
+ space := bits.AlignDown(cap(buf)-len(buf), 4)
// We can't write to space that doesn't exist, so if we are going to align
// the available space, we must align down.
@@ -230,7 +232,7 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([
return alignSlice(buf, align), flags
}
-func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interface{}) []byte {
+func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data marshal.Marshallable) []byte {
if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader {
return buf
}
@@ -241,8 +243,7 @@ func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interf
buf = putUint32(buf, msgType)
hdrBuf := buf
-
- buf = binary.Marshal(buf, hostarch.ByteOrder, data)
+ buf = append(buf, marshal.Marshal(data)...)
// If the control message data brought us over capacity, omit it.
if cap(buf) != cap(ob) {
@@ -288,7 +289,7 @@ func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int
// alignSlice extends a slice's length (up to the capacity) to align it.
func alignSlice(buf []byte, align uint) []byte {
- aligned := binary.AlignUp(len(buf), align)
+ aligned := bits.AlignUp(len(buf), align)
if aligned > cap(buf) {
// Linux allows unaligned data if there isn't room for alignment.
// Since there isn't room for alignment, there isn't room for any
@@ -300,12 +301,13 @@ func alignSlice(buf []byte, align uint) []byte {
// PackTimestamp packs a SO_TIMESTAMP socket control message.
func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte {
+ timestampP := linux.NsecToTimeval(timestamp)
return putCmsgStruct(
buf,
linux.SOL_SOCKET,
linux.SO_TIMESTAMP,
t.Arch().Width(),
- linux.NsecToTimeval(timestamp),
+ &timestampP,
)
}
@@ -316,7 +318,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
linux.SOL_TCP,
linux.TCP_INQ,
t.Arch().Width(),
- inq,
+ primitive.AllocateInt32(inq),
)
}
@@ -327,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
linux.SOL_IP,
linux.IP_TOS,
t.Arch().Width(),
- tos,
+ primitive.AllocateUint8(tos),
)
}
@@ -338,7 +340,7 @@ func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
linux.SOL_IPV6,
linux.IPV6_TCLASS,
t.Arch().Width(),
- tClass,
+ primitive.AllocateUint32(tClass),
)
}
@@ -423,7 +425,7 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
// cmsgSpace is equivalent to CMSG_SPACE in Linux.
func cmsgSpace(t *kernel.Task, dataLen int) int {
- return linux.SizeOfControlMessageHeader + binary.AlignUp(dataLen, t.Arch().Width())
+ return linux.SizeOfControlMessageHeader + bits.AlignUp(dataLen, t.Arch().Width())
}
// CmsgsSpace returns the number of bytes needed to fit the control messages
@@ -475,7 +477,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
}
var h linux.ControlMessageHeader
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], hostarch.ByteOrder, &h)
+ h.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageHeader])
if h.Length < uint64(linux.SizeOfControlMessageHeader) {
return socket.ControlMessages{}, syserror.EINVAL
@@ -491,7 +493,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
case linux.SOL_SOCKET:
switch h.Type {
case linux.SCM_RIGHTS:
- rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
+ rightsSize := bits.AlignDown(length, linux.SizeOfControlMessageRight)
numRights := rightsSize / linux.SizeOfControlMessageRight
if len(fds)+numRights > linux.SCM_MAX_FD {
@@ -502,7 +504,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
fds = append(fds, int32(hostarch.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
}
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
case linux.SCM_CREDENTIALS:
if length < linux.SizeOfControlMessageCredentials {
@@ -510,23 +512,23 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
}
var creds linux.ControlMessageCredentials
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], hostarch.ByteOrder, &creds)
+ creds.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageCredentials])
scmCreds, err := NewSCMCredentials(t, creds)
if err != nil {
return socket.ControlMessages{}, err
}
cmsgs.Unix.Credentials = scmCreds
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
case linux.SO_TIMESTAMP:
if length < linux.SizeOfTimeval {
return socket.ControlMessages{}, syserror.EINVAL
}
var ts linux.Timeval
- binary.Unmarshal(buf[i:i+linux.SizeOfTimeval], hostarch.ByteOrder, &ts)
+ ts.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval])
cmsgs.IP.Timestamp = ts.ToNsecCapped()
cmsgs.IP.HasTimestamp = true
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
default:
// Unknown message type.
@@ -539,8 +541,10 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
return socket.ControlMessages{}, syserror.EINVAL
}
cmsgs.IP.HasTOS = true
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], hostarch.ByteOrder, &cmsgs.IP.TOS)
- i += binary.AlignUp(length, width)
+ var tos primitive.Uint8
+ tos.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageTOS])
+ cmsgs.IP.TOS = uint8(tos)
+ i += bits.AlignUp(length, width)
case linux.IP_PKTINFO:
if length < linux.SizeOfControlMessageIPPacketInfo {
@@ -549,19 +553,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
cmsgs.IP.HasIPPacketInfo = true
var packetInfo linux.ControlMessageIPPacketInfo
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], hostarch.ByteOrder, &packetInfo)
+ packetInfo.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageIPPacketInfo])
cmsgs.IP.PacketInfo = packetInfo
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
case linux.IP_RECVORIGDSTADDR:
var addr linux.SockAddrInet
if length < addr.SizeBytes() {
return socket.ControlMessages{}, syserror.EINVAL
}
- binary.Unmarshal(buf[i:i+addr.SizeBytes()], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()])
cmsgs.IP.OriginalDstAddress = &addr
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
case linux.IP_RECVERR:
var errCmsg linux.SockErrCMsgIPv4
@@ -571,7 +575,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()])
cmsgs.IP.SockErr = &errCmsg
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
default:
return socket.ControlMessages{}, syserror.EINVAL
@@ -583,17 +587,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
return socket.ControlMessages{}, syserror.EINVAL
}
cmsgs.IP.HasTClass = true
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], hostarch.ByteOrder, &cmsgs.IP.TClass)
- i += binary.AlignUp(length, width)
+ var tclass primitive.Uint32
+ tclass.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageTClass])
+ cmsgs.IP.TClass = uint32(tclass)
+ i += bits.AlignUp(length, width)
case linux.IPV6_RECVORIGDSTADDR:
var addr linux.SockAddrInet6
if length < addr.SizeBytes() {
return socket.ControlMessages{}, syserror.EINVAL
}
- binary.Unmarshal(buf[i:i+addr.SizeBytes()], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()])
cmsgs.IP.OriginalDstAddress = &addr
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
case linux.IPV6_RECVERR:
var errCmsg linux.SockErrCMsgIPv6
@@ -603,7 +609,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint)
errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()])
cmsgs.IP.SockErr = &errCmsg
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
default:
return socket.ControlMessages{}, syserror.EINVAL
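The bits helpers used throughout putCmsg and Parse are plain power-of-two alignment. A self-contained illustration of that rounding, with the helpers re-implemented inline and assuming amd64's 8-byte alignment width and Linux's 16-byte cmsghdr:

    package main

    import "fmt"

    const sizeOfControlMessageHeader = 16 // sizeof(struct cmsghdr) on 64-bit Linux.

    func alignUp(n, align int) int   { return (n + align - 1) &^ (align - 1) }
    func alignDown(n, align int) int { return n &^ (align - 1) }

    func main() {
        // putCmsg rounds the free buffer space down before writing.
        fmt.Println(alignDown(61, 4)) // 60
        // cmsgSpace rounds the payload up past the header, CMSG_SPACE-style.
        fmt.Println(sizeOfControlMessageHeader + alignUp(4, 8)) // 24
    }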
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index a5c2155a2..3c6511ead 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -17,7 +17,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/fdnotifier",
"//pkg/hostarch",
@@ -40,8 +39,6 @@ go_library(
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip",
- "//pkg/tcpip/network/ipv4",
- "//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 0d3b23643..52ae4bc9c 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -19,7 +19,6 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/hostarch"
@@ -529,7 +528,7 @@ func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) s
case linux.SO_TIMESTAMP:
controlMessages.IP.HasTimestamp = true
ts := linux.Timeval{}
- ts.UnmarshalBytes(unixCmsg.Data[:linux.SizeOfTimeval])
+ ts.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfTimeval])
controlMessages.IP.Timestamp = ts.ToNsecCapped()
}
@@ -537,17 +536,19 @@ func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) s
switch unixCmsg.Header.Type {
case linux.IP_TOS:
controlMessages.IP.HasTOS = true
- binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], hostarch.ByteOrder, &controlMessages.IP.TOS)
+ var tos primitive.Uint8
+ tos.UnmarshalUnsafe(unixCmsg.Data[:tos.SizeBytes()])
+ controlMessages.IP.TOS = uint8(tos)
case linux.IP_PKTINFO:
controlMessages.IP.HasIPPacketInfo = true
var packetInfo linux.ControlMessageIPPacketInfo
- binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], hostarch.ByteOrder, &packetInfo)
+ packetInfo.UnmarshalUnsafe(unixCmsg.Data[:packetInfo.SizeBytes()])
controlMessages.IP.PacketInfo = packetInfo
case linux.IP_RECVORIGDSTADDR:
var addr linux.SockAddrInet
- binary.Unmarshal(unixCmsg.Data[:addr.SizeBytes()], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(unixCmsg.Data[:addr.SizeBytes()])
controlMessages.IP.OriginalDstAddress = &addr
case unix.IP_RECVERR:
@@ -560,11 +561,13 @@ func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) s
switch unixCmsg.Header.Type {
case linux.IPV6_TCLASS:
controlMessages.IP.HasTClass = true
- binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], hostarch.ByteOrder, &controlMessages.IP.TClass)
+ var tclass primitive.Uint32
+ tclass.UnmarshalUnsafe(unixCmsg.Data[:tclass.SizeBytes()])
+ controlMessages.IP.TClass = uint32(tclass)
case linux.IPV6_RECVORIGDSTADDR:
var addr linux.SockAddrInet6
- binary.Unmarshal(unixCmsg.Data[:addr.SizeBytes()], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(unixCmsg.Data[:addr.SizeBytes()])
controlMessages.IP.OriginalDstAddress = &addr
case unix.IPV6_RECVERR:
@@ -577,7 +580,9 @@ func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) s
switch unixCmsg.Header.Type {
case linux.TCP_INQ:
controlMessages.IP.HasInq = true
- binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageInq], hostarch.ByteOrder, &controlMessages.IP.Inq)
+ var inq primitive.Int32
+ inq.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfControlMessageInq])
+ controlMessages.IP.Inq = int32(inq)
}
}
}
@@ -691,7 +696,7 @@ func (s *socketOpsCommon) State() uint32 {
return 0
}
- binary.Unmarshal(buf, hostarch.ByteOrder, &info)
+ info.UnmarshalUnsafe(buf[:info.SizeBytes()])
return uint32(info.State)
}
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 26e8ae17a..cbb1e905d 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -15,6 +15,7 @@
package hostinet
import (
+ "encoding/binary"
"fmt"
"io"
"io/ioutil"
@@ -26,16 +27,14 @@ import (
"syscall"
"golang.org/x/sys/unix"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
- "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -65,8 +64,6 @@ type Stack struct {
tcpSACKEnabled bool
netDevFile *os.File
netSNMPFile *os.File
- ipv4Forwarding bool
- ipv6Forwarding bool
}
// NewStack returns an empty Stack containing no configuration.
@@ -126,13 +123,6 @@ func (s *Stack) Configure() error {
s.netSNMPFile = f
}
- s.ipv6Forwarding = false
- if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil {
- s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0"
- } else {
- log.Warningf("Failed to read if ipv6 forwarding is enabled, setting to false")
- }
-
return nil
}
@@ -147,8 +137,8 @@ func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.Netli
if len(link.Data) < unix.SizeofIfInfomsg {
return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid data length (%d bytes, expected at least %d bytes)", len(link.Data), unix.SizeofIfInfomsg)
}
- var ifinfo unix.IfInfomsg
- binary.Unmarshal(link.Data[:unix.SizeofIfInfomsg], hostarch.ByteOrder, &ifinfo)
+ var ifinfo linux.InterfaceInfoMessage
+ ifinfo.UnmarshalUnsafe(link.Data[:ifinfo.SizeBytes()])
inetIF := inet.Interface{
DeviceType: ifinfo.Type,
Flags: ifinfo.Flags,
@@ -178,11 +168,11 @@ func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.Netli
if len(addr.Data) < unix.SizeofIfAddrmsg {
return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid data length (%d bytes, expected at least %d bytes)", len(addr.Data), unix.SizeofIfAddrmsg)
}
- var ifaddr unix.IfAddrmsg
- binary.Unmarshal(addr.Data[:unix.SizeofIfAddrmsg], hostarch.ByteOrder, &ifaddr)
+ var ifaddr linux.InterfaceAddrMessage
+ ifaddr.UnmarshalUnsafe(addr.Data[:ifaddr.SizeBytes()])
inetAddr := inet.InterfaceAddr{
Family: ifaddr.Family,
- PrefixLen: ifaddr.Prefixlen,
+ PrefixLen: ifaddr.PrefixLen,
Flags: ifaddr.Flags,
}
attrs, err := syscall.ParseNetlinkRouteAttr(&addr)
@@ -210,13 +200,13 @@ func ExtractHostRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error)
continue
}
- var ifRoute unix.RtMsg
- binary.Unmarshal(routeMsg.Data[:unix.SizeofRtMsg], hostarch.ByteOrder, &ifRoute)
+ var ifRoute linux.RouteMessage
+ ifRoute.UnmarshalUnsafe(routeMsg.Data[:ifRoute.SizeBytes()])
inetRoute := inet.Route{
Family: ifRoute.Family,
- DstLen: ifRoute.Dst_len,
- SrcLen: ifRoute.Src_len,
- TOS: ifRoute.Tos,
+ DstLen: ifRoute.DstLen,
+ SrcLen: ifRoute.SrcLen,
+ TOS: ifRoute.TOS,
Table: ifRoute.Table,
Protocol: ifRoute.Protocol,
Scope: ifRoute.Scope,
@@ -245,7 +235,9 @@ func ExtractHostRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error)
if len(attr.Value) != expected {
return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid attribute data length (%d bytes, expected %d bytes)", len(attr.Value), expected)
}
- binary.Unmarshal(attr.Value, hostarch.ByteOrder, &inetRoute.OutputInterface)
+ var outputIF primitive.Int32
+ outputIF.UnmarshalUnsafe(attr.Value)
+ inetRoute.OutputInterface = int32(outputIF)
}
}
@@ -489,19 +481,6 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
-// Forwarding implements inet.Stack.Forwarding.
-func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
- switch protocol {
- case ipv4.ProtocolNumber:
- return s.ipv4Forwarding
- case ipv6.ProtocolNumber:
- return s.ipv6Forwarding
- default:
- log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol)
- return false
- }
-}
-
// SetForwarding implements inet.Stack.SetForwarding.
func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error {
return syserror.EACCES
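The route attribute parsed into OutputInterface above is a host-endian 32-bit interface index, so the primitive.Int32 unmarshal is equivalent to decoding those four bytes directly. A standalone sketch with encoding/binary, assuming little-endian as on the supported architectures:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        attr := []byte{3, 0, 0, 0} // Four-byte interface index from the kernel.
        oif := int32(binary.LittleEndian.Uint32(attr))
        fmt.Println("output interface:", oif) // 3
    }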
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 4381dfa06..61b2c9755 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -14,14 +14,16 @@ go_library(
"tcp_matcher.go",
"udp_matcher.go",
],
+ marshal = True,
# This target depends on netstack and should only be used by epsocket,
# which is allowed to depend on netstack.
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
+ "//pkg/bits",
"//pkg/hostarch",
"//pkg/log",
+ "//pkg/marshal",
"//pkg/sentry/kernel",
"//pkg/syserr",
"//pkg/tcpip",
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 4bd305a44..6fc7781ad 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -18,8 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -79,7 +78,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
nflog("marshaling matcher %q", name)
// We have to pad this struct size to a multiple of 8 bytes.
- size := binary.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+ size := bits.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8)
matcher := linux.KernelXTEntryMatch{
XTEntryMatch: linux.XTEntryMatch{
MatchSize: uint16(size),
@@ -88,9 +87,11 @@ func marshalEntryMatch(name string, data []byte) []byte {
}
copy(matcher.Name[:], name)
- buf := make([]byte, 0, size)
- buf = binary.Marshal(buf, hostarch.ByteOrder, matcher)
- return append(buf, make([]byte, size-len(buf))...)
+ buf := make([]byte, size)
+ entryLen := matcher.XTEntryMatch.SizeBytes()
+ matcher.XTEntryMatch.MarshalUnsafe(buf[:entryLen])
+ copy(buf[entryLen:], matcher.Data)
+ return buf
}
func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) {
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
index 1fc4cb651..cb78ef60b 100644
--- a/pkg/sentry/socket/netfilter/ipv4.go
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -18,8 +18,6 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -141,10 +139,9 @@ func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace,
return nil, syserr.ErrInvalidArgument
}
var entry linux.IPTEntry
- buf := optVal[:linux.SizeOfIPTEntry]
- binary.Unmarshal(buf, hostarch.ByteOrder, &entry)
+ entry.UnmarshalUnsafe(optVal[:entry.SizeBytes()])
initialOptValLen := len(optVal)
- optVal = optVal[linux.SizeOfIPTEntry:]
+ optVal = optVal[entry.SizeBytes():]
if entry.TargetOffset < linux.SizeOfIPTEntry {
nflog("entry has too-small target offset %d", entry.TargetOffset)
diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go
index 67a52b628..5cb7fe4aa 100644
--- a/pkg/sentry/socket/netfilter/ipv6.go
+++ b/pkg/sentry/socket/netfilter/ipv6.go
@@ -18,8 +18,6 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -144,10 +142,9 @@ func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace,
return nil, syserr.ErrInvalidArgument
}
var entry linux.IP6TEntry
- buf := optVal[:linux.SizeOfIP6TEntry]
- binary.Unmarshal(buf, hostarch.ByteOrder, &entry)
+ entry.UnmarshalUnsafe(optVal[:entry.SizeBytes()])
initialOptValLen := len(optVal)
- optVal = optVal[linux.SizeOfIP6TEntry:]
+ optVal = optVal[entry.SizeBytes():]
if entry.TargetOffset < linux.SizeOfIP6TEntry {
nflog("entry has too-small target offset %d", entry.TargetOffset)
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index c6fa3fd16..f42d73178 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -22,7 +22,6 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -121,7 +120,7 @@ func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, outLe
nflog("couldn't read entries: %v", err)
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
}
- if binary.Size(entries) > uintptr(outLen) {
+ if entries.SizeBytes() > outLen {
nflog("insufficient GetEntries output size: %d", uintptr(outLen))
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
}
@@ -146,7 +145,7 @@ func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, outLe
nflog("couldn't read entries: %v", err)
return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
}
- if binary.Size(entries) > uintptr(outLen) {
+ if entries.SizeBytes() > outLen {
nflog("insufficient GetEntries output size: %d", uintptr(outLen))
return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
}
@@ -179,7 +178,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
var replace linux.IPTReplace
replaceBuf := optVal[:linux.SizeOfIPTReplace]
optVal = optVal[linux.SizeOfIPTReplace:]
- binary.Unmarshal(replaceBuf, hostarch.ByteOrder, &replace)
+ replace.UnmarshalBytes(replaceBuf)
// TODO(gvisor.dev/issue/170): Support other tables.
var table stack.Table
@@ -309,8 +308,8 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal))
}
var match linux.XTEntryMatch
- buf := optVal[:linux.SizeOfXTEntryMatch]
- binary.Unmarshal(buf, hostarch.ByteOrder, &match)
+ buf := optVal[:match.SizeBytes()]
+ match.UnmarshalUnsafe(buf)
nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match)
// Check some invariants.
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
index b2cc6be20..60845cab3 100644
--- a/pkg/sentry/socket/netfilter/owner_matcher.go
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -18,8 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -59,8 +58,8 @@ func (ownerMarshaler) marshal(mr matcher) []byte {
}
}
- buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo)
- return marshalEntryMatch(matcherNameOwner, binary.Marshal(buf, hostarch.ByteOrder, iptOwnerInfo))
+ buf := marshal.Marshal(&iptOwnerInfo)
+ return marshalEntryMatch(matcherNameOwner, buf)
}
// unmarshal implements matchMaker.unmarshal.
@@ -72,7 +71,7 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.
// For alignment reasons, the match's total size may
// exceed what's strictly necessary to hold matchData.
var matchData linux.IPTOwnerInfo
- binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], hostarch.ByteOrder, &matchData)
+ matchData.UnmarshalUnsafe(buf[:linux.SizeOfIPTOwnerInfo])
nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData)
var owner OwnerMatcher
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 4ae1592b2..fa5456eee 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,11 +15,12 @@
package netfilter
import (
+ "encoding/binary"
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -189,8 +190,7 @@ func (*standardTargetMaker) marshal(target target) []byte {
Verdict: verdict,
}
- ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, hostarch.ByteOrder, xt)
+ return marshal.Marshal(&xt)
}
func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
@@ -199,8 +199,7 @@ func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (
return nil, syserr.ErrInvalidArgument
}
var standardTarget linux.XTStandardTarget
- buf = buf[:linux.SizeOfXTStandardTarget]
- binary.Unmarshal(buf, hostarch.ByteOrder, &standardTarget)
+ standardTarget.UnmarshalUnsafe(buf[:standardTarget.SizeBytes()])
if standardTarget.Verdict < 0 {
// A Verdict < 0 indicates a non-jump verdict.
@@ -245,8 +244,7 @@ func (*errorTargetMaker) marshal(target target) []byte {
copy(xt.Name[:], errorName)
copy(xt.Target.Name[:], ErrorTargetName)
- ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
- return binary.Marshal(ret, hostarch.ByteOrder, xt)
+ return marshal.Marshal(&xt)
}
func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
@@ -256,7 +254,7 @@ func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (tar
}
var errTgt linux.XTErrorTarget
buf = buf[:linux.SizeOfXTErrorTarget]
- binary.Unmarshal(buf, hostarch.ByteOrder, &errTgt)
+ errTgt.UnmarshalUnsafe(buf)
// Error targets are used in 2 cases:
// * An actual error case. These rules have an error named
@@ -299,12 +297,11 @@ func (*redirectTargetMaker) marshal(target target) []byte {
}
copy(xt.Target.Name[:], RedirectTargetName)
- ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
xt.NfRange.RangeSize = 1
xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
xt.NfRange.RangeIPV4.MinPort = htons(rt.Port)
xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort
- return binary.Marshal(ret, hostarch.ByteOrder, xt)
+ return marshal.Marshal(&xt)
}
func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
@@ -320,7 +317,7 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (
var rt linux.XTRedirectTarget
buf = buf[:linux.SizeOfXTRedirectTarget]
- binary.Unmarshal(buf, hostarch.ByteOrder, &rt)
+ rt.UnmarshalUnsafe(buf)
// Copy linux.XTRedirectTarget to stack.RedirectTarget.
target := redirectTarget{RedirectTarget: stack.RedirectTarget{
@@ -359,6 +356,7 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (
return &target, nil
}
+// +marshal
type nfNATTarget struct {
Target linux.XTEntryTarget
Range linux.NFNATRange
@@ -394,8 +392,7 @@ func (*nfNATTargetMaker) marshal(target target) []byte {
nt.Range.MinProto = htons(rt.Port)
nt.Range.MaxProto = nt.Range.MinProto
- ret := make([]byte, 0, nfNATMarshalledSize)
- return binary.Marshal(ret, hostarch.ByteOrder, nt)
+ return marshal.Marshal(&nt)
}
func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
@@ -411,7 +408,7 @@ func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (tar
var natRange linux.NFNATRange
buf = buf[linux.SizeOfXTEntryTarget:nfNATMarshalledSize]
- binary.Unmarshal(buf, hostarch.ByteOrder, &natRange)
+ natRange.UnmarshalUnsafe(buf)
// We don't support port or address ranges.
if natRange.MinAddr != natRange.MaxAddr {
@@ -468,8 +465,7 @@ func (*snatTargetMakerV4) marshal(target target) []byte {
xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort
copy(xt.NfRange.RangeIPV4.MinIP[:], st.Addr)
copy(xt.NfRange.RangeIPV4.MaxIP[:], st.Addr)
- ret := make([]byte, 0, linux.SizeOfXTSNATTarget)
- return binary.Marshal(ret, hostarch.ByteOrder, xt)
+ return marshal.Marshal(&xt)
}
func (*snatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
@@ -485,7 +481,7 @@ func (*snatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (ta
var st linux.XTSNATTarget
buf = buf[:linux.SizeOfXTSNATTarget]
- binary.Unmarshal(buf, hostarch.ByteOrder, &st)
+ st.UnmarshalUnsafe(buf)
// Copy linux.XTSNATTarget to stack.SNATTarget.
target := snatTarget{SNATTarget: stack.SNATTarget{
@@ -550,8 +546,7 @@ func (*snatTargetMakerV6) marshal(target target) []byte {
nt.Range.MinProto = htons(st.Port)
nt.Range.MaxProto = nt.Range.MinProto
- ret := make([]byte, 0, nfNATMarshalledSize)
- return binary.Marshal(ret, hostarch.ByteOrder, nt)
+ return marshal.Marshal(&nt)
}
func (*snatTargetMakerV6) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
@@ -567,9 +562,9 @@ func (*snatTargetMakerV6) unmarshal(buf []byte, filter stack.IPHeaderFilter) (ta
var natRange linux.NFNATRange
buf = buf[linux.SizeOfXTEntryTarget:nfNATMarshalledSize]
- binary.Unmarshal(buf, hostarch.ByteOrder, &natRange)
+ natRange.UnmarshalUnsafe(buf)
- // TODO(gvisor.dev/issue/5689): Support port or address ranges.
+ // TODO(gvisor.dev/issue/5697): Support port or address ranges.
if natRange.MinAddr != natRange.MaxAddr {
nflog("snatTargetMakerV6: MinAddr and MaxAddr are different")
return nil, syserr.ErrInvalidArgument
@@ -631,8 +626,7 @@ func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.T
return nil, syserr.ErrInvalidArgument
}
var target linux.XTEntryTarget
- buf := optVal[:linux.SizeOfXTEntryTarget]
- binary.Unmarshal(buf, hostarch.ByteOrder, &target)
+ target.UnmarshalUnsafe(optVal[:target.SizeBytes()])
return unmarshalTarget(target, filter, optVal)
}
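Illustrative sketch (not part of this diff) of the binary -> marshal migration pattern applied throughout the netfilter target/matcher makers above: a fixed-size struct annotated with "// +marshal" gets SizeBytes/MarshalUnsafe/UnmarshalUnsafe generated for it, and call sites drop binary.Marshal/binary.Unmarshal. The hand-written methods and names below are hypothetical stand-ins for the generated code, which copies the struct's memory directly; the byte order here is purely illustrative.

package main

import (
	"encoding/binary"
	"fmt"
)

// fakeTarget mimics a fixed-size netfilter target struct.
type fakeTarget struct {
	Port uint16
	Flag uint32
}

// SizeBytes returns the serialized size, replacing binary.Size().
func (*fakeTarget) SizeBytes() int { return 6 }

// MarshalUnsafe serializes t into dst, replacing binary.Marshal().
func (t *fakeTarget) MarshalUnsafe(dst []byte) {
	binary.LittleEndian.PutUint16(dst[0:], t.Port)
	binary.LittleEndian.PutUint32(dst[2:], t.Flag)
}

// UnmarshalUnsafe deserializes t from src, replacing binary.Unmarshal().
func (t *fakeTarget) UnmarshalUnsafe(src []byte) {
	t.Port = binary.LittleEndian.Uint16(src[0:])
	t.Flag = binary.LittleEndian.Uint32(src[2:])
}

func main() {
	in := fakeTarget{Port: 8080, Flag: 1}
	buf := make([]byte, in.SizeBytes())
	in.MarshalUnsafe(buf)

	var out fakeTarget
	out.UnmarshalUnsafe(buf[:out.SizeBytes()])
	fmt.Printf("%+v\n", out) // round-trips the original value
}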
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 69557f515..95bb9826e 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -18,8 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -47,8 +46,7 @@ func (tcpMarshaler) marshal(mr matcher) []byte {
DestinationPortStart: matcher.destinationPortStart,
DestinationPortEnd: matcher.destinationPortEnd,
}
- buf := make([]byte, 0, linux.SizeOfXTTCP)
- return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, hostarch.ByteOrder, xttcp))
+ return marshalEntryMatch(matcherNameTCP, marshal.Marshal(&xttcp))
}
// unmarshal implements matchMaker.unmarshal.
@@ -60,7 +58,7 @@ func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma
// For alignment reasons, the match's total size may
// exceed what's strictly necessary to hold matchData.
var matchData linux.XTTCP
- binary.Unmarshal(buf[:linux.SizeOfXTTCP], hostarch.ByteOrder, &matchData)
+ matchData.UnmarshalUnsafe(buf[:matchData.SizeBytes()])
nflog("parseMatchers: parsed XTTCP: %+v", matchData)
if matchData.Option != 0 ||
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 6a60e6bd6..fb8be27e6 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -18,8 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -47,8 +46,7 @@ func (udpMarshaler) marshal(mr matcher) []byte {
DestinationPortStart: matcher.destinationPortStart,
DestinationPortEnd: matcher.destinationPortEnd,
}
- buf := make([]byte, 0, linux.SizeOfXTUDP)
- return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, hostarch.ByteOrder, xtudp))
+ return marshalEntryMatch(matcherNameUDP, marshal.Marshal(&xtudp))
}
// unmarshal implements matchMaker.unmarshal.
@@ -60,7 +58,7 @@ func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma
// For alignment reasons, the match's total size may exceed what's
// strictly necessary to hold matchData.
var matchData linux.XTUDP
- binary.Unmarshal(buf[:linux.SizeOfXTUDP], hostarch.ByteOrder, &matchData)
+ matchData.UnmarshalUnsafe(buf[:matchData.SizeBytes()])
nflog("parseMatchers: parsed XTUDP: %+v", matchData)
if matchData.InverseFlags != 0 {
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 171b95c63..64cd263da 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -14,7 +14,7 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
+ "//pkg/bits",
"//pkg/context",
"//pkg/hostarch",
"//pkg/marshal",
@@ -50,5 +50,7 @@ go_test(
deps = [
":netlink",
"//pkg/abi/linux",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
],
)
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index ab0e68af7..80385bfdc 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -19,15 +19,17 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
)
// alignPad returns the length of padding required for alignment.
//
// Preconditions: align is a power of two.
func alignPad(length int, align uint) int {
- return binary.AlignUp(length, align) - length
+ return bits.AlignUp(length, align) - length
}
// Message contains a complete serialized netlink message.
@@ -42,7 +44,7 @@ type Message struct {
func NewMessage(hdr linux.NetlinkMessageHeader) *Message {
return &Message{
hdr: hdr,
- buf: binary.Marshal(nil, hostarch.ByteOrder, hdr),
+ buf: marshal.Marshal(&hdr),
}
}
@@ -58,7 +60,7 @@ func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) {
return
}
var hdr linux.NetlinkMessageHeader
- binary.Unmarshal(hdrBytes, hostarch.ByteOrder, &hdr)
+ hdr.UnmarshalUnsafe(hdrBytes)
// Msg portion.
totalMsgLen := int(hdr.Length)
@@ -92,7 +94,7 @@ func (m *Message) Header() linux.NetlinkMessageHeader {
// GetData unmarshals the payload message header from this netlink message, and
// returns the attributes portion.
-func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
+func (m *Message) GetData(msg marshal.Marshallable) (AttrsView, bool) {
b := BytesView(m.buf)
_, ok := b.Extract(linux.NetlinkMessageHeaderSize)
@@ -100,12 +102,12 @@ func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
return nil, false
}
- size := int(binary.Size(msg))
+ size := msg.SizeBytes()
msgBytes, ok := b.Extract(size)
if !ok {
return nil, false
}
- binary.Unmarshal(msgBytes, hostarch.ByteOrder, msg)
+ msg.UnmarshalUnsafe(msgBytes)
numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO)
// Linux permits the last message not being aligned, just consume all of it.
@@ -131,7 +133,7 @@ func (m *Message) Finalize() []byte {
// Align the message. Note that the message length in the header (set
// above) is the useful length of the message, not the total aligned
// length. See net/netlink/af_netlink.c:__nlmsg_put.
- aligned := binary.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO)
+ aligned := bits.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO)
m.putZeros(aligned - len(m.buf))
return m.buf
}
@@ -145,45 +147,45 @@ func (m *Message) putZeros(n int) {
}
// Put serializes v into the message.
-func (m *Message) Put(v interface{}) {
- m.buf = binary.Marshal(m.buf, hostarch.ByteOrder, v)
+func (m *Message) Put(v marshal.Marshallable) {
+ m.buf = append(m.buf, marshal.Marshal(v)...)
}
// PutAttr adds v to the message as a netlink attribute.
//
// Preconditions: The serialized attribute (linux.NetlinkAttrHeaderSize +
-// binary.Size(v) fits in math.MaxUint16 bytes.
-func (m *Message) PutAttr(atype uint16, v interface{}) {
- l := linux.NetlinkAttrHeaderSize + int(binary.Size(v))
+// v.SizeBytes()) fits in math.MaxUint16 bytes.
+func (m *Message) PutAttr(atype uint16, v marshal.Marshallable) {
+ l := linux.NetlinkAttrHeaderSize + v.SizeBytes()
if l > math.MaxUint16 {
panic(fmt.Sprintf("attribute too large: %d", l))
}
- m.Put(linux.NetlinkAttrHeader{
+ m.Put(&linux.NetlinkAttrHeader{
Type: atype,
Length: uint16(l),
})
m.Put(v)
// Align the attribute.
- aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
+ aligned := bits.AlignUp(l, linux.NLA_ALIGNTO)
m.putZeros(aligned - l)
}
// PutAttrString adds s to the message as a netlink attribute.
func (m *Message) PutAttrString(atype uint16, s string) {
l := linux.NetlinkAttrHeaderSize + len(s) + 1
- m.Put(linux.NetlinkAttrHeader{
+ m.Put(&linux.NetlinkAttrHeader{
Type: atype,
Length: uint16(l),
})
// String + NUL-termination.
- m.Put([]byte(s))
+ m.Put(primitive.AsByteSlice([]byte(s)))
m.putZeros(1)
// Align the attribute.
- aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
+ aligned := bits.AlignUp(l, linux.NLA_ALIGNTO)
m.putZeros(aligned - l)
}
@@ -251,7 +253,7 @@ func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest
if !ok {
return
}
- binary.Unmarshal(hdrBytes, hostarch.ByteOrder, &hdr)
+ hdr.UnmarshalUnsafe(hdrBytes)
value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize)
if !ok {
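Sketch (illustrative only, not from this change) of how callers now build a netlink message: Put and PutAttr take marshal.Marshallable instead of interface{}, so fixed-size headers are passed by pointer and scalar or byte-slice payloads are wrapped in marshal/primitive types, as route/protocol.go does below.

package example

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
)

// buildLinkMessage is a hypothetical helper showing the new call shapes.
func buildLinkMessage(idx int32, name string, mtu uint32) []byte {
	m := netlink.NewMessage(linux.NetlinkMessageHeader{Type: linux.RTM_NEWLINK})

	// Fixed-size ABI structs are Marshallable and are passed by pointer.
	m.Put(&linux.InterfaceInfoMessage{Family: linux.AF_UNSPEC, Index: idx})

	// Strings and scalars use the primitive helpers.
	m.PutAttrString(linux.IFLA_IFNAME, name)
	m.PutAttr(linux.IFLA_MTU, primitive.AllocateUint32(mtu))

	// Finalize patches the header length and pads to NLMSG_ALIGNTO.
	return m.Finalize()
}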
diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go
index ef13d9386..968968469 100644
--- a/pkg/sentry/socket/netlink/message_test.go
+++ b/pkg/sentry/socket/netlink/message_test.go
@@ -20,13 +20,31 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
)
type dummyNetlinkMsg struct {
+ marshal.StubMarshallable
Foo uint16
}
+func (*dummyNetlinkMsg) SizeBytes() int {
+ return 2
+}
+
+func (m *dummyNetlinkMsg) MarshalUnsafe(dst []byte) {
+ p := primitive.Uint16(m.Foo)
+ p.MarshalUnsafe(dst)
+}
+
+func (m *dummyNetlinkMsg) UnmarshalUnsafe(src []byte) {
+ var p primitive.Uint16
+ p.UnmarshalUnsafe(src)
+ m.Foo = uint16(p)
+}
+
func TestParseMessage(t *testing.T) {
tests := []struct {
desc string
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 744fc74f4..c6c04b4e3 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -11,6 +11,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 5a2255db3..86f6419dc 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -21,6 +21,7 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -167,7 +168,7 @@ func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
Type: linux.RTM_NEWLINK,
})
- m.Put(linux.InterfaceInfoMessage{
+ m.Put(&linux.InterfaceInfoMessage{
Family: linux.AF_UNSPEC,
Type: i.DeviceType,
Index: idx,
@@ -175,7 +176,7 @@ func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
})
m.PutAttrString(linux.IFLA_IFNAME, i.Name)
- m.PutAttr(linux.IFLA_MTU, i.MTU)
+ m.PutAttr(linux.IFLA_MTU, primitive.AllocateUint32(i.MTU))
mac := make([]byte, 6)
brd := mac
@@ -183,8 +184,8 @@ func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
mac = i.Addr
brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
}
- m.PutAttr(linux.IFLA_ADDRESS, mac)
- m.PutAttr(linux.IFLA_BROADCAST, brd)
+ m.PutAttr(linux.IFLA_ADDRESS, primitive.AsByteSlice(mac))
+ m.PutAttr(linux.IFLA_BROADCAST, primitive.AsByteSlice(brd))
// TODO(gvisor.dev/issue/578): There are many more attributes.
}
@@ -216,14 +217,15 @@ func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netl
Type: linux.RTM_NEWADDR,
})
- m.Put(linux.InterfaceAddrMessage{
+ m.Put(&linux.InterfaceAddrMessage{
Family: a.Family,
PrefixLen: a.PrefixLen,
Index: uint32(id),
})
- m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr))
- m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
+ addr := primitive.ByteSlice([]byte(a.Addr))
+ m.PutAttr(linux.IFA_LOCAL, &addr)
+ m.PutAttr(linux.IFA_ADDRESS, &addr)
// TODO(gvisor.dev/issue/578): There are many more attributes.
}
@@ -366,7 +368,7 @@ func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *net
Type: linux.RTM_NEWROUTE,
})
- m.Put(linux.RouteMessage{
+ m.Put(&linux.RouteMessage{
Family: rt.Family,
DstLen: rt.DstLen,
SrcLen: rt.SrcLen,
@@ -382,18 +384,18 @@ func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *net
Flags: rt.Flags,
})
- m.PutAttr(254, []byte{123})
+ m.PutAttr(254, primitive.AsByteSlice([]byte{123}))
if rt.DstLen > 0 {
- m.PutAttr(linux.RTA_DST, rt.DstAddr)
+ m.PutAttr(linux.RTA_DST, primitive.AsByteSlice(rt.DstAddr))
}
if rt.SrcLen > 0 {
- m.PutAttr(linux.RTA_SRC, rt.SrcAddr)
+ m.PutAttr(linux.RTA_SRC, primitive.AsByteSlice(rt.SrcAddr))
}
if rt.OutputInterface != 0 {
- m.PutAttr(linux.RTA_OIF, rt.OutputInterface)
+ m.PutAttr(linux.RTA_OIF, primitive.AllocateInt32(rt.OutputInterface))
}
if len(rt.GatewayAddr) > 0 {
- m.PutAttr(linux.RTA_GATEWAY, rt.GatewayAddr)
+ m.PutAttr(linux.RTA_GATEWAY, primitive.AsByteSlice(rt.GatewayAddr))
}
// TODO(gvisor.dev/issue/578): There are many more attributes.
@@ -503,7 +505,7 @@ func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms
hdr := msg.Header()
// All messages start with a 1 byte protocol family.
- var family uint8
+ var family primitive.Uint8
if _, ok := msg.GetData(&family); !ok {
// Linux ignores messages missing the protocol family. See
// net/core/rtnetlink.c:rtnetlink_rcv_msg.

diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 30c297149..280563d09 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -20,7 +20,6 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal"
@@ -223,7 +222,7 @@ func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) {
}
var sa linux.SockAddrNetlink
- binary.Unmarshal(b[:linux.SockAddrNetlinkSize], hostarch.ByteOrder, &sa)
+ sa.UnmarshalUnsafe(b[:sa.SizeBytes()])
if sa.Family != linux.AF_NETLINK {
return nil, syserr.ErrInvalidArgument
@@ -338,16 +337,14 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
}
s.mu.Lock()
defer s.mu.Unlock()
- sendBufferSizeP := primitive.Int32(s.sendBufferSize)
- return &sendBufferSizeP, nil
+ return primitive.AllocateInt32(int32(s.sendBufferSize)), nil
case linux.SO_RCVBUF:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
// We don't have limit on receiving size.
- recvBufferSizeP := primitive.Int32(math.MaxInt32)
- return &recvBufferSizeP, nil
+ return primitive.AllocateInt32(math.MaxInt32), nil
case linux.SO_PASSCRED:
if outLen < sizeOfInt32 {
@@ -484,7 +481,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *
Family: linux.AF_NETLINK,
PortID: uint32(s.portID),
}
- return sa, uint32(binary.Size(sa)), nil
+ return sa, uint32(sa.SizeBytes()), nil
}
// GetPeerName implements socket.Socket.GetPeerName.
@@ -495,7 +492,7 @@ func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *
// must be the kernel.
PortID: 0,
}
- return sa, uint32(binary.Size(sa)), nil
+ return sa, uint32(sa.SizeBytes()), nil
}
// RecvMsg implements socket.Socket.RecvMsg.
@@ -504,7 +501,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
Family: linux.AF_NETLINK,
PortID: 0,
}
- fromLen := uint32(binary.Size(from))
+ fromLen := uint32(from.SizeBytes())
trunc := flags&linux.MSG_TRUNC != 0
@@ -640,7 +637,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys
})
// Add the dump_done_errno payload.
- m.Put(int64(0))
+ m.Put(primitive.AllocateInt64(0))
_, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
if err != nil && err != syserr.ErrWouldBlock {
@@ -658,8 +655,8 @@ func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr
m := ms.AddMessage(linux.NetlinkMessageHeader{
Type: linux.NLMSG_ERROR,
})
- m.Put(linux.NetlinkErrorMessage{
- Error: int32(-err.ToLinux().Number()),
+ m.Put(&linux.NetlinkErrorMessage{
+ Error: int32(-err.ToLinux()),
Header: hdr,
})
}
@@ -668,7 +665,7 @@ func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
m := ms.AddMessage(linux.NetlinkMessageHeader{
Type: linux.NLMSG_ERROR,
})
- m.Put(linux.NetlinkErrorMessage{
+ m.Put(&linux.NetlinkErrorMessage{
Error: 0,
Header: hdr,
})
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 0b39a5b67..9561b7c25 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -19,7 +19,6 @@ go_library(
],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/hostarch",
"//pkg/log",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 312f5f85a..0b64a24c3 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -26,6 +26,7 @@ package netstack
import (
"bytes"
+ "encoding/binary"
"fmt"
"io"
"io/ioutil"
@@ -35,7 +36,6 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
@@ -199,6 +199,15 @@ var Metrics = tcpip.Stats{
OptionRecordRouteReceived: mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."),
OptionRouterAlertReceived: mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."),
OptionUnknownReceived: mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."),
+ Forwarding: tcpip.IPForwardingStats{
+ Unrouteable: mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."),
+ ExhaustedTTL: mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."),
+ LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."),
+ LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."),
+ ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."),
+ PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not fit within the outgoing MTU."),
+ Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."),
+ },
},
ARP: tcpip.ARPStats{
PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
@@ -375,9 +384,9 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue
}), nil
}
-var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{}))
-var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{}))
-var sockAddrLinkSize = int(binary.Size(linux.SockAddrLink{}))
+var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes()
+var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes()
+var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes()
// bytesToIPAddress converts an IPv4 or IPv6 address from the user to the
// netstack representation taking any addresses into account.
@@ -613,7 +622,7 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if len(sockaddr) < sockAddrLinkSize {
return syserr.ErrInvalidArgument
}
- binary.Unmarshal(sockaddr[:sockAddrLinkSize], hostarch.ByteOrder, &a)
+ a.UnmarshalBytes(sockaddr[:sockAddrLinkSize])
if a.Protocol != uint16(s.protocol) {
return syserr.ErrInvalidArgument
@@ -843,7 +852,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return &optP, nil
}
- optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux().Number())
+ optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux())
return &optP, nil
case linux.SO_PEERCRED:
@@ -1312,7 +1321,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
return &v, nil
case linux.IP6T_ORIGINAL_DST:
- if outLen < int(binary.Size(linux.SockAddrInet6{})) {
+ if outLen < sockAddrInet6Size {
return nil, syserr.ErrInvalidArgument
}
@@ -1509,7 +1518,7 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
return &v, nil
case linux.SO_ORIGINAL_DST:
- if outLen < int(binary.Size(linux.SockAddrInet{})) {
+ if outLen < sockAddrInetSize {
return nil, syserr.ErrInvalidArgument
}
@@ -1742,7 +1751,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
}
var v linux.Timeval
- binary.Unmarshal(optVal[:linux.SizeOfTimeval], hostarch.ByteOrder, &v)
+ v.UnmarshalBytes(optVal[:linux.SizeOfTimeval])
if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
return syserr.ErrDomain
}
@@ -1755,7 +1764,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
}
var v linux.Timeval
- binary.Unmarshal(optVal[:linux.SizeOfTimeval], hostarch.ByteOrder, &v)
+ v.UnmarshalBytes(optVal[:linux.SizeOfTimeval])
if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
return syserr.ErrDomain
}
@@ -1791,7 +1800,11 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
}
var v linux.Linger
- binary.Unmarshal(optVal[:linux.SizeOfLinger], hostarch.ByteOrder, &v)
+ v.UnmarshalBytes(optVal[:linux.SizeOfLinger])
+
+ if v != (linux.Linger{}) {
+ socket.SetSockOptEmitUnimplementedEvent(t, name)
+ }
ep.SocketOptions().SetLinger(tcpip.LingerOption{
Enabled: v.OnOff != 0,
@@ -2090,9 +2103,9 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
}
var (
- inetMulticastRequestSize = int(binary.Size(linux.InetMulticastRequest{}))
- inetMulticastRequestWithNICSize = int(binary.Size(linux.InetMulticastRequestWithNIC{}))
- inet6MulticastRequestSize = int(binary.Size(linux.Inet6MulticastRequest{}))
+ inetMulticastRequestSize = (*linux.InetMulticastRequest)(nil).SizeBytes()
+ inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes()
+ inet6MulticastRequestSize = (*linux.Inet6MulticastRequest)(nil).SizeBytes()
)
// copyInMulticastRequest copies in a variable-size multicast request. The
@@ -2117,12 +2130,12 @@ func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastR
if len(optVal) >= inetMulticastRequestWithNICSize {
var req linux.InetMulticastRequestWithNIC
- binary.Unmarshal(optVal[:inetMulticastRequestWithNICSize], hostarch.ByteOrder, &req)
+ req.UnmarshalUnsafe(optVal[:inetMulticastRequestWithNICSize])
return req, nil
}
var req linux.InetMulticastRequestWithNIC
- binary.Unmarshal(optVal[:inetMulticastRequestSize], hostarch.ByteOrder, &req.InetMulticastRequest)
+ req.InetMulticastRequest.UnmarshalUnsafe(optVal[:inetMulticastRequestSize])
return req, nil
}
@@ -2132,7 +2145,7 @@ func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syse
}
var req linux.Inet6MulticastRequest
- binary.Unmarshal(optVal[:inet6MulticastRequestSize], hostarch.ByteOrder, &req)
+ req.UnmarshalUnsafe(optVal[:inet6MulticastRequestSize])
return req, nil
}
@@ -3101,8 +3114,8 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
continue
}
// Populate ifr.ifr_netmask (type sockaddr).
- hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(linux.AF_INET))
- hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0)
+ hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET))
+ hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0)
var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
// Netmask is expected to be returned as a big endian
// value.
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index b215067cf..eef5e6519 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -458,23 +458,10 @@ func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) {
s.Stack.RestoreCleanupEndpoints(es)
}
-// Forwarding implements inet.Stack.Forwarding.
-func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
- switch protocol {
- case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
- return s.Stack.Forwarding(protocol)
- default:
- panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol))
- }
-}
-
// SetForwarding implements inet.Stack.SetForwarding.
func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
- switch protocol {
- case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
- s.Stack.SetForwarding(protocol, enable)
- default:
- panic(fmt.Sprintf("SetForwarding(%v) failed: unsupported protocol", protocol))
+ if err := s.Stack.SetForwardingDefaultAndAllNICs(protocol, enable); err != nil {
+ return fmt.Errorf("SetForwardingDefaultAndAllNICs(%d, %t): %s", protocol, enable, err)
}
return nil
}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 4c3d48096..353f4ade0 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -24,7 +24,6 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal"
@@ -81,7 +80,7 @@ func sockErrCmsgToLinux(sockErr *tcpip.SockError) linux.SockErrCMsg {
}
ee := linux.SockExtendedErr{
- Errno: uint32(syserr.TranslateNetstackError(sockErr.Err).ToLinux().Number()),
+ Errno: uint32(syserr.TranslateNetstackError(sockErr.Err).ToLinux()),
Origin: errOriginToLinux(sockErr.Cause.Origin()),
Type: sockErr.Cause.Type(),
Code: sockErr.Cause.Code(),
@@ -572,19 +571,19 @@ func UnmarshalSockAddr(family int, data []byte) linux.SockAddr {
switch family {
case unix.AF_INET:
var addr linux.SockAddrInet
- binary.Unmarshal(data[:unix.SizeofSockaddrInet4], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(data[:addr.SizeBytes()])
return &addr
case unix.AF_INET6:
var addr linux.SockAddrInet6
- binary.Unmarshal(data[:unix.SizeofSockaddrInet6], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(data[:addr.SizeBytes()])
return &addr
case unix.AF_UNIX:
var addr linux.SockAddrUnix
- binary.Unmarshal(data[:unix.SizeofSockaddrUnix], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(data[:addr.SizeBytes()])
return &addr
case unix.AF_NETLINK:
var addr linux.SockAddrNetlink
- binary.Unmarshal(data[:unix.SizeofSockaddrNetlink], hostarch.ByteOrder, &addr)
+ addr.UnmarshalUnsafe(data[:addr.SizeBytes()])
return &addr
default:
panic(fmt.Sprintf("Unsupported socket family %v", family))
@@ -716,7 +715,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
if len(addr) < sockAddrInetSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
- binary.Unmarshal(addr[:sockAddrInetSize], hostarch.ByteOrder, &a)
+ a.UnmarshalUnsafe(addr[:sockAddrInetSize])
out := tcpip.FullAddress{
Addr: BytesToIPAddress(a.Addr[:]),
@@ -729,7 +728,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
if len(addr) < sockAddrInet6Size {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
- binary.Unmarshal(addr[:sockAddrInet6Size], hostarch.ByteOrder, &a)
+ a.UnmarshalUnsafe(addr[:sockAddrInet6Size])
out := tcpip.FullAddress{
Addr: BytesToIPAddress(a.Addr[:]),
@@ -745,7 +744,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
if len(addr) < sockAddrLinkSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
- binary.Unmarshal(addr[:sockAddrLinkSize], hostarch.ByteOrder, &a)
+ a.UnmarshalUnsafe(addr[:sockAddrLinkSize])
if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 2ebd77f82..1fbbd133c 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -25,7 +25,6 @@ go_library(
":strace_go_proto",
"//pkg/abi",
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/bits",
"//pkg/eventchannel",
"//pkg/hostarch",
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index 71b92eaee..d66befe81 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -371,6 +371,7 @@ var linuxAMD64 = SyscallMap{
433: makeSyscallInfo("fspick", FD, Path, Hex),
434: makeSyscallInfo("pidfd_open", Hex, Hex),
435: makeSyscallInfo("clone3", Hex, Hex),
+ 441: makeSyscallInfo("epoll_pwait2", FD, EpollEvents, Hex, Timespec, SigSet),
}
func init() {
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
index bd7361a52..1a2d7d75f 100644
--- a/pkg/sentry/strace/linux64_arm64.go
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -312,6 +312,7 @@ var linuxARM64 = SyscallMap{
433: makeSyscallInfo("fspick", FD, Path, Hex),
434: makeSyscallInfo("pidfd_open", Hex, Hex),
435: makeSyscallInfo("clone3", Hex, Hex),
+ 441: makeSyscallInfo("epoll_pwait2", FD, EpollEvents, Hex, Timespec, SigSet),
}
func init() {
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index e5b7f9b96..f4aab25b0 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -20,14 +20,13 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-
- "gvisor.dev/gvisor/pkg/hostarch"
)
// SocketFamily are the possible socket(2) families.
@@ -162,6 +161,15 @@ var controlMessageType = map[int32]string{
linux.SO_TIMESTAMP: "SO_TIMESTAMP",
}
+func unmarshalControlMessageRights(src []byte) linux.ControlMessageRights {
+ count := len(src) / linux.SizeOfControlMessageRight
+ cmr := make(linux.ControlMessageRights, count)
+ for i := range cmr {
+ cmr[i] = int32(hostarch.ByteOrder.Uint32(src[i*linux.SizeOfControlMessageRight:]))
+ }
+ return cmr
+}
+
func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64) string {
if length > maxBytes {
return fmt.Sprintf("%#x (error decoding control: invalid length (%d))", addr, length)
@@ -181,7 +189,7 @@ func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64)
}
var h linux.ControlMessageHeader
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], hostarch.ByteOrder, &h)
+ h.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageHeader])
var skipData bool
level := "SOL_SOCKET"
@@ -221,18 +229,14 @@ func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64)
if skipData {
strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
continue
}
switch h.Type {
case linux.SCM_RIGHTS:
- rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
-
- numRights := rightsSize / linux.SizeOfControlMessageRight
- fds := make(linux.ControlMessageRights, numRights)
- binary.Unmarshal(buf[i:i+rightsSize], hostarch.ByteOrder, &fds)
-
+ rightsSize := bits.AlignDown(length, linux.SizeOfControlMessageRight)
+ fds := unmarshalControlMessageRights(buf[i : i+rightsSize])
rights := make([]string, 0, len(fds))
for _, fd := range fds {
rights = append(rights, fmt.Sprint(fd))
@@ -258,7 +262,7 @@ func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64)
}
var creds linux.ControlMessageCredentials
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], hostarch.ByteOrder, &creds)
+ creds.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageCredentials])
strs = append(strs, fmt.Sprintf(
"{level=%s, type=%s, length=%d, pid: %d, uid: %d, gid: %d}",
@@ -282,7 +286,7 @@ func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64)
}
var tv linux.Timeval
- binary.Unmarshal(buf[i:i+linux.SizeOfTimeval], hostarch.ByteOrder, &tv)
+ tv.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval])
strs = append(strs, fmt.Sprintf(
"{level=%s, type=%s, length=%d, Sec: %d, Usec: %d}",
@@ -296,7 +300,7 @@ func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64)
default:
panic("unreachable")
}
- i += binary.AlignUp(length, width)
+ i += bits.AlignUp(length, width)
}
return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", "))
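Illustrative fragment (not part of the diff): the SCM_RIGHTS payload decoded by unmarshalControlMessageRights above is a packed array of file descriptors in native byte order, assuming each occupies linux.SizeOfControlMessageRight (4) bytes.

	buf := make([]byte, 2*linux.SizeOfControlMessageRight)
	hostarch.ByteOrder.PutUint32(buf[0:], 3) // first fd
	hostarch.ByteOrder.PutUint32(buf[4:], 7) // second fd
	fds := unmarshalControlMessageRights(buf) // fds == [3, 7]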
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index e115683f8..3b4d79889 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -119,7 +119,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
}
// WaitEpoll implements the epoll_wait(2) linux syscall.
-func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) {
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeoutInNanos int64) ([]linux.EpollEvent, error) {
// Get epoll from the file descriptor.
epollfile := t.GetFile(fd)
if epollfile == nil {
@@ -136,7 +136,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve
// Try to read events and return right away if we got them or if the
// caller requested a non-blocking "wait".
r := e.ReadEvents(max)
- if len(r) != 0 || timeout == 0 {
+ if len(r) != 0 || timeoutInNanos == 0 {
return r, nil
}
@@ -144,8 +144,8 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve
// and register with the epoll object for readability events.
var haveDeadline bool
var deadline ktime.Time
- if timeout > 0 {
- timeoutDur := time.Duration(timeout) * time.Millisecond
+ if timeoutInNanos > 0 {
+ timeoutDur := time.Duration(timeoutInNanos) * time.Nanosecond
deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
haveDeadline = true
}
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index c668e81ac..6eabfd219 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -29,8 +29,7 @@ import (
)
var (
- partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.")
- partialResultOnce sync.Once
+ partialResultOnce sync.Once
)
// incrementPartialResultMetric increments PartialResultMetric by calling
@@ -38,7 +37,6 @@ var (
// us to pass a function which does not take any arguments, whereas Increment()
// takes a variadic number of arguments.
func incrementPartialResultMetric() {
- partialResultMetric.Increment()
metric.WeirdnessMetric.Increment("partial_result")
}
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 2d2212605..090c5ffcb 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -404,6 +404,7 @@ var AMD64 = &kernel.SyscallTable{
433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+ 441: syscalls.Supported("epoll_pwait2", EpollPwait2),
},
Emulate: map[hostarch.Addr]uintptr{
0xffffffffff600000: 96, // vsyscall gettimeofday(2)
@@ -722,6 +723,7 @@ var ARM64 = &kernel.SyscallTable{
433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+ 441: syscalls.Supported("epoll_pwait2", EpollPwait2),
},
Emulate: map[hostarch.Addr]uintptr{},
Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 7f460d30b..69cbc98d0 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
@@ -104,14 +105,8 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
}
-// EpollWait implements the epoll_wait(2) linux syscall.
-func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- epfd := args[0].Int()
- eventsAddr := args[1].Pointer()
- maxEvents := int(args[2].Int())
- timeout := int(args[3].Int())
-
- r, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout)
+func waitEpoll(t *kernel.Task, fd int32, eventsAddr hostarch.Addr, max int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) {
+ r, err := syscalls.WaitEpoll(t, fd, max, timeoutInNanos)
if err != nil {
return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
}
@@ -123,6 +118,17 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
return uintptr(len(r)), nil, nil
+
+}
+
+// EpollWait implements the epoll_wait(2) linux syscall.
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ eventsAddr := args[1].Pointer()
+ maxEvents := int(args[2].Int())
+ // Convert milliseconds to nanoseconds.
+ timeoutInNanos := int64(args[3].Int()) * 1000000
+ return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos)
}
// EpollPwait implements the epoll_pwait(2) linux syscall.
@@ -144,4 +150,38 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return EpollWait(t, args)
}
+// EpollPwait2 implements the epoll_pwait2(2) linux syscall.
+func EpollPwait2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ eventsAddr := args[1].Pointer()
+ maxEvents := int(args[2].Int())
+ timeoutPtr := args[3].Pointer()
+ maskAddr := args[4].Pointer()
+ maskSize := uint(args[5].Uint())
+ haveTimeout := timeoutPtr != 0
+
+ var timeoutInNanos int64 = -1
+ if haveTimeout {
+ timeout, err := copyTimespecIn(t, timeoutPtr)
+ if err != nil {
+ return 0, nil, err
+ }
+ timeoutInNanos = timeout.ToNsec()
+
+ }
+
+ if maskAddr != 0 {
+ mask, err := CopyInSigSet(t, maskAddr, maskSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ oldmask := t.SignalMask()
+ t.SetSignalMask(mask)
+ t.SetSavedSignalMask(oldmask)
+ }
+
+ return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos)
+}
+
// LINT.ThenChange(vfs2/epoll.go)
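Sketch of the timeout normalization both entry points now perform so they can share waitEpoll/WaitEpoll, which takes nanoseconds: epoll_wait converts its millisecond argument, while epoll_pwait2 reads an optional struct timespec, with a NULL pointer (encoded as -1) meaning "block indefinitely". Helper names here are hypothetical; the handlers above inline this logic.

package example

import "gvisor.dev/gvisor/pkg/abi/linux"

// epollWaitTimeout converts epoll_wait's millisecond argument to nanoseconds.
func epollWaitTimeout(ms int32) int64 {
	return int64(ms) * 1000000 // negative values still mean "block forever"
}

// epollPwait2Timeout converts epoll_pwait2's optional struct timespec.
func epollPwait2Timeout(ts *linux.Timespec) int64 {
	if ts == nil {
		return -1 // NULL timeout blocks indefinitely
	}
	return ts.ToNsec()
}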
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 5e9e940df..e07917613 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -463,8 +463,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, e.ToError()
}
- vLen := int32(v.SizeBytes())
- if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
+ if _, err := primitive.CopyInt32Out(t, optLenAddr, int32(v.SizeBytes())); err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index b980aa43e..047d955b6 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -19,6 +19,7 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -118,13 +119,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
}
-// EpollWait implements Linux syscall epoll_wait(2).
-func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- epfd := args[0].Int()
- eventsAddr := args[1].Pointer()
- maxEvents := int(args[2].Int())
- timeout := int(args[3].Int())
-
+func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) {
var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
return 0, nil, syserror.EINVAL
@@ -158,7 +153,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
return 0, nil, err
}
- if timeout == 0 {
+ if timeoutInNanos == 0 {
return 0, nil, nil
}
// In the first iteration of this loop, register with the epoll
@@ -173,8 +168,8 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
defer epfile.EventUnregister(&w)
} else {
// Set up the timer if a timeout was specified.
- if timeout > 0 && !haveDeadline {
- timeoutDur := time.Duration(timeout) * time.Millisecond
+ if timeoutInNanos > 0 && !haveDeadline {
+ timeoutDur := time.Duration(timeoutInNanos) * time.Nanosecond
deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
haveDeadline = true
}
@@ -186,6 +181,17 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
}
}
+
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ eventsAddr := args[1].Pointer()
+ maxEvents := int(args[2].Int())
+ timeoutInNanos := int64(args[3].Int()) * 1000000
+
+ return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos)
}
// EpollPwait implements Linux syscall epoll_pwait(2).
@@ -199,3 +205,29 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return EpollWait(t, args)
}
+
+// EpollPwait2 implements Linux syscall epoll_pwait2(2).
+func EpollPwait2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ epfd := args[0].Int()
+ eventsAddr := args[1].Pointer()
+ maxEvents := int(args[2].Int())
+ timeoutPtr := args[3].Pointer()
+ maskAddr := args[4].Pointer()
+ maskSize := uint(args[5].Uint())
+ haveTimeout := timeoutPtr != 0
+
+ var timeoutInNanos int64 = -1
+ if haveTimeout {
+ var timeout linux.Timespec
+ if _, err := timeout.CopyIn(t, timeoutPtr); err != nil {
+ return 0, nil, err
+ }
+ timeoutInNanos = timeout.ToNsec()
+ }
+
+ if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+ return 0, nil, err
+ }
+
+ return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index 6edde0ed1..69f69e3af 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -467,8 +467,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, e.ToError()
}
- vLen := int32(v.SizeBytes())
- if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
+ if _, err := primitive.CopyInt32Out(t, optLenAddr, int32(v.SizeBytes())); err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index c50fd97eb..0fc81e694 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -159,6 +159,7 @@ func Override() {
s.Table[327] = syscalls.Supported("preadv2", Preadv2)
s.Table[328] = syscalls.Supported("pwritev2", Pwritev2)
s.Table[332] = syscalls.Supported("statx", Statx)
+ s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2)
s.Init()
// Override ARM64.
@@ -269,6 +270,7 @@ func Override() {
s.Table[286] = syscalls.Supported("preadv2", Preadv2)
s.Table[287] = syscalls.Supported("pwritev2", Pwritev2)
s.Table[291] = syscalls.Supported("statx", Statx)
+ s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2)
s.Init()
}
diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go
index 94f98d746..39bf1e0de 100644
--- a/pkg/sentry/time/calibrated_clock.go
+++ b/pkg/sentry/time/calibrated_clock.go
@@ -25,11 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-// fallbackMetric tracks failed updates. It is not sync, as it is not critical
-// that all occurrences are captured and CalibratedClock may fallback many
-// times.
-var fallbackMetric = metric.MustCreateNewUint64Metric("/time/fallback", false /* sync */, "Incremented when a clock falls back to system calls due to a failed update")
-
// CalibratedClock implements a clock that tracks a reference clock.
//
// Users should call Update at regular intervals of around approxUpdateInterval
@@ -102,8 +97,7 @@ func (c *CalibratedClock) resetLocked(str string, v ...interface{}) {
c.Warningf(str+" Resetting clock; time may jump.", v...)
c.ready = false
c.ref.Reset()
- fallbackMetric.Increment()
- metric.WeirdnessMetric.Increment("fallback")
+ metric.WeirdnessMetric.Increment("time_fallback")
}
// updateParams updates the timekeeping parameters based on the passed
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index f612a71b2..ef8d8a813 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -454,6 +454,9 @@ type FileDescriptionImpl interface {
// RemoveXattr removes the given extended attribute from the file.
RemoveXattr(ctx context.Context, name string) error
+ // SupportsLocks indicates whether file locks are supported.
+ SupportsLocks() bool
+
// LockBSD tries to acquire a BSD-style advisory file lock.
LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block lock.Blocker) error
@@ -524,7 +527,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St
Start: fd.vd,
})
stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return stat, err
}
return fd.impl.Stat(ctx, opts)
@@ -539,7 +542,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
return fd.impl.SetStat(ctx, opts)
@@ -555,7 +558,7 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
Start: fd.vd,
})
statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return statfs, err
}
return fd.impl.StatFS(ctx)
@@ -701,7 +704,7 @@ func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string
Start: fd.vd,
})
names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return names, err
}
names, err := fd.impl.ListXattr(ctx, size)
@@ -730,7 +733,7 @@ func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions)
Start: fd.vd,
})
val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return val, err
}
return fd.impl.GetXattr(ctx, *opts)
@@ -746,7 +749,7 @@ func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions)
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
return fd.impl.SetXattr(ctx, *opts)
@@ -762,7 +765,7 @@ func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
return fd.impl.RemoveXattr(ctx, name)
@@ -818,6 +821,11 @@ func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) e
return fd.Sync(ctx)
}
+// SupportsLocks indicates whether file locks are supported.
+func (fd *FileDescription) SupportsLocks() bool {
+ return fd.impl.SupportsLocks()
+}
+
// LockBSD tries to acquire a BSD-style advisory file lock.
func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, blocker lock.Blocker) error {
atomic.StoreUint32(&fd.usedLockBSD, 1)
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index b87d9690a..2b6f47b4b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -413,6 +413,11 @@ type LockFD struct {
locks *FileLocks
}
+// SupportsLocks implements FileDescriptionImpl.SupportsLocks.
+func (LockFD) SupportsLocks() bool {
+ return true
+}
+
// Init initializes fd with FileLocks to use.
func (fd *LockFD) Init(locks *FileLocks) {
fd.locks = locks
@@ -423,28 +428,28 @@ func (fd *LockFD) Locks() *FileLocks {
return fd.locks
}
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+// LockBSD implements FileDescriptionImpl.LockBSD.
func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error {
return fd.locks.LockBSD(ctx, uid, ownerPID, t, block)
}
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
fd.locks.UnlockBSD(uid)
return nil
}
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error {
return fd.locks.LockPOSIX(ctx, uid, ownerPID, t, r, block)
}
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
return fd.locks.UnlockPOSIX(ctx, uid, r)
}
-// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX.
+// TestPOSIX implements FileDescriptionImpl.TestPOSIX.
func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) {
return fd.locks.TestPOSIX(ctx, uid, t, r)
}
@@ -455,27 +460,68 @@ func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.L
// +stateify savable
type NoLockFD struct{}
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+// SupportsLocks implements FileDescriptionImpl.SupportsLocks.
+func (NoLockFD) SupportsLocks() bool {
+ return false
+}
+
+// LockBSD implements FileDescriptionImpl.LockBSD.
func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error {
return syserror.ENOLCK
}
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
return syserror.ENOLCK
}
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error {
return syserror.ENOLCK
}
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
return syserror.ENOLCK
}
-// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX.
+// TestPOSIX implements FileDescriptionImpl.TestPOSIX.
func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) {
return linux.Flock{}, syserror.ENOLCK
}
+
+// BadLockFD implements the Lock*/Unlock* portion of the FileDescriptionImpl
+// interface, returning EBADF for all operations.
+//
+// +stateify savable
+type BadLockFD struct{}
+
+// SupportsLocks implements FileDescriptionImpl.SupportsLocks.
+func (BadLockFD) SupportsLocks() bool {
+ return false
+}
+
+// LockBSD implements FileDescriptionImpl.LockBSD.
+func (BadLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
+func (BadLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+ return syserror.EBADF
+}
+
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
+func (BadLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
+func (BadLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
+ return syserror.EBADF
+}
+
+// TestPOSIX implements FileDescriptionImpl.TestPOSIX.
+func (BadLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) {
+ return linux.Flock{}, syserror.EBADF
+}
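Hypothetical example (not part of the diff) of how the new SupportsLocks hook is used: an implementation that cannot support advisory locks embeds NoLockFD (lock calls fail with ENOLCK) or, like the O_PATH descriptions below, BadLockFD (EBADF), and callers can branch on SupportsLocks() before attempting a lock.

package example

import "gvisor.dev/gvisor/pkg/sentry/vfs"

// pipeLikeFD is an illustrative, incomplete file description that opts out of
// advisory locking by embedding NoLockFD; remaining methods are elided.
type pipeLikeFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.NoLockFD // SupportsLocks() == false; lock calls return ENOLCK
}

// tryLock shows a caller consulting the new hook before locking.
func tryLock(fd *vfs.FileDescription) {
	if !fd.SupportsLocks() {
		// Report the lack of lock support instead of issuing the call.
		return
	}
	// fd.LockBSD(...) / fd.LockPOSIX(...) as needed.
}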
diff --git a/pkg/sentry/vfs/opath.go b/pkg/sentry/vfs/opath.go
index 39fbac987..e9651b631 100644
--- a/pkg/sentry/vfs/opath.go
+++ b/pkg/sentry/vfs/opath.go
@@ -24,96 +24,96 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// opathFD implements vfs.FileDescriptionImpl for a file description opened with O_PATH.
+// opathFD implements FileDescriptionImpl for a file description opened with O_PATH.
//
// +stateify savable
type opathFD struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
- NoLockFD
+ BadLockFD
}
-// Release implements vfs.FileDescriptionImpl.Release.
+// Release implements FileDescriptionImpl.Release.
func (fd *opathFD) Release(context.Context) {
// noop
}
-// Allocate implements vfs.FileDescriptionImpl.Allocate.
+// Allocate implements FileDescriptionImpl.Allocate.
func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
return syserror.EBADF
}
-// PRead implements vfs.FileDescriptionImpl.PRead.
+// PRead implements FileDescriptionImpl.PRead.
func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
return 0, syserror.EBADF
}
-// Read implements vfs.FileDescriptionImpl.Read.
+// Read implements FileDescriptionImpl.Read.
func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
return 0, syserror.EBADF
}
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
+// PWrite implements FileDescriptionImpl.PWrite.
func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
return 0, syserror.EBADF
}
-// Write implements vfs.FileDescriptionImpl.Write.
+// Write implements FileDescriptionImpl.Write.
func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
return 0, syserror.EBADF
}
-// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+// Ioctl implements FileDescriptionImpl.Ioctl.
func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
return 0, syserror.EBADF
}
-// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+// IterDirents implements FileDescriptionImpl.IterDirents.
func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
return syserror.EBADF
}
-// Seek implements vfs.FileDescriptionImpl.Seek.
+// Seek implements FileDescriptionImpl.Seek.
func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
return 0, syserror.EBADF
}
-// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap.
func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
return syserror.EBADF
}
-// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+// ListXattr implements FileDescriptionImpl.ListXattr.
func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) {
return nil, syserror.EBADF
}
-// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+// GetXattr implements FileDescriptionImpl.GetXattr.
func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) {
return "", syserror.EBADF
}
-// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+// SetXattr implements FileDescriptionImpl.SetXattr.
func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error {
return syserror.EBADF
}
-// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+// RemoveXattr implements FileDescriptionImpl.RemoveXattr.
func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error {
return syserror.EBADF
}
-// Sync implements vfs.FileDescriptionImpl.Sync.
+// Sync implements FileDescriptionImpl.Sync.
func (fd *opathFD) Sync(ctx context.Context) error {
return syserror.EBADF
}
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
+// SetStat implements FileDescriptionImpl.SetStat.
func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error {
return syserror.EBADF
}
-// Stat implements vfs.FileDescriptionImpl.Stat.
+// Stat implements FileDescriptionImpl.Stat.
func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
vfsObj := fd.vfsfd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
@@ -121,7 +121,7 @@ func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, err
Start: fd.vfsfd.vd,
})
stat, err := fd.vfsfd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return stat, err
}
@@ -134,6 +134,6 @@ func (fd *opathFD) StatFS(ctx context.Context) (linux.Statfs, error) {
Start: fd.vfsfd.vd,
})
statfs, err := fd.vfsfd.vd.mount.fs.impl.StatFSAt(ctx, rp)
- vfsObj.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return statfs, err
}
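
For reference, the opathFD methods above mirror Linux O_PATH semantics: data operations fail with EBADF, while metadata queries still succeed (which is why Stat and StatFS re-resolve the path rather than returning EBADF). Below is a minimal, standalone sketch of those host-level semantics using golang.org/x/sys/unix; it is illustrative only, not gVisor code, and the target path is an assumption.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Open with O_PATH: the fd identifies the file but cannot do I/O.
	// "/etc/hostname" is just an example path assumed to exist.
	fd, err := unix.Open("/etc/hostname", unix.O_PATH, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// Data operations fail with EBADF, matching opathFD.Read/Write above.
	buf := make([]byte, 8)
	if _, err := unix.Read(fd, buf); err != nil {
		fmt.Println("read:", err) // expected: bad file descriptor (EBADF)
	}

	// Metadata operations still work, matching opathFD.Stat/StatFS.
	var st unix.Stat_t
	if err := unix.Fstat(fd, &st); err == nil {
		fmt.Printf("fstat ok: mode=%o size=%d\n", st.Mode, st.Size)
	}
}
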
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index e4fd55012..97b898aba 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -44,13 +44,10 @@ type ResolvingPath struct {
start *Dentry
pit fspath.Iterator
- flags uint16
- mustBeDir bool // final file must be a directory?
- mustBeDirOrig bool
- symlinks uint8 // number of symlinks traversed
- symlinksOrig uint8
- curPart uint8 // index into parts
- numOrigParts uint8
+ flags uint16
+ mustBeDir bool // final file must be a directory?
+ symlinks uint8 // number of symlinks traversed
+ curPart uint8 // index into parts
creds *auth.Credentials
@@ -60,14 +57,9 @@ type ResolvingPath struct {
nextStart *Dentry // ref held if not nil
absSymlinkTarget fspath.Path
- // ResolvingPath must track up to two relative paths: the "current"
- // relative path, which is updated whenever a relative symlink is
- // encountered, and the "original" relative path, which is updated from the
- // current relative path by handleError() when resolution must change
- // filesystems (due to reaching a mount boundary or absolute symlink) and
- // overwrites the current relative path when Restart() is called.
- parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
- origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
+ // ResolvingPath tracks the relative path, which is updated whenever a
+ // relative symlink is encountered.
+ parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
}
const (
@@ -120,6 +112,8 @@ var resolvingPathPool = sync.Pool{
},
}
+// getResolvingPath gets a new ResolvingPath from the pool. Caller must call
+// ResolvingPath.Release() when done.
func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
rp := resolvingPathPool.Get().(*ResolvingPath)
rp.vfs = vfs
@@ -132,17 +126,37 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat
rp.flags |= rpflagsFollowFinalSymlink
}
rp.mustBeDir = pop.Path.Dir
- rp.mustBeDirOrig = pop.Path.Dir
rp.symlinks = 0
rp.curPart = 0
- rp.numOrigParts = 1
rp.creds = creds
rp.parts[0] = pop.Path.Begin
- rp.origParts[0] = pop.Path.Begin
return rp
}
-func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) {
+// Copy creates another ResolvingPath with the same state as the original.
+// Copies are independent: using the copy does not change the original and
+// vice versa.
+//
+// Caller must call Release() when done.
+func (rp *ResolvingPath) Copy() *ResolvingPath {
+ copy := resolvingPathPool.Get().(*ResolvingPath)
+ *copy = *rp // All fields are shallow copyable.
+
+ // Take extra references for the copy if the original held them.
+ if copy.flags&rpflagsHaveStartRef != 0 {
+ copy.start.IncRef()
+ }
+ if copy.flags&rpflagsHaveMountRef != 0 {
+ copy.mount.IncRef()
+ }
+ // Reset error state.
+ copy.nextStart = nil
+ copy.nextMount = nil
+ return copy
+}
+
+// Release decrements references if needed and returns the object to the pool.
+func (rp *ResolvingPath) Release(ctx context.Context) {
rp.root = VirtualDentry{}
rp.decRefStartAndMount(ctx)
rp.mount = nil
@@ -240,25 +254,6 @@ func (rp *ResolvingPath) Advance() {
}
}
-// Restart resets the stream of path components represented by rp to its state
-// on entry to the current FilesystemImpl method.
-func (rp *ResolvingPath) Restart(ctx context.Context) {
- rp.pit = rp.origParts[rp.numOrigParts-1]
- rp.mustBeDir = rp.mustBeDirOrig
- rp.symlinks = rp.symlinksOrig
- rp.curPart = rp.numOrigParts - 1
- copy(rp.parts[:], rp.origParts[:rp.numOrigParts])
- rp.releaseErrorState(ctx)
-}
-
-func (rp *ResolvingPath) relpathCommit() {
- rp.mustBeDirOrig = rp.mustBeDir
- rp.symlinksOrig = rp.symlinks
- rp.numOrigParts = rp.curPart + 1
- copy(rp.origParts[:rp.curPart], rp.parts[:])
- rp.origParts[rp.curPart] = rp.pit
-}
-
// CheckRoot is called before resolving the parent of the Dentry d. If the
// Dentry is contextually a VFS root, such that path resolution should treat
// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the
@@ -405,11 +400,10 @@ func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool {
rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef
rp.nextMount = nil
rp.nextStart = nil
- // Commit the previous FileystemImpl's progress through the relative
- // path. (Don't consume the path component that caused us to traverse
+ // Don't consume the path component that caused us to traverse
// through the mount root - i.e. the ".." - because we still need to
- // resolve the mount point's parent in the new FilesystemImpl.)
- rp.relpathCommit()
+ // resolve the mount point's parent in the new FilesystemImpl.
+ //
// Restart path resolution on the new Mount. Don't bother calling
// rp.releaseErrorState() since we already set nextMount and nextStart
// to nil above.
@@ -425,9 +419,6 @@ func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool {
rp.nextMount = nil
// Consume the path component that represented the mount point.
rp.Advance()
- // Commit the previous FilesystemImpl's progress through the relative
- // path.
- rp.relpathCommit()
// Restart path resolution on the new Mount.
rp.releaseErrorState(ctx)
return true
@@ -442,9 +433,6 @@ func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool {
rp.Advance()
// Prepend the symlink target to the relative path.
rp.relpathPrepend(rp.absSymlinkTarget)
- // Commit the previous FilesystemImpl's progress through the relative
- // path, including the symlink target we just prepended.
- rp.relpathCommit()
// Restart path resolution on the new Mount.
rp.releaseErrorState(ctx)
return true
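
The Copy/Release pair above replaces Restart/putResolvingPath: a caller that may need to retry resolution takes an independent copy up front instead of rewinding shared "orig" state. The following self-contained sketch shows that pooled copy-and-release pattern; pathState, getState, and haveRef are hypothetical stand-ins for the real ResolvingPath fields and reference flags, not the actual implementation.

package main

import (
	"fmt"
	"sync"
)

// pathState is a hypothetical stand-in for ResolvingPath: pooled,
// shallow-copyable, and released explicitly when the caller is done.
type pathState struct {
	parts   []string
	cur     int
	haveRef bool // stands in for rpflagsHaveStartRef/rpflagsHaveMountRef
}

var statePool = sync.Pool{New: func() interface{} { return &pathState{} }}

func getState(parts []string) *pathState {
	s := statePool.Get().(*pathState)
	s.parts, s.cur, s.haveRef = parts, 0, true
	return s
}

// Copy returns an independent state; mutating the copy leaves the
// original untouched, mirroring ResolvingPath.Copy.
func (s *pathState) Copy() *pathState {
	c := statePool.Get().(*pathState)
	*c = *s
	// A real Copy would also IncRef the start dentry/mount here.
	return c
}

// Release clears the state and returns it to the pool, mirroring
// ResolvingPath.Release; a real Release would DecRef held references first.
func (s *pathState) Release() {
	*s = pathState{}
	statePool.Put(s)
}

func main() {
	rp := getState([]string{"a", "b", "c"})
	cp := rp.Copy()             // take the copy before consuming components
	rp.cur = 2                  // advance the original
	fmt.Println(rp.cur, cp.cur) // 2 0: the copy is unaffected
	cp.Release()
	rp.Release()
}
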
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 00f1847d8..87fdcf403 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -208,11 +208,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -230,11 +230,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
dentry: d,
}
rp.mount.IncRef()
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return vd, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return VirtualDentry{}, err
}
}
@@ -252,7 +252,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
}
rp.mount.IncRef()
name := rp.Component()
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return parentVD, name, nil
}
if checkInvariants {
@@ -261,7 +261,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return VirtualDentry{}, "", err
}
}
@@ -292,7 +292,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
for {
err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
oldVD.DecRef(ctx)
return nil
}
@@ -302,7 +302,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
oldVD.DecRef(ctx)
return err
}
@@ -331,7 +331,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if checkInvariants {
@@ -340,7 +340,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -366,7 +366,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if checkInvariants {
@@ -375,7 +375,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -425,7 +425,6 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
rp := vfs.getResolvingPath(creds, pop)
if opts.Flags&linux.O_DIRECTORY != 0 {
rp.mustBeDir = true
- rp.mustBeDirOrig = true
}
// Ignore O_PATH for verity, as verity performs extra operations on the fd for verification.
// The underlying filesystem that verity wraps opens the fd with O_PATH.
@@ -444,7 +443,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
for {
fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
if opts.FileExec {
if fd.Mount().Flags.NoExec {
@@ -468,7 +467,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
return fd, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil, err
}
}
@@ -480,11 +479,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden
for {
target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return target, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return "", err
}
}
@@ -533,7 +532,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
oldParentVD.DecRef(ctx)
return nil
}
@@ -543,7 +542,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
oldParentVD.DecRef(ctx)
return err
}
@@ -569,7 +568,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.RmdirAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if checkInvariants {
@@ -578,7 +577,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -590,11 +589,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent
for {
err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -606,11 +605,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential
for {
stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return stat, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return linux.Statx{}, err
}
}
@@ -623,11 +622,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
for {
statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return statfs, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return linux.Statfs{}, err
}
}
@@ -652,7 +651,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
for {
err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if checkInvariants {
@@ -661,7 +660,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -686,7 +685,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if checkInvariants {
@@ -695,7 +694,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -707,7 +706,7 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
for {
bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return bep, nil
}
if checkInvariants {
@@ -716,7 +715,7 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
}
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil, err
}
}
@@ -729,7 +728,7 @@ func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Crede
for {
names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return names, nil
}
if err == syserror.ENOTSUP {
@@ -737,11 +736,11 @@ func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Crede
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
// subsystem to return security extended attributes, which by
// default don't exist.
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil, err
}
}
@@ -754,11 +753,11 @@ func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Creden
for {
val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return val, nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return "", err
}
}
@@ -771,11 +770,11 @@ func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Creden
for {
err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
@@ -787,11 +786,11 @@ func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Cre
for {
err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
if err == nil {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return nil
}
if !rp.handleError(ctx, err) {
- vfs.putResolvingPath(ctx, rp)
+ rp.Release(ctx)
return err
}
}
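
Every VirtualFilesystem method above follows the same loop: call into the FilesystemImpl, release rp on success or on a terminal error, and retry when handleError reports that the error only meant resolution crossed into another mount or followed an absolute symlink. A simplified, self-contained sketch of that control flow follows; resolver, doOp, and errCrossMount are hypothetical stand-ins for the real types, not gVisor APIs.

package main

import (
	"errors"
	"fmt"
)

var errCrossMount = errors.New("resolution crossed a mount boundary")

// resolver is a hypothetical stand-in for *ResolvingPath.
type resolver struct{ retriesLeft int }

// handleError mirrors rp.handleError: it returns true when the error only
// means "restart resolution on another filesystem", false when it is final.
func (r *resolver) handleError(err error) bool {
	if errors.Is(err, errCrossMount) && r.retriesLeft > 0 {
		r.retriesLeft--
		return true
	}
	return false
}

// release stands in for rp.Release, which returns rp to the pool.
func (r *resolver) release() {}

// doOp stands in for rp.mount.fs.impl.<Op>At: here it fails until
// resolution has hopped to the filesystem that actually holds the file.
func doOp(r *resolver) error {
	if r.retriesLeft > 0 {
		return errCrossMount
	}
	return nil
}

func statAt(r *resolver) error {
	for {
		err := doOp(r)
		if err == nil {
			r.release()
			return nil
		}
		if !r.handleError(err) {
			r.release()
			return err
		}
		// handleError consumed the error; loop and retry on the new mount.
	}
}

func main() {
	fmt.Println(statAt(&resolver{retriesLeft: 1})) // <nil> after one retry
}
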
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 8e3146d8d..dfe85f31d 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -243,6 +243,7 @@ func (w *Watchdog) waitForStart() {
}
stuckStartup.Increment()
+ metric.WeirdnessMetric.Increment("watchdog_stuck_startup")
var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
@@ -312,10 +313,11 @@ func (w *Watchdog) runTurn() {
// New stuck task detected.
//
// Note that tasks blocked doing IO may be considered stuck in kernel,
- // unless they are surrounded b
+ // unless they are surrounded by
// Task.UninterruptibleSleepStart/Finish.
tc = &offender{lastUpdateTime: lastUpdateTime}
stuckTasks.Increment()
+ metric.WeirdnessMetric.Increment("watchdog_stuck_tasks")
newTaskFound = true
}
newOffenders[t] = tc