summaryrefslogtreecommitdiffhomepage
path: root/pkg
diff options
context:
space:
mode:
Diffstat (limited to 'pkg')
-rw-r--r--pkg/merkletree/merkletree.go130
-rw-r--r--pkg/merkletree/merkletree_test.go52
-rw-r--r--pkg/sentry/control/proc.go4
-rw-r--r--pkg/sentry/fs/fsutil/file_range_set.go21
-rw-r--r--pkg/sentry/fs/fsutil/inode_cached.go4
-rw-r--r--pkg/sentry/fs/tmpfs/inode_file.go2
-rw-r--r--pkg/sentry/fs/user/path.go1
-rw-r--r--pkg/sentry/fs/user/user.go1
-rw-r--r--pkg/sentry/fsimpl/devpts/BUILD1
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts.go91
-rw-r--r--pkg/sentry/fsimpl/devpts/master.go5
-rw-r--r--pkg/sentry/fsimpl/devpts/replica.go3
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs.go14
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go1
-rw-r--r--pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go1
-rw-r--r--pkg/sentry/fsimpl/ext/ext.go6
-rw-r--r--pkg/sentry/fsimpl/ext/ext_test.go1
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go66
-rw-r--r--pkg/sentry/fsimpl/gofer/BUILD1
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go3
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go12
-rw-r--r--pkg/sentry/fsimpl/host/host.go6
-rw-r--r--pkg/sentry/fsimpl/kernfs/BUILD12
-rw-r--r--pkg/sentry/fsimpl/kernfs/fd_impl_util.go4
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go192
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go235
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go173
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go87
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go7
-rw-r--r--pkg/sentry/fsimpl/kernfs/synthetic_directory.go40
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go3
-rw-r--r--pkg/sentry/fsimpl/pipefs/pipefs.go5
-rw-r--r--pkg/sentry/fsimpl/proc/filesystem.go22
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go22
-rw-r--r--pkg/sentry/fsimpl/proc/task.go86
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go67
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go34
-rw-r--r--pkg/sentry/fsimpl/proc/task_net.go40
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go56
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go22
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys.go118
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_test.go11
-rw-r--r--pkg/sentry/fsimpl/sockfs/sockfs.go5
-rw-r--r--pkg/sentry/fsimpl/sys/kcov.go6
-rw-r--r--pkg/sentry/fsimpl/sys/sys.go51
-rw-r--r--pkg/sentry/fsimpl/testutil/testutil.go10
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/pipe_test.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go37
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs_test.go1
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go63
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go119
-rw-r--r--pkg/sentry/fsimpl/verity/verity_test.go1
-rw-r--r--pkg/sentry/kernel/BUILD12
-rw-r--r--pkg/sentry/kernel/context.go3
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go14
-rw-r--r--pkg/sentry/kernel/kernel.go57
-rw-r--r--pkg/sentry/kernel/shm/BUILD1
-rw-r--r--pkg/sentry/kernel/shm/shm.go46
-rw-r--r--pkg/sentry/kernel/task.go6
-rw-r--r--pkg/sentry/kernel/task_clone.go5
-rw-r--r--pkg/sentry/kernel/task_exit.go1
-rw-r--r--pkg/sentry/kernel/task_start.go1
-rw-r--r--pkg/sentry/loader/vdso.go6
-rw-r--r--pkg/sentry/platform/kvm/BUILD1
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go6
-rw-r--r--pkg/sentry/platform/kvm/context.go5
-rw-r--r--pkg/sentry/platform/kvm/kvm_arm64_test.go31
-rw-r--r--pkg/sentry/platform/kvm/kvm_test.go12
-rw-r--r--pkg/sentry/platform/kvm/machine.go31
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go13
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64.go13
-rw-r--r--pkg/sentry/platform/kvm/testutil/testutil_arm64.go5
-rw-r--r--pkg/sentry/platform/kvm/testutil/testutil_arm64.s16
-rw-r--r--pkg/sentry/platform/ring0/entry_arm64.s14
-rw-r--r--pkg/sentry/socket/netlink/socket.go7
-rw-r--r--pkg/sentry/socket/netstack/netstack.go5
-rw-r--r--pkg/sentry/socket/unix/unix.go8
-rw-r--r--pkg/sentry/socket/unix/unix_vfs2.go8
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go4
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/execve.go3
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/socket.go4
-rw-r--r--pkg/sentry/vfs/anonfs.go5
-rw-r--r--pkg/sentry/vfs/file_description.go24
-rw-r--r--pkg/sentry/vfs/filesystem_type.go3
-rw-r--r--pkg/sentry/vfs/mount.go11
-rw-r--r--pkg/sentry/vfs/vfs.go10
-rw-r--r--pkg/tcpip/checker/checker.go18
-rw-r--r--pkg/tcpip/header/icmpv6.go5
-rw-r--r--pkg/tcpip/network/BUILD1
-rw-r--r--pkg/tcpip/network/ip_test.go405
-rw-r--r--pkg/tcpip/network/ipv4/ipv4.go40
-rw-r--r--pkg/tcpip/network/ipv4/ipv4_test.go62
-rw-r--r--pkg/tcpip/network/ipv6/icmp.go160
-rw-r--r--pkg/tcpip/network/ipv6/icmp_test.go254
-rw-r--r--pkg/tcpip/network/ipv6/ipv6.go44
-rw-r--r--pkg/tcpip/network/ipv6/ipv6_test.go4
-rw-r--r--pkg/tcpip/network/ipv6/ndp_test.go98
-rw-r--r--pkg/tcpip/stack/BUILD1
-rw-r--r--pkg/tcpip/stack/neighbor_entry.go6
-rw-r--r--pkg/tcpip/stack/neighbor_entry_test.go277
-rw-r--r--pkg/tcpip/stack/route.go6
-rw-r--r--pkg/tcpip/stack/stack_test.go47
-rw-r--r--pkg/tcpip/tcpip.go2
-rw-r--r--pkg/tcpip/transport/udp/forwarder.go1
106 files changed, 2604 insertions, 1178 deletions
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
index a6e698f57..d8227b8bd 100644
--- a/pkg/merkletree/merkletree.go
+++ b/pkg/merkletree/merkletree.go
@@ -138,10 +138,19 @@ func (d *VerityDescriptor) String() string {
return fmt.Sprintf("Name: %s, Mode: %d, UID: %d, GID: %d, RootHash: %v", d.Name, d.Mode, d.UID, d.GID, d.RootHash)
}
+// verify generates a hash from d, and compares it with expected.
+func (d *VerityDescriptor) verify(expected []byte) error {
+ h := sha256.Sum256([]byte(d.String()))
+ if !bytes.Equal(h[:], expected) {
+ return fmt.Errorf("unexpected root hash")
+ }
+ return nil
+}
+
// GenerateParams contains the parameters used to generate a Merkle tree.
type GenerateParams struct {
// File is a reader of the file to be hashed.
- File io.ReadSeeker
+ File io.ReaderAt
// Size is the size of the file.
Size int64
// Name is the name of the target file.
@@ -153,25 +162,19 @@ type GenerateParams struct {
// GID is the group ID of the target file.
GID uint32
// TreeReader is a reader for the Merkle tree.
- TreeReader io.ReadSeeker
+ TreeReader io.ReaderAt
// TreeWriter is a writer for the Merkle tree.
- TreeWriter io.WriteSeeker
+ TreeWriter io.Writer
// DataAndTreeInSameFile is true if data and Merkle tree are in the same
// file, or false if Merkle tree is a separate file from data.
DataAndTreeInSameFile bool
}
-// Generate constructs a Merkle tree for the contents of data. The output is
-// written to treeWriter. The treeReader should be able to read the tree after
-// it has been written. That is, treeWriter and treeReader should point to the
-// same underlying data but have separate cursors.
+// Generate constructs a Merkle tree for the contents of params.File. The
+// output is written to params.TreeWriter.
//
-// Generate returns a hash of descriptor. The descriptor contains the file
+// Generate returns a hash of a VerityDescriptor, which contains the file
// metadata and the hash from file content.
-//
-// Generate will modify the cursor for data, but always restores it to its
-// original position upon exit. The cursor for tree is modified and not
-// restored.
func Generate(params *GenerateParams) ([]byte, error) {
layout := InitLayout(params.Size, params.DataAndTreeInSameFile)
@@ -182,31 +185,11 @@ func Generate(params *GenerateParams) ([]byte, error) {
bytesInLastBlock := params.Size % layout.blockSize
if params.DataAndTreeInSameFile && bytesInLastBlock != 0 {
zeroBuf := make([]byte, layout.blockSize-bytesInLastBlock)
- if _, err := params.TreeWriter.Seek(0, io.SeekEnd); err != nil && err != io.EOF {
- return nil, err
- }
if _, err := params.TreeWriter.Write(zeroBuf); err != nil {
return nil, err
}
}
- // Store the current offset, so we can set it back once verification
- // finishes.
- origOffset, err := params.File.Seek(0, io.SeekCurrent)
- if err != nil {
- return nil, err
- }
- defer params.File.Seek(origOffset, io.SeekStart)
-
- // Read from the beginning of both data and treeReader.
- if _, err := params.File.Seek(0, io.SeekStart); err != nil && err != io.EOF {
- return nil, err
- }
-
- if _, err := params.TreeReader.Seek(0, io.SeekStart); err != nil && err != io.EOF {
- return nil, err
- }
-
var root []byte
for level := 0; level < layout.numLevels(); level++ {
for i := int64(0); i < numBlocks; i++ {
@@ -218,11 +201,11 @@ func Generate(params *GenerateParams) ([]byte, error) {
if level == 0 {
// Read data block from the target file since level 0 includes hashes
// of blocks in the input data.
- n, err = params.File.Read(buf)
+ n, err = params.File.ReadAt(buf, i*layout.blockSize)
} else {
// Read data block from the tree file since levels higher than 0 are
// hashing the lower level hashes.
- n, err = params.TreeReader.Read(buf)
+ n, err = params.TreeReader.ReadAt(buf, layout.blockOffset(level-1, i))
}
// err is populated as long as the bytes read is smaller than the buffer
@@ -273,9 +256,9 @@ type VerifyParams struct {
// Out will be filled with verified data.
Out io.Writer
// File is a handler on the file to be verified.
- File io.ReadSeeker
+ File io.ReaderAt
// tree is a handler on the Merkle tree used to verify file.
- Tree io.ReadSeeker
+ Tree io.ReaderAt
// Size is the size of the file.
Size int64
// Name is the name of the target file.
@@ -290,35 +273,22 @@ type VerifyParams struct {
ReadOffset int64
// ReadSize is the size of the data range to be verified.
ReadSize int64
- // ExpectedRoot is a trusted hash for the file. It is compared with the
+ // Expected is a trusted hash for the file. It is compared with the
// calculated root hash to verify the content.
- ExpectedRoot []byte
+ Expected []byte
// DataAndTreeInSameFile is true if data and Merkle tree are in the same
// file, or false if Merkle tree is a separate file from data.
DataAndTreeInSameFile bool
}
-// verifyDescriptor generates a hash from descriptor, and compares it with
-// expectedRoot.
-func verifyDescriptor(descriptor VerityDescriptor, expectedRoot []byte) error {
- h := sha256.Sum256([]byte(descriptor.String()))
- if !bytes.Equal(h[:], expectedRoot) {
- return fmt.Errorf("unexpected root hash")
- }
- return nil
-}
-
// verifyMetadata verifies the metadata by hashing a descriptor that contains
-// the metadata and compare the generated hash with expectedRoot.
+// the metadata and compare the generated hash with expected.
//
// For verifyMetadata, params.data is not needed. It only accesses params.tree
// for the raw root hash.
-func verifyMetadata(params *VerifyParams, layout Layout) error {
- if _, err := params.Tree.Seek(layout.blockOffset(layout.rootLevel(), 0 /* index */), io.SeekStart); err != nil {
- return fmt.Errorf("failed to seek to root hash: %w", err)
- }
+func verifyMetadata(params *VerifyParams, layout *Layout) error {
root := make([]byte, layout.digestSize)
- if _, err := params.Tree.Read(root); err != nil {
+ if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil {
return fmt.Errorf("failed to read root hash: %w", err)
}
descriptor := VerityDescriptor{
@@ -328,28 +298,24 @@ func verifyMetadata(params *VerifyParams, layout Layout) error {
GID: params.GID,
RootHash: root,
}
- return verifyDescriptor(descriptor, params.ExpectedRoot)
+ return descriptor.verify(params.Expected)
}
// Verify verifies the content read from data with offset. The content is
// verified against tree. If content spans across multiple blocks, each block is
// verified. Verification fails if the hash of the data does not match the tree
-// at any level, or if the final root hash does not match expectedRoot.
+// at any level, or if the final root hash does not match expected.
// Once the data is verified, it will be written using params.Out.
//
// Verify checks for both target file content and metadata. If readSize is 0,
// only metadata is checked.
-//
-// Verify will modify the cursor for data, but always restores it to its
-// original position upon exit. The cursor for tree is modified and not
-// restored.
func Verify(params *VerifyParams) (int64, error) {
if params.ReadSize < 0 {
return 0, fmt.Errorf("unexpected read size: %d", params.ReadSize)
}
layout := InitLayout(int64(params.Size), params.DataAndTreeInSameFile)
if params.ReadSize == 0 {
- return 0, verifyMetadata(params, layout)
+ return 0, verifyMetadata(params, &layout)
}
// Calculate the index of blocks that includes the target range in input
@@ -357,26 +323,13 @@ func Verify(params *VerifyParams) (int64, error) {
firstDataBlock := params.ReadOffset / layout.blockSize
lastDataBlock := (params.ReadOffset + params.ReadSize - 1) / layout.blockSize
- // Store the current offset, so we can set it back once verification
- // finishes.
- origOffset, err := params.File.Seek(0, io.SeekCurrent)
- if err != nil {
- return 0, fmt.Errorf("find current data offset failed: %w", err)
- }
- defer params.File.Seek(origOffset, io.SeekStart)
-
- // Move to the first block that contains target data.
- if _, err := params.File.Seek(firstDataBlock*layout.blockSize, io.SeekStart); err != nil {
- return 0, fmt.Errorf("seek to datablock start failed: %w", err)
- }
-
buf := make([]byte, layout.blockSize)
var readErr error
total := int64(0)
for i := firstDataBlock; i <= lastDataBlock; i++ {
// Read a block that includes all or part of target range in
// input data.
- bytesRead, err := params.File.Read(buf)
+ bytesRead, err := params.File.ReadAt(buf, i*layout.blockSize)
readErr = err
// If at the end of input data and all previous blocks are
// verified, return the verified input data and EOF.
@@ -401,7 +354,7 @@ func Verify(params *VerifyParams) (int64, error) {
UID: params.UID,
GID: params.GID,
}
- if err := verifyBlock(params.Tree, descriptor, layout, buf, i, params.ExpectedRoot); err != nil {
+ if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.Expected); err != nil {
return 0, err
}
@@ -441,11 +394,8 @@ func Verify(params *VerifyParams) (int64, error) {
// original data. The block is verified through each level of the tree. It
// fails if the calculated hash from block is different from any level of
// hashes stored in tree. And the final root hash is compared with
-// expectedRoot.
-//
-// verifyBlock modifies the cursor for tree. Users needs to maintain the cursor
-// if intended.
-func verifyBlock(tree io.ReadSeeker, descriptor VerityDescriptor, layout Layout, dataBlock []byte, blockIndex int64, expectedRoot []byte) error {
+// expected.
+func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, expected []byte) error {
if len(dataBlock) != int(layout.blockSize) {
return fmt.Errorf("incorrect block size")
}
@@ -462,22 +412,16 @@ func verifyBlock(tree io.ReadSeeker, descriptor VerityDescriptor, layout Layout,
// Read a block in previous level that contains the
// hash we just generated, and generate a next level
// hash from it.
- if _, err := tree.Seek(layout.blockOffset(level-1, blockIndex), io.SeekStart); err != nil {
- return err
- }
- if _, err := tree.Read(treeBlock); err != nil {
+ if _, err := tree.ReadAt(treeBlock, layout.blockOffset(level-1, blockIndex)); err != nil {
return err
}
digestArray := sha256.Sum256(treeBlock)
digest = digestArray[:]
}
- // Move to stored hash for the current block, read the digest
- // and store in expectedDigest.
- if _, err := tree.Seek(layout.digestOffset(level, blockIndex), io.SeekStart); err != nil {
- return err
- }
- if _, err := tree.Read(expectedDigest); err != nil {
+ // Read the digest for the current block and store in
+ // expectedDigest.
+ if _, err := tree.ReadAt(expectedDigest, layout.digestOffset(level, blockIndex)); err != nil {
return err
}
@@ -488,7 +432,7 @@ func verifyBlock(tree io.ReadSeeker, descriptor VerityDescriptor, layout Layout,
}
// Verification for the tree succeeded. Now hash the descriptor with
- // the root hash and compare it with expectedRoot.
+ // the root hash and compare it with expected.
descriptor.RootHash = digest
- return verifyDescriptor(descriptor, expectedRoot)
+ return descriptor.verify(expected)
}
diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go
index bb11ec844..e1350ebda 100644
--- a/pkg/merkletree/merkletree_test.go
+++ b/pkg/merkletree/merkletree_test.go
@@ -106,58 +106,36 @@ func (brw *bytesReadWriter) Write(p []byte) (int, error) {
return len(p), nil
}
-func (brw *bytesReadWriter) Read(p []byte) (int, error) {
- if brw.readPos >= len(brw.bytes) {
- return 0, io.EOF
- }
- bytesRead := copy(p, brw.bytes[brw.readPos:])
- brw.readPos += bytesRead
- if bytesRead < len(p) {
+func (brw *bytesReadWriter) ReadAt(p []byte, off int64) (int, error) {
+ bytesRead := copy(p, brw.bytes[off:])
+ if bytesRead == 0 {
return bytesRead, io.EOF
}
return bytesRead, nil
}
-func (brw *bytesReadWriter) Seek(offset int64, whence int) (int64, error) {
- off := offset
- if whence == io.SeekCurrent {
- off += int64(brw.readPos)
- }
- if whence == io.SeekEnd {
- off += int64(len(brw.bytes))
- }
- if off < 0 {
- panic("seek with negative offset")
- }
- if off >= int64(len(brw.bytes)) {
- return 0, io.EOF
- }
- brw.readPos = int(off)
- return off, nil
-}
-
func TestGenerate(t *testing.T) {
// The input data has size dataSize. It starts with the data in startWith,
// and all other bytes are zeroes.
testCases := []struct {
data []byte
- expectedRoot []byte
+ expectedHash []byte
}{
{
data: bytes.Repeat([]byte{0}, usermem.PageSize),
- expectedRoot: []byte{64, 253, 58, 72, 192, 131, 82, 184, 193, 33, 108, 142, 43, 46, 179, 134, 244, 21, 29, 190, 14, 39, 66, 129, 6, 46, 200, 211, 30, 247, 191, 252},
+ expectedHash: []byte{64, 253, 58, 72, 192, 131, 82, 184, 193, 33, 108, 142, 43, 46, 179, 134, 244, 21, 29, 190, 14, 39, 66, 129, 6, 46, 200, 211, 30, 247, 191, 252},
},
{
data: bytes.Repeat([]byte{0}, 128*usermem.PageSize+1),
- expectedRoot: []byte{182, 223, 218, 62, 65, 185, 160, 219, 93, 119, 186, 88, 205, 32, 122, 231, 173, 72, 78, 76, 65, 57, 177, 146, 159, 39, 44, 123, 230, 156, 97, 26},
+ expectedHash: []byte{182, 223, 218, 62, 65, 185, 160, 219, 93, 119, 186, 88, 205, 32, 122, 231, 173, 72, 78, 76, 65, 57, 177, 146, 159, 39, 44, 123, 230, 156, 97, 26},
},
{
data: []byte{'a'},
- expectedRoot: []byte{28, 201, 8, 36, 150, 178, 111, 5, 193, 212, 129, 205, 206, 124, 211, 90, 224, 142, 81, 183, 72, 165, 243, 240, 242, 241, 76, 127, 101, 61, 63, 11},
+ expectedHash: []byte{28, 201, 8, 36, 150, 178, 111, 5, 193, 212, 129, 205, 206, 124, 211, 90, 224, 142, 81, 183, 72, 165, 243, 240, 242, 241, 76, 127, 101, 61, 63, 11},
},
{
data: bytes.Repeat([]byte{'a'}, usermem.PageSize),
- expectedRoot: []byte{106, 58, 160, 152, 41, 68, 38, 108, 245, 74, 177, 84, 64, 193, 19, 176, 249, 86, 27, 193, 85, 164, 99, 240, 79, 104, 148, 222, 76, 46, 191, 79},
+ expectedHash: []byte{106, 58, 160, 152, 41, 68, 38, 108, 245, 74, 177, 84, 64, 193, 19, 176, 249, 86, 27, 193, 85, 164, 99, 240, 79, 104, 148, 222, 76, 46, 191, 79},
},
}
@@ -183,13 +161,13 @@ func TestGenerate(t *testing.T) {
bytes: tc.data,
}
}
- root, err := Generate(&params)
+ hash, err := Generate(&params)
if err != nil {
t.Fatalf("Got err: %v, want nil", err)
}
- if !bytes.Equal(root, tc.expectedRoot) {
- t.Errorf("Got root: %v, want %v", root, tc.expectedRoot)
+ if !bytes.Equal(hash, tc.expectedHash) {
+ t.Errorf("Got hash: %v, want %v", hash, tc.expectedHash)
}
}
})
@@ -390,7 +368,7 @@ func TestVerify(t *testing.T) {
bytes: data,
}
}
- root, err := Generate(&genParams)
+ hash, err := Generate(&genParams)
if err != nil {
t.Fatalf("Generate failed: %v", err)
}
@@ -409,7 +387,7 @@ func TestVerify(t *testing.T) {
GID: defaultGID,
ReadOffset: tc.verifyStart,
ReadSize: tc.verifySize,
- ExpectedRoot: root,
+ Expected: hash,
DataAndTreeInSameFile: dataAndTreeInSameFile,
}
if tc.modifyName {
@@ -478,7 +456,7 @@ func TestVerifyRandom(t *testing.T) {
bytes: data,
}
}
- root, err := Generate(&genParams)
+ hash, err := Generate(&genParams)
if err != nil {
t.Fatalf("Generate failed: %v", err)
}
@@ -499,7 +477,7 @@ func TestVerifyRandom(t *testing.T) {
GID: defaultGID,
ReadOffset: start,
ReadSize: size,
- ExpectedRoot: root,
+ Expected: hash,
DataAndTreeInSameFile: dataAndTreeInSameFile,
}
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 668f47802..1d88db12f 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -183,9 +183,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
if initArgs.MountNamespaceVFS2 == nil {
// Set initArgs so that 'ctx' returns the namespace.
//
- // MountNamespaceVFS2 adds a reference to the namespace, which is
- // transferred to the new process.
+ // Add a reference to the namespace, which is transferred to the new process.
initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+ initArgs.MountNamespaceVFS2.IncRef()
}
} else {
if initArgs.MountNamespace == nil {
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 9197aeb88..1dc409d38 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -84,7 +84,8 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRan
// returns a successful partial read, Fill will call it repeatedly until all
// bytes have been read.) EOF is handled consistently with the requirements of
// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are
-// invalid.
+// invalid. fileSize is an upper bound on the file's size; bytes after fileSize
+// will be zeroed without calling readAt.
//
// Fill may read offsets outside of required, but will never read offsets
// outside of optional. It returns a non-nil error if any error occurs, even
@@ -94,7 +95,7 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRan
// * required.Length() > 0.
// * optional.IsSupersetOf(required).
// * required and optional must be page-aligned.
-func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
gap := frs.LowerBoundGap(required.Start)
for gap.Ok() && gap.Start() < required.End {
if gap.Range().Length() == 0 {
@@ -107,7 +108,21 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
var done uint64
for !dsts.IsEmpty() {
- n, err := readAt(ctx, dsts, gr.Start+done)
+ n, err := func() (uint64, error) {
+ off := gr.Start + done
+ if off >= fileSize {
+ return 0, io.EOF
+ }
+ if off+dsts.NumBytes() > fileSize {
+ rd := fileSize - off
+ n, err := readAt(ctx, dsts.TakeFirst64(rd), off)
+ if n == rd && err == nil {
+ return n, io.EOF
+ }
+ return n, err
+ }
+ return readAt(ctx, dsts, off)
+ }()
done += n
dsts = dsts.DropFirst64(n)
if err != nil {
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 85a23432b..82eda3e43 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -644,7 +644,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
End: fs.OffsetPageEnd(int64(gapMR.End)),
}
optMR := gap.Range()
- err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
+ err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.c.cache.Find(uint64(rw.offset))
if !seg.Ok() {
@@ -870,7 +870,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option
}
mf := c.mfp.MemoryFile()
- cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
+ cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 1dc75291d..fc0498f17 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -613,7 +613,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional
}
mf := f.kernel.MemoryFile()
- cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
index 2f5a43b84..124bc95ed 100644
--- a/pkg/sentry/fs/user/path.go
+++ b/pkg/sentry/fs/user/path.go
@@ -121,6 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s
func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
for _, p := range paths {
if !path.IsAbs(p) {
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index 936fd3932..1f8684dc6 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -105,6 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
const defaultHome = "/"
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
creds := auth.CredentialsFromContext(ctx)
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 48e13613a..84baaac66 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -35,6 +35,7 @@ go_library(
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
+ "//pkg/sentry/fs",
"//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 903135fae..d5c5aaa8c 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -37,27 +37,51 @@ const Name = "devpts"
// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
-type FilesystemType struct{}
+type FilesystemType struct {
+ initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+ initErr error
+
+ // fs backs all mounts of this FilesystemType. root is fs' root. fs and root
+ // are immutable.
+ fs *vfs.Filesystem
+ root *vfs.Dentry
+}
// Name implements vfs.FilesystemType.Name.
-func (FilesystemType) Name() string {
+func (*FilesystemType) Name() string {
return Name
}
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
// No data allowed.
if opts.Data != "" {
return nil, nil, syserror.EINVAL
}
- fs, root, err := fstype.newFilesystem(vfsObj, creds)
- if err != nil {
- return nil, nil, err
+ fstype.initOnce.Do(func() {
+ fs, root, err := fstype.newFilesystem(vfsObj, creds)
+ if err != nil {
+ fstype.initErr = err
+ return
+ }
+ fstype.fs = fs.VFSFilesystem()
+ fstype.root = root.VFSDentry()
+ })
+ if fstype.initErr != nil {
+ return nil, nil, fstype.initErr
+ }
+ fstype.fs.IncRef()
+ fstype.root.IncRef()
+ return fstype.fs, fstype.root, nil
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (fstype *FilesystemType) Release(ctx context.Context) {
+ if fstype.fs != nil {
+ fstype.root.DecRef(ctx)
+ fstype.fs.DecRef(ctx)
}
- return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
}
// +stateify savable
@@ -69,7 +93,7 @@ type filesystem struct {
// newFilesystem creates a new devpts filesystem with root directory and ptmx
// master inode. It returns the filesystem and root Dentry.
-func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
+func (fstype *FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
devMinor, err := vfsObj.GetAnonBlockDevMinor()
if err != nil {
return nil, nil, err
@@ -87,7 +111,9 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
root.EnableLeakCheck()
- root.dentry.Init(root)
+
+ var rootD kernfs.Dentry
+ rootD.Init(&fs.Filesystem, root)
// Construct the pts master inode and dentry. Linux always uses inode
// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
@@ -95,15 +121,14 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
root: root,
}
master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
- master.dentry.Init(master)
// Add the master as a child of the root.
- links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
- "ptmx": &master.dentry,
+ links := root.OrderedChildren.Populate(map[string]kernfs.Inode{
+ "ptmx": master,
})
root.IncLinks(links)
- return fs, &root.dentry, nil
+ return fs, &rootD, nil
}
// Release implements vfs.FilesystemImpl.Release.
@@ -117,24 +142,19 @@ func (fs *filesystem) Release(ctx context.Context) {
// +stateify savable
type rootInode struct {
implStatFS
- kernfs.AlwaysValid
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
kernfs.OrderedChildren
rootInodeRefs
locks vfs.FileLocks
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
-
// master is the master pty inode. Immutable.
master *masterInode
- // root is the root directory inode for this filesystem. Immutable.
- root *rootInode
-
// mu protects the fields below.
mu sync.Mutex `state:"nosave"`
@@ -173,21 +193,24 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
// Linux always uses pty index + 3 as the inode id. See
// fs/devpts/inode.c:devpts_pty_new().
replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
- replica.dentry.Init(replica)
i.replicas[idx] = replica
return t, nil
}
// masterClose is called when the master end of t is closed.
-func (i *rootInode) masterClose(t *Terminal) {
+func (i *rootInode) masterClose(ctx context.Context, t *Terminal) {
i.mu.Lock()
defer i.mu.Unlock()
// Sanity check that replica with idx exists.
- if _, ok := i.replicas[t.n]; !ok {
+ ri, ok := i.replicas[t.n]
+ if !ok {
panic(fmt.Sprintf("pty with index %d does not exist", t.n))
}
+
+ // Drop the ref on replica inode taken during rootInode.allocateTerminal.
+ ri.DecRef(ctx)
delete(i.replicas, t.n)
}
@@ -203,16 +226,22 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.D
}
// Lookup implements kernfs.Inode.Lookup.
-func (i *rootInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
+func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+ // Check if a static entry was looked up.
+ if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+ return d, nil
+ }
+
+ // Not a static entry.
idx, err := strconv.ParseUint(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
}
i.mu.Lock()
defer i.mu.Unlock()
- if si, ok := i.replicas[uint32(idx)]; ok {
- si.dentry.IncRef()
- return &si.dentry, nil
+ if ri, ok := i.replicas[uint32(idx)]; ok {
+ ri.IncRef() // This ref is passed to the dentry upon creation via Init.
+ return ri, nil
}
return nil, syserror.ENOENT
@@ -243,8 +272,8 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *rootInode) DecRef(context.Context) {
- i.rootInodeRefs.DecRef(i.Destroy)
+func (i *rootInode) DecRef(ctx context.Context) {
+ i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) })
}
// +stateify savable
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 69c2fe951..fda30fb93 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -42,9 +42,6 @@ type masterInode struct {
locks vfs.FileLocks
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
-
// root is the devpts root inode.
root *rootInode
}
@@ -103,7 +100,7 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
func (mfd *masterFileDescription) Release(ctx context.Context) {
- mfd.inode.root.masterClose(mfd.t)
+ mfd.inode.root.masterClose(ctx, mfd.t)
}
// EventRegister implements waiter.Waitable.EventRegister.
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
index 6515c5536..70c68cf0a 100644
--- a/pkg/sentry/fsimpl/devpts/replica.go
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -41,9 +41,6 @@ type replicaInode struct {
locks vfs.FileLocks
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
-
// root is the devpts root inode.
root *rootInode
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 6d1753080..e6fe0fc0d 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -71,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua
return fst.fs, fst.root, nil
}
+// Release implements vfs.FilesystemType.Release.
+func (fst *FilesystemType) Release(ctx context.Context) {
+ if fst.fs != nil {
+ // Release the original reference obtained when creating the filesystem.
+ fst.root.DecRef(ctx)
+ fst.fs.DecRef(ctx)
+ }
+}
+
// Accessor allows devices to create device special files in devtmpfs.
type Accessor struct {
vfsObj *vfs.VirtualFilesystem
@@ -86,10 +95,13 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth
if err != nil {
return nil, err
}
+ // Pass a reference on root to the Accessor.
+ root := mntns.Root()
+ root.IncRef()
return &Accessor{
vfsObj: vfsObj,
mntns: mntns,
- root: mntns.Root(),
+ root: root,
creds: creds,
}, nil
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 3a38b8bb4..e058eda7a 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -53,6 +53,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
root := mntns.Root()
+ root.IncRef()
devpop := vfs.PathOperation{
Root: root,
Start: root,
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index c349b886e..2ee7cc7ac 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -70,6 +70,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
}
root := mntns.Root()
+ root.IncRef()
tearDown := func() {
root.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index aca258d40..38fb7962b 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -38,9 +38,6 @@ const Name = "ext"
// +stateify savable
type FilesystemType struct{}
-// Compiles only if FilesystemType implements vfs.FilesystemType.
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
// getDeviceFd returns an io.ReaderAt to the underlying device.
// Currently there are two ways of mounting an ext(2/3/4) fs:
// 1. Specify a mount with our internal special MountType in the OCI spec.
@@ -101,6 +98,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 0989558cd..d9fd4590c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -82,6 +82,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
}
root := mntns.Root()
+ root.IncRef()
tearDown := func() {
root.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 65786e42a..e39df21c6 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -98,6 +98,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -249,14 +252,12 @@ func (fs *filesystem) Release(ctx context.Context) {
// +stateify savable
type inode struct {
inodeRefs
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
- kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
kernfs.OrderedChildren
- dentry kernfs.Dentry
-
// the owning filesystem. fs is immutable.
fs *filesystem
@@ -284,26 +285,24 @@ type inode struct {
}
func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
- i := &inode{fs: fs}
+ i := &inode{fs: fs, nodeID: 1}
i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
i.EnableLeakCheck()
- i.dentry.Init(i)
- i.nodeID = 1
- return &i.dentry
+ var d kernfs.Dentry
+ d.Init(&fs.Filesystem, i)
+ return &d
}
-func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry {
+func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) kernfs.Inode {
i := &inode{fs: fs, nodeID: nodeID}
creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)}
i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
atomic.StoreUint64(&i.size, attr.Size)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
i.EnableLeakCheck()
- i.dentry.Init(i)
-
- return &i.dentry
+ return i
}
// Open implements kernfs.Inode.Open.
@@ -410,23 +409,27 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr
}
// Lookup implements kernfs.Inode.Lookup.
-func (i *inode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
+func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
in := linux.FUSELookupIn{Name: name}
return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
}
+// Keep implements kernfs.Inode.Keep.
+func (i *inode) Keep() bool {
+ // Return true so that kernfs keeps the new dentry pointing to this
+ // inode in the dentry tree. This is needed because inodes created via
+ // Lookup are not temporary. They might refer to existing files on server
+ // that can be Unlink'd/Rmdir'd.
+ return true
+}
+
// IterDirents implements kernfs.Inode.IterDirents.
func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
return offset, nil
}
-// Valid implements kernfs.Inode.Valid.
-func (*inode) Valid(ctx context.Context) bool {
- return true
-}
-
// NewFile implements kernfs.Inode.NewFile.
-func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) {
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
kernelTask := kernel.TaskFromContext(ctx)
if kernelTask == nil {
log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
@@ -444,7 +447,7 @@ func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions)
}
// NewNode implements kernfs.Inode.NewNode.
-func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*kernfs.Dentry, error) {
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) {
in := linux.FUSEMknodIn{
MknodMeta: linux.FUSEMknodMeta{
Mode: uint32(opts.Mode),
@@ -457,7 +460,7 @@ func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions)
}
// NewSymlink implements kernfs.Inode.NewSymlink.
-func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.Dentry, error) {
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) {
in := linux.FUSESymLinkIn{
Name: name,
Target: target,
@@ -466,7 +469,7 @@ func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.De
}
// Unlink implements kernfs.Inode.Unlink.
-func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) error {
+func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
kernelTask := kernel.TaskFromContext(ctx)
if kernelTask == nil {
log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
@@ -482,14 +485,11 @@ func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) e
return err
}
// only return error, discard res.
- if err := res.Error(); err != nil {
- return err
- }
- return i.dentry.RemoveChildLocked(name, child)
+ return res.Error()
}
// NewDir implements kernfs.Inode.NewDir.
-func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) {
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
in := linux.FUSEMkdirIn{
MkdirMeta: linux.FUSEMkdirMeta{
Mode: uint32(opts.Mode),
@@ -501,7 +501,7 @@ func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions)
}
// RmDir implements kernfs.Inode.RmDir.
-func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) error {
+func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
fusefs := i.fs
task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
@@ -515,16 +515,12 @@ func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) er
if err != nil {
return err
}
- if err := res.Error(); err != nil {
- return err
- }
-
- return i.dentry.RemoveChildLocked(name, child)
+ return res.Error()
}
// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
-func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*kernfs.Dentry, error) {
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) {
kernelTask := kernel.TaskFromContext(ctx)
if kernelTask == nil {
log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
@@ -734,8 +730,8 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *inode) DecRef(context.Context) {
- i.inodeRefs.DecRef(i.Destroy)
+func (i *inode) DecRef(ctx context.Context) {
+ i.inodeRefs.DecRef(func() { i.Destroy(ctx) })
}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 16787116f..ad0afc41b 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -52,6 +52,7 @@ go_library(
"//pkg/fspath",
"//pkg/log",
"//pkg/p9",
+ "//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/fs/lock",
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 8608471f8..f1dad1b08 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -272,6 +272,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index eeaf6e444..f8b19bae7 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -395,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
End: gapEnd,
}
optMR := gap.Range()
- err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
+ err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt)
mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.d.cache.Find(rw.off)
if !seg.Ok() {
@@ -403,10 +403,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
rw.d.handleMu.RUnlock()
return done, err
}
- // err might have occurred in part of gap.Range() outside
- // gapMR. Forget about it for now; if the error matters and
- // persists, we'll run into it again in a later iteration of
- // this loop.
+ // err might have occurred in part of gap.Range() outside gapMR
+ // (in particular, gap.End() might be beyond EOF). Forget about
+ // it for now; if the error matters and persists, we'll run
+ // into it again in a later iteration of this loop.
} else {
// Read directly from the file.
gapDsts := dsts.TakeFirst64(gapMR.Length())
@@ -780,7 +780,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
mf := d.fs.mfp.MemoryFile()
h := d.readHandleLocked()
- cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
+ cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index ffe4ddb32..698e913fe 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -118,7 +118,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
if err != nil {
return nil, err
}
- d.Init(i)
+ d.Init(&fs.Filesystem, i)
// i.open will take a reference on d.
defer d.DecRef(ctx)
@@ -151,6 +151,9 @@ func (filesystemType) Name() string {
return "none"
}
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
// NewFilesystem sets up and returns a new hostfs filesystem.
//
// Note that there should only ever be one instance of host.filesystem,
@@ -195,6 +198,7 @@ type inode struct {
kernfs.InodeNoStatFS
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
locks vfs.FileLocks
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 5e91e0536..858cc24ce 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -70,6 +70,17 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "synthetic_directory_refs",
+ out = "synthetic_directory_refs.go",
+ package = "kernfs",
+ prefix = "syntheticDirectory",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "syntheticDirectory",
+ },
+)
+
go_library(
name = "kernfs",
srcs = [
@@ -84,6 +95,7 @@ go_library(
"static_directory_refs.go",
"symlink.go",
"synthetic_directory.go",
+ "synthetic_directory_refs.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 0a4cd4057..abf1905d6 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -201,12 +201,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
// these.
childIdx := fd.off - 2
for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
- stat, err := it.Dentry.inode.Stat(ctx, fd.filesystem(), opts)
+ stat, err := it.inode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
dirent := vfs.Dirent{
- Name: it.Name,
+ Name: it.name,
Type: linux.FileMode(stat.Mode).DirentType(),
Ino: stat.Ino,
NextOff: fd.off + 1,
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 5cc1c4281..6426a55f6 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -89,7 +89,7 @@ afterSymlink:
}
if targetVD.Ok() {
err := rp.HandleJump(targetVD)
- targetVD.DecRef(ctx)
+ fs.deferDecRefVD(ctx, targetVD)
if err != nil {
return nil, err
}
@@ -120,22 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// Cached dentry exists, revalidate.
if !child.inode.Valid(ctx) {
delete(parent.children, name)
- vfsObj.InvalidateDentry(ctx, &child.vfsd)
- fs.deferDecRef(child) // Reference from Lookup.
+ if child.inode.Keep() {
+ // Drop the ref owned by kernfs.
+ fs.deferDecRef(child)
+ }
+ vfsObj.InvalidateDentry(ctx, child.VFSDentry())
child = nil
}
}
if child == nil {
// Dentry isn't cached; it either doesn't exist or failed revalidation.
// Attempt to resolve it via Lookup.
- c, err := parent.inode.Lookup(ctx, name)
+ childInode, err := parent.inode.Lookup(ctx, name)
if err != nil {
return nil, err
}
- // Reference on c (provided by Lookup) will be dropped when the dentry
- // fails validation.
- parent.InsertChildLocked(name, c)
- child = c
+ var newChild Dentry
+ newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
+ parent.insertChildLocked(name, &newChild)
+ child = &newChild
+
+ // Drop the ref on newChild. This will cause the dentry to get pruned
+ // from the dentry tree by the end of current filesystem operation
+ // (before returning to the VFS layer) if another ref is not picked on
+ // this dentry.
+ if !childInode.Keep() {
+ fs.deferDecRef(&newChild)
+ }
}
return child, nil
}
@@ -191,7 +202,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
}
// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
+// directory parent, then returns rp.Component().
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
@@ -298,9 +309,9 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
parent, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -324,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EPERM
}
- child, err := parent.inode.NewLink(ctx, pc, d.inode)
+ childI, err := parent.inode.NewLink(ctx, pc, d.inode)
if err != nil {
return err
}
- parent.InsertChildLocked(pc, child)
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChildLocked(pc, &child)
return nil
}
@@ -338,9 +351,9 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
parent, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -355,14 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- child, err := parent.inode.NewDir(ctx, pc, opts)
+ childI, err := parent.inode.NewDir(ctx, pc, opts)
if err != nil {
if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
return err
}
- child = newSyntheticDirectory(rp.Credentials(), opts.Mode)
+ childI = newSyntheticDirectory(rp.Credentials(), opts.Mode)
}
- parent.InsertChildLocked(pc, child)
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChildLocked(pc, &child)
return nil
}
@@ -372,9 +387,9 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
parent, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -389,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- newD, err := parent.inode.NewNode(ctx, pc, opts)
+ newI, err := parent.inode.NewNode(ctx, pc, opts)
if err != nil {
return err
}
- parent.InsertChildLocked(pc, newD)
+ var newD Dentry
+ newD.Init(fs, newI)
+ parent.insertChildLocked(pc, &newD)
return nil
}
@@ -409,22 +426,23 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
return nil, err
}
if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
return nil, err
}
- d.inode.IncRef()
- defer d.inode.DecRef(ctx)
+ // Open may block so we need to unlock fs.mu. IncRef d to prevent
+ // its destruction while fs.mu is unlocked.
+ d.IncRef()
fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
- return d.inode.Open(ctx, rp, d, opts)
+ fd, err := d.inode.Open(ctx, rp, d, opts)
+ d.DecRef(ctx)
+ return fd, err
}
// May create new file.
@@ -438,6 +456,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
unlocked = true
}
}
+ // Process all to-be-decref'd dentries at the end at once.
+ // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked
+ // when this is executed.
+ defer fs.processDeferredDecRefs(ctx)
defer unlock()
if rp.Done() {
if rp.MustBeDir() {
@@ -449,14 +471,16 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- d.inode.IncRef()
- defer d.inode.DecRef(ctx)
+ // Open may block so we need to unlock fs.mu. IncRef d to prevent
+ // its destruction while fs.mu is unlocked.
+ d.IncRef()
unlock()
- return d.inode.Open(ctx, rp, d, opts)
+ fd, err := d.inode.Open(ctx, rp, d, opts)
+ d.DecRef(ctx)
+ return fd, err
}
afterTrailingSymlink:
parent, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return nil, err
}
@@ -487,18 +511,23 @@ afterTrailingSymlink:
}
defer rp.Mount().EndWrite()
// Create and open the child.
- child, err := parent.inode.NewFile(ctx, pc, opts)
+ childI, err := parent.inode.NewFile(ctx, pc, opts)
if err != nil {
return nil, err
}
+ var child Dentry
+ child.Init(fs, childI)
// FIXME(gvisor.dev/issue/1193): Race between checking existence with
- // fs.stepExistingLocked and parent.InsertChild. If possible, we should hold
+ // fs.stepExistingLocked and parent.insertChild. If possible, we should hold
// dirMu from one to the other.
- parent.InsertChild(pc, child)
- child.inode.IncRef()
- defer child.inode.DecRef(ctx)
+ parent.insertChild(pc, &child)
+ // Open may block so we need to unlock fs.mu. IncRef child to prevent
+ // its destruction while fs.mu is unlocked.
+ child.IncRef()
unlock()
- return child.inode.Open(ctx, rp, child, opts)
+ fd, err := child.inode.Open(ctx, rp, &child, opts)
+ child.DecRef(ctx)
+ return fd, err
}
if err != nil {
return nil, err
@@ -514,7 +543,7 @@ afterTrailingSymlink:
}
if targetVD.Ok() {
err := rp.HandleJump(targetVD)
- targetVD.DecRef(ctx)
+ fs.deferDecRefVD(ctx, targetVD)
if err != nil {
return nil, err
}
@@ -530,18 +559,21 @@ afterTrailingSymlink:
if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- child.inode.IncRef()
- defer child.inode.DecRef(ctx)
+ // Open may block so we need to unlock fs.mu. IncRef child to prevent
+ // its destruction while fs.mu is unlocked.
+ child.IncRef()
unlock()
- return child.inode.Open(ctx, rp, child, opts)
+ fd, err := child.inode.Open(ctx, rp, child, opts)
+ child.DecRef(ctx)
+ return fd, err
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return "", err
}
@@ -560,7 +592,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
fs.mu.Lock()
- defer fs.processDeferredDecRefsLocked(ctx)
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
// Resolve the destination directory first to verify that it's on this
@@ -632,24 +664,27 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
return err
}
- replaced, err := srcDir.inode.Rename(ctx, src.name, pc, src, dstDir)
+ err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode)
if err != nil {
virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
return err
}
delete(srcDir.children, src.name)
if srcDir != dstDir {
- fs.deferDecRef(srcDir)
- dstDir.IncRef()
+ fs.deferDecRef(srcDir) // child (src) drops ref on old parent.
+ dstDir.IncRef() // child (src) takes a ref on the new parent.
}
src.parent = dstDir
src.name = pc
if dstDir.children == nil {
dstDir.children = make(map[string]*Dentry)
}
+ replaced := dstDir.children[pc]
dstDir.children[pc] = src
var replaceVFSD *vfs.Dentry
if replaced != nil {
+ // deferDecRef so that fs.mu and dstDir.mu are unlocked by then.
+ fs.deferDecRef(replaced)
replaceVFSD = replaced.VFSDentry()
}
virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD)
@@ -659,10 +694,10 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -691,10 +726,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
- if err := parentDentry.inode.RmDir(ctx, d.name, d); err != nil {
+ if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
+ delete(parentDentry.children, d.name)
+ // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+ fs.deferDecRef(d)
virtfs.CommitDeleteDentry(ctx, vfsd)
return nil
}
@@ -702,9 +740,9 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return err
}
@@ -717,9 +755,9 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statx{}, err
}
@@ -729,9 +767,9 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statfs{}, err
}
@@ -744,9 +782,9 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
parent, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -761,21 +799,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return err
}
defer rp.Mount().EndWrite()
- child, err := parent.inode.NewSymlink(ctx, pc, target)
+ childI, err := parent.inode.NewSymlink(ctx, pc, target)
if err != nil {
return err
}
- parent.InsertChildLocked(pc, child)
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChildLocked(pc, &child)
return nil
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -799,10 +839,13 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, d.name, d); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
+ delete(parentDentry.children, d.name)
+ // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+ fs.deferDecRef(d)
virtfs.CommitDeleteDentry(ctx, vfsd)
return nil
}
@@ -810,9 +853,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
d, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return nil, err
}
@@ -825,9 +868,9 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
_, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return nil, err
}
@@ -838,9 +881,9 @@ func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
_, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return "", err
}
@@ -851,9 +894,9 @@ func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
_, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return err
}
@@ -864,9 +907,9 @@ func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
_, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs(ctx)
if err != nil {
return err
}
@@ -880,3 +923,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
defer fs.mu.RUnlock()
return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
}
+
+func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
+ if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
+ // The following is equivalent to vd.DecRef(ctx). This is needed
+ // because if d belongs to this filesystem, we can not DecRef it right
+ // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we
+ // defer the DecRef to when locks are dropped.
+ vd.Mount().DecRef(ctx)
+ fs.deferDecRef(d)
+ } else {
+ vd.DecRef(ctx)
+ }
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 49210e748..122b10591 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -34,6 +34,7 @@ import (
//
// +stateify savable
type InodeNoopRefCount struct {
+ InodeTemporary
}
// IncRef implements Inode.IncRef.
@@ -57,27 +58,27 @@ func (InodeNoopRefCount) TryIncRef() bool {
type InodeDirectoryNoNewChildren struct{}
// NewFile implements Inode.NewFile.
-func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
return nil, syserror.EPERM
}
// NewDir implements Inode.NewDir.
-func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
return nil, syserror.EPERM
}
// NewLink implements Inode.NewLink.
-func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) {
return nil, syserror.EPERM
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) {
return nil, syserror.EPERM
}
// NewNode implements Inode.NewNode.
-func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
return nil, syserror.EPERM
}
@@ -88,6 +89,7 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt
//
// +stateify savable
type InodeNotDirectory struct {
+ InodeAlwaysValid
}
// HasChildren implements Inode.HasChildren.
@@ -96,47 +98,47 @@ func (InodeNotDirectory) HasChildren() bool {
}
// NewFile implements Inode.NewFile.
-func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
panic("NewFile called on non-directory inode")
}
// NewDir implements Inode.NewDir.
-func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
panic("NewDir called on non-directory inode")
}
// NewLink implements Inode.NewLinkink.
-func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) {
panic("NewLink called on non-directory inode")
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) {
panic("NewSymlink called on non-directory inode")
}
// NewNode implements Inode.NewNode.
-func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
panic("NewNode called on non-directory inode")
}
// Unlink implements Inode.Unlink.
-func (InodeNotDirectory) Unlink(context.Context, string, *Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, Inode) error {
panic("Unlink called on non-directory inode")
}
// RmDir implements Inode.RmDir.
-func (InodeNotDirectory) RmDir(context.Context, string, *Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, Inode) error {
panic("RmDir called on non-directory inode")
}
// Rename implements Inode.Rename.
-func (InodeNotDirectory) Rename(context.Context, string, string, *Dentry, *Dentry) (*Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error {
panic("Rename called on non-directory inode")
}
// Lookup implements Inode.Lookup.
-func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) {
panic("Lookup called on non-directory inode")
}
@@ -145,35 +147,6 @@ func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDiren
panic("IterDirents called on non-directory inode")
}
-// Valid implements Inode.Valid.
-func (InodeNotDirectory) Valid(context.Context) bool {
- return true
-}
-
-// InodeNoDynamicLookup partially implements the Inode interface, specifically
-// the inodeDynamicLookup sub interface. Directory inodes that do not support
-// dymanic entries (i.e. entries that are not "hashed" into the
-// vfs.Dentry.children) can embed this to provide no-op implementations for
-// functions related to dynamic entries.
-//
-// +stateify savable
-type InodeNoDynamicLookup struct{}
-
-// Lookup implements Inode.Lookup.
-func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*Dentry, error) {
- return nil, syserror.ENOENT
-}
-
-// IterDirents implements Inode.IterDirents.
-func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
- return offset, nil
-}
-
-// Valid implements Inode.Valid.
-func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
- return true
-}
-
// InodeNotSymlink partially implements the Inode interface, specifically the
// inodeSymlink sub interface. All inodes that are not symlinks may embed this
// to return the appropriate errors from symlink-related functions.
@@ -273,7 +246,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
// SetInodeStat sets the corresponding attributes from opts to InodeAttrs.
// This function can be used by other kernfs-based filesystem implementation to
-// sets the unexported attributes into kernfs.InodeAttrs.
+// sets the unexported attributes into InodeAttrs.
func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
if opts.Stat.Mask == 0 {
return nil
@@ -344,8 +317,9 @@ func (a *InodeAttrs) DecLinks() {
// +stateify savable
type slot struct {
- Name string
- Dentry *Dentry
+ name string
+ inode Inode
+ static bool
slotEntry
}
@@ -361,10 +335,18 @@ type OrderedChildrenOptions struct {
}
// OrderedChildren partially implements the Inode interface. OrderedChildren can
-// be embedded in directory inodes to keep track of the children in the
+// be embedded in directory inodes to keep track of children in the
// directory, and can then be used to implement a generic directory FD -- see
-// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
-// directories.
+// GenericDirectoryFD.
+//
+// OrderedChildren can represent a node in an Inode tree. The children inodes
+// might be directories themselves using OrderedChildren; hence extending the
+// tree. The parent inode (OrderedChildren user) holds a ref on all its static
+// children. This lets the static inodes outlive their associated dentry.
+// While the dentry might have to be regenerated via a Lookup() call, we can
+// keep reusing the same static inode. These static children inodes are finally
+// DecRef'd when this directory inode is being destroyed. This makes
+// OrderedChildren suitable for static directory entries as well.
//
// Must be initialize with Init before first use.
//
@@ -388,33 +370,63 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
// Destroy clears the children stored in o. It should be called by structs
// embedding OrderedChildren upon destruction, i.e. when their reference count
// reaches zero.
-func (o *OrderedChildren) Destroy() {
+func (o *OrderedChildren) Destroy(ctx context.Context) {
o.mu.Lock()
defer o.mu.Unlock()
+ // Drop the ref that o owns on the static inodes it holds.
+ for _, s := range o.set {
+ if s.static {
+ s.inode.DecRef(ctx)
+ }
+ }
o.order.Reset()
o.set = nil
}
-// Populate inserts children into this OrderedChildren, and d's dentry
-// cache. Populate returns the number of directories inserted, which the caller
+// Populate inserts static children into this OrderedChildren.
+// Populate returns the number of directories inserted, which the caller
// may use to update the link count for the parent directory.
//
-// Precondition: d must represent a directory inode. children must not contain
-// any conflicting entries already in o.
-func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+// Precondition:
+// * d must represent a directory inode.
+// * children must not contain any conflicting entries already in o.
+// * Caller must hold a reference on all inodes passed.
+//
+// Postcondition: Caller's references on inodes are transferred to o.
+func (o *OrderedChildren) Populate(children map[string]Inode) uint32 {
var links uint32
for name, child := range children {
- if child.isDir() {
+ if child.Mode().IsDir() {
links++
}
- if err := o.Insert(name, child); err != nil {
- panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+ if err := o.insert(name, child, true); err != nil {
+ panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child))
}
- d.InsertChild(name, child)
}
return links
}
+// Lookup implements Inode.Lookup.
+func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) {
+ o.mu.RLock()
+ defer o.mu.RUnlock()
+
+ s, ok := o.set[name]
+ if !ok {
+ return nil, syserror.ENOENT
+ }
+
+ s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
+ return s.inode, nil
+}
+
+// IterDirents implements Inode.IterDirents.
+func (o *OrderedChildren) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+ // All entries from OrderedChildren have already been handled in
+ // GenericDirectoryFD.IterDirents.
+ return offset, nil
+}
+
// HasChildren implements Inode.HasChildren.
func (o *OrderedChildren) HasChildren() bool {
o.mu.RLock()
@@ -422,17 +434,27 @@ func (o *OrderedChildren) HasChildren() bool {
return len(o.set) > 0
}
-// Insert inserts child into o. This ignores the writability of o, as this is
-// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
-func (o *OrderedChildren) Insert(name string, child *Dentry) error {
+// Insert inserts a dynamic child into o. This ignores the writability of o, as
+// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child Inode) error {
+ return o.insert(name, child, false)
+}
+
+// insert inserts child into o.
+//
+// Precondition: Caller must be holding a ref on child if static is true.
+//
+// Postcondition: Caller's ref on child is transferred to o if static is true.
+func (o *OrderedChildren) insert(name string, child Inode, static bool) error {
o.mu.Lock()
defer o.mu.Unlock()
if _, ok := o.set[name]; ok {
return syserror.EEXIST
}
s := &slot{
- Name: name,
- Dentry: child,
+ name: name,
+ inode: child,
+ static: static,
}
o.order.PushBack(s)
o.set[name] = s
@@ -442,44 +464,49 @@ func (o *OrderedChildren) Insert(name string, child *Dentry) error {
// Precondition: caller must hold o.mu for writing.
func (o *OrderedChildren) removeLocked(name string) {
if s, ok := o.set[name]; ok {
+ if s.static {
+ panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode))
+ }
delete(o.set, name)
o.order.Remove(s)
}
}
// Precondition: caller must hold o.mu for writing.
-func (o *OrderedChildren) replaceChildLocked(name string, new *Dentry) *Dentry {
+func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) {
if s, ok := o.set[name]; ok {
+ if s.static {
+ panic(fmt.Sprintf("replacing a static inode: %v", s.inode))
+ }
+
// Existing slot with given name, simply replace the dentry.
- var old *Dentry
- old, s.Dentry = s.Dentry, new
- return old
+ s.inode = newI
}
// No existing slot with given name, create and hash new slot.
s := &slot{
- Name: name,
- Dentry: new,
+ name: name,
+ inode: newI,
+ static: false,
}
o.order.PushBack(s)
o.set[name] = s
- return nil
}
// Precondition: caller must hold o.mu for reading or writing.
-func (o *OrderedChildren) checkExistingLocked(name string, child *Dentry) error {
+func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
s, ok := o.set[name]
if !ok {
return syserror.ENOENT
}
- if s.Dentry != child {
- panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+ if s.inode != child {
+ panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
}
return nil
}
// Unlink implements Inode.Unlink.
-func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry) error {
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error {
if !o.writable {
return syserror.EPERM
}
@@ -494,8 +521,8 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry
return nil
}
-// Rmdir implements Inode.Rmdir.
-func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *Dentry) error {
+// RmDir implements Inode.RmDir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error {
// We're not responsible for checking that child is a directory, that it's
// empty, or updating any link counts; so this is the same as unlink.
return o.Unlink(ctx, name, child)
@@ -517,13 +544,13 @@ func (renameAcrossDifferentImplementationsError) Error() string {
// that will support Rename.
//
// Postcondition: reference on any replaced dentry transferred to caller.
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (*Dentry, error) {
- dst, ok := dstDir.inode.(interface{}).(*OrderedChildren)
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error {
+ dst, ok := dstDir.(interface{}).(*OrderedChildren)
if !ok {
- return nil, renameAcrossDifferentImplementationsError{}
+ return renameAcrossDifferentImplementationsError{}
}
if !o.writable || !dst.writable {
- return nil, syserror.EPERM
+ return syserror.EPERM
}
// Note: There's a potential deadlock below if concurrent calls to Rename
// refer to the same src and dst directories in reverse. We avoid any
@@ -536,12 +563,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
defer dst.mu.Unlock()
}
if err := o.checkExistingLocked(oldname, child); err != nil {
- return nil, err
+ return err
}
// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
- replaced := dst.replaceChildLocked(newname, child)
- return replaced, nil
+ dst.replaceChildLocked(ctx, newname, child)
+ return nil
}
// nthLocked returns an iterator to the nth child tracked by this object. The
@@ -576,11 +603,12 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry,
//
// +stateify savable
type StaticDirectory struct {
+ InodeAlwaysValid
InodeAttrs
InodeDirectoryNoNewChildren
- InodeNoDynamicLookup
InodeNoStatFS
InodeNotSymlink
+ InodeTemporary
OrderedChildren
StaticDirectoryRefs
@@ -591,19 +619,16 @@ type StaticDirectory struct {
var _ Inode = (*StaticDirectory)(nil)
// NewStaticDir creates a new static directory and returns its dentry.
-func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry {
+func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode {
inode := &StaticDirectory{}
inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts)
inode.EnableLeakCheck()
- dentry := &Dentry{}
- dentry.Init(inode)
-
inode.OrderedChildren.Init(OrderedChildrenOptions{})
- links := inode.OrderedChildren.Populate(dentry, children)
+ links := inode.OrderedChildren.Populate(children)
inode.IncLinks(links)
- return dentry
+ return inode
}
// Init initializes StaticDirectory.
@@ -615,7 +640,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
}
-// Open implements kernfs.Inode.Open.
+// Open implements Inode.Open.
func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
if err != nil {
@@ -624,26 +649,36 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *De
return fd.VFSFileDescription(), nil
}
-// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.DecRef.
-func (s *StaticDirectory) DecRef(context.Context) {
- s.StaticDirectoryRefs.DecRef(s.Destroy)
+// DecRef implements Inode.DecRef.
+func (s *StaticDirectory) DecRef(ctx context.Context) {
+ s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) })
}
-// AlwaysValid partially implements kernfs.inodeDynamicLookup.
+// InodeAlwaysValid partially implements Inode.
//
// +stateify savable
-type AlwaysValid struct{}
+type InodeAlwaysValid struct{}
-// Valid implements kernfs.inodeDynamicLookup.Valid.
-func (*AlwaysValid) Valid(context.Context) bool {
+// Valid implements Inode.Valid.
+func (*InodeAlwaysValid) Valid(context.Context) bool {
return true
}
+// InodeTemporary partially implements Inode.
+//
+// +stateify savable
+type InodeTemporary struct{}
+
+// Keep implements Inode.Keep.
+func (*InodeTemporary) Keep() bool {
+ return false
+}
+
// InodeNoStatFS partially implements the Inode interface, where the client
// filesystem doesn't support statfs(2).
//
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 6d3d79333..606081e68 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -29,12 +29,16 @@
//
// Reference Model:
//
-// Kernfs dentries represents named pointers to inodes. Dentries and inodes have
+// Kernfs dentries represents named pointers to inodes. Kernfs is solely
+// reponsible for maintaining and modifying its dentry tree; inode
+// implementations can not access the tree. Dentries and inodes have
// independent lifetimes and reference counts. A child dentry unconditionally
// holds a reference on its parent directory's dentry. A dentry also holds a
-// reference on the inode it points to. Multiple dentries can point to the same
-// inode (for example, in the case of hardlinks). File descriptors hold a
-// reference to the dentry they're opened on.
+// reference on the inode it points to (although that might not be the only
+// reference on the inode). Due to this inodes can outlive the dentries that
+// point to them. Multiple dentries can point to the same inode (for example,
+// in the case of hardlinks). File descriptors hold a reference to the dentry
+// they're opened on.
//
// Dentries are guaranteed to exist while holding Filesystem.mu for
// reading. Dropping dentries require holding Filesystem.mu for writing. To
@@ -47,8 +51,8 @@
// kernfs.Dentry.dirMu
// vfs.VirtualFilesystem.mountMu
// vfs.Dentry.mu
-// kernfs.Filesystem.droppedDentriesMu
// (inode implementation locks, if any)
+// kernfs.Filesystem.droppedDentriesMu
package kernfs
import (
@@ -60,7 +64,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
@@ -95,7 +98,7 @@ type Filesystem struct {
// example:
//
// fs.mu.RLock()
- // fs.mu.processDeferredDecRefs()
+ // defer fs.processDeferredDecRefs()
// defer fs.mu.RUnlock()
// ...
// fs.deferDecRef(dentry)
@@ -108,8 +111,7 @@ type Filesystem struct {
// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
-//
-// Precondition: d must not already be pending destruction.
+// This may be called while Filesystem.mu or Dentry.dirMu is locked.
func (fs *Filesystem) deferDecRef(d *Dentry) {
fs.droppedDentriesMu.Lock()
fs.droppedDentries = append(fs.droppedDentries, d)
@@ -118,17 +120,14 @@ func (fs *Filesystem) deferDecRef(d *Dentry) {
// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
// droppedDentries list. See comment on Filesystem.mu.
+//
+// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
- fs.mu.Lock()
- fs.processDeferredDecRefsLocked(ctx)
- fs.mu.Unlock()
-}
-
-// Precondition: fs.mu must be held for writing.
-func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) {
fs.droppedDentriesMu.Lock()
for _, d := range fs.droppedDentries {
- d.DecRef(ctx)
+ // Defer the DecRef call so that we are not holding droppedDentriesMu
+ // when DecRef is called.
+ defer d.DecRef(ctx)
}
fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
fs.droppedDentriesMu.Unlock()
@@ -157,17 +156,19 @@ const (
//
// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
// named reference to an inode. A dentry generally lives as long as it's part of
-// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
-// to them are removed. Dentries hold a single reference to the inode they point
+// a mounted filesystem tree. Kernfs drops dentries once all references to them
+// are dropped. Dentries hold a single reference to the inode they point
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
//
// +stateify savable
type Dentry struct {
+ vfsd vfs.Dentry
DentryRefs
- vfsd vfs.Dentry
+ // fs is the owning filesystem. fs is immutable.
+ fs *Filesystem
// flags caches useful information about the dentry from the inode. See the
// dflags* consts above. Must be accessed by atomic ops.
@@ -192,8 +193,9 @@ type Dentry struct {
// Precondition: Caller must hold a reference on inode.
//
// Postcondition: Caller's reference on inode is transferred to the dentry.
-func (d *Dentry) Init(inode Inode) {
+func (d *Dentry) Init(fs *Filesystem, inode Inode) {
d.vfsd.Init(d)
+ d.fs = fs
d.inode = inode
ftype := inode.Mode().FileType()
if ftype == linux.ModeDirectory {
@@ -222,14 +224,28 @@ func (d *Dentry) isSymlink() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *Dentry) DecRef(ctx context.Context) {
- // Before the destructor is called, Dentry must be removed from VFS' dentry cache.
+ decRefParent := false
+ d.fs.mu.Lock()
d.DentryRefs.DecRef(func() {
d.inode.DecRef(ctx) // IncRef from Init.
d.inode = nil
if d.parent != nil {
- d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
+ // We will DecRef d.parent once all locks are dropped.
+ decRefParent = true
+ d.parent.dirMu.Lock()
+ // Remove d from parent.children. It might already have been
+ // removed due to invalidation.
+ if _, ok := d.parent.children[d.name]; ok {
+ delete(d.parent.children, d.name)
+ d.fs.VFSFilesystem().VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
+ }
+ d.parent.dirMu.Unlock()
}
})
+ d.fs.mu.Unlock()
+ if decRefParent {
+ d.parent.DecRef(ctx) // IncRef from Dentry.insertChild.
+ }
}
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
@@ -247,26 +263,26 @@ func (d *Dentry) Watches() *vfs.Watches {
// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
func (d *Dentry) OnZeroWatches(context.Context) {}
-// InsertChild inserts child into the vfs dentry cache with the given name under
+// insertChild inserts child into the vfs dentry cache with the given name under
// this dentry. This does not update the directory inode, so calling this on its
// own isn't sufficient to insert a child into a directory.
//
// Precondition: d must represent a directory inode.
-func (d *Dentry) InsertChild(name string, child *Dentry) {
+func (d *Dentry) insertChild(name string, child *Dentry) {
d.dirMu.Lock()
- d.InsertChildLocked(name, child)
+ d.insertChildLocked(name, child)
d.dirMu.Unlock()
}
-// InsertChildLocked is equivalent to InsertChild, with additional
+// insertChildLocked is equivalent to insertChild, with additional
// preconditions.
//
// Preconditions:
// * d must represent a directory inode.
// * d.dirMu must be locked.
-func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
+func (d *Dentry) insertChildLocked(name string, child *Dentry) {
if !d.isDir() {
- panic(fmt.Sprintf("InsertChildLocked called on non-directory Dentry: %+v.", d))
+ panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
}
d.IncRef() // DecRef in child's Dentry.destroy.
child.parent = d
@@ -277,36 +293,6 @@ func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
d.children[name] = child
}
-// RemoveChild removes child from the vfs dentry cache. This does not update the
-// directory inode or modify the inode to be unlinked. So calling this on its own
-// isn't sufficient to remove a child from a directory.
-//
-// Precondition: d must represent a directory inode.
-func (d *Dentry) RemoveChild(name string, child *Dentry) error {
- d.dirMu.Lock()
- defer d.dirMu.Unlock()
- return d.RemoveChildLocked(name, child)
-}
-
-// RemoveChildLocked is equivalent to RemoveChild, with additional
-// preconditions.
-//
-// Precondition: d.dirMu must be locked.
-func (d *Dentry) RemoveChildLocked(name string, child *Dentry) error {
- if !d.isDir() {
- panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d))
- }
- c, ok := d.children[name]
- if !ok {
- return syserror.ENOENT
- }
- if c != child {
- panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child))
- }
- delete(d.children, name)
- return nil
-}
-
// Inode returns the dentry's inode.
func (d *Dentry) Inode() Inode {
return d.inode
@@ -348,11 +334,6 @@ type Inode interface {
// a blanket implementation for all non-directory inodes.
inodeDirectory
- // Method for inodes that represent dynamic directories and their
- // children. InodeNoDynamicLookup provides a blanket implementation for all
- // non-dynamic-directory inodes.
- inodeDynamicLookup
-
// Open creates a file description for the filesystem object represented by
// this inode. The returned file description should hold a reference on the
// dentry for its lifetime.
@@ -365,6 +346,14 @@ type Inode interface {
// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
// doesn't support statfs(2), this should return ENOSYS.
StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)
+
+ // Keep indicates whether the dentry created after Inode.Lookup should be
+ // kept in the kernfs dentry tree.
+ Keep() bool
+
+ // Valid should return true if this inode is still valid, or needs to
+ // be resolved again by a call to Lookup.
+ Valid(ctx context.Context) bool
}
type inodeRefs interface {
@@ -397,8 +386,8 @@ type inodeMetadata interface {
// Precondition: All methods in this interface may only be called on directory
// inodes.
type inodeDirectory interface {
- // The New{File,Dir,Node,Symlink} methods below should return a new inode
- // hashed into this inode.
+ // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
+ // that will be hashed into the dentry tree.
//
// These inode constructors are inode-level operations rather than
// filesystem-level operations to allow client filesystems to mix different
@@ -409,60 +398,54 @@ type inodeDirectory interface {
HasChildren() bool
// NewFile creates a new regular file inode.
- NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error)
+ NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)
// NewDir creates a new directory inode.
- NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error)
+ NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)
// NewLink creates a new hardlink to a specified inode in this
// directory. Implementations should create a new kernfs Dentry pointing to
// target, and update target's link count.
- NewLink(ctx context.Context, name string, target Inode) (*Dentry, error)
+ NewLink(ctx context.Context, name string, target Inode) (Inode, error)
// NewSymlink creates a new symbolic link inode.
- NewSymlink(ctx context.Context, name, target string) (*Dentry, error)
+ NewSymlink(ctx context.Context, name, target string) (Inode, error)
// NewNode creates a new filesystem node for a mknod syscall.
- NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error)
+ NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)
// Unlink removes a child dentry from this directory inode.
- Unlink(ctx context.Context, name string, child *Dentry) error
+ Unlink(ctx context.Context, name string, child Inode) error
// RmDir removes an empty child directory from this directory
// inode. Implementations must update the parent directory's link count,
// if required. Implementations are not responsible for checking that child
// is a directory, checking for an empty directory.
- RmDir(ctx context.Context, name string, child *Dentry) error
+ RmDir(ctx context.Context, name string, child Inode) error
// Rename is called on the source directory containing an inode being
// renamed. child should point to the resolved child in the source
- // directory. If Rename replaces a dentry in the destination directory, it
- // should return the replaced dentry or nil otherwise.
+ // directory.
//
// Precondition: Caller must serialize concurrent calls to Rename.
- Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (replaced *Dentry, err error)
-}
+ Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
-type inodeDynamicLookup interface {
- // Lookup should return an appropriate dentry if name should resolve to a
- // child of this dynamic directory inode. This gives the directory an
- // opportunity on every lookup to resolve additional entries that aren't
- // hashed into the directory. This is only called when the inode is a
- // directory. If the inode is not a directory, or if the directory only
- // contains a static set of children, the implementer can unconditionally
- // return an appropriate error (ENOTDIR and ENOENT respectively).
+ // Lookup should return an appropriate inode if name should resolve to a
+ // child of this directory inode. This gives the directory an opportunity
+ // on every lookup to resolve additional entries. This is only called when
+ // the inode is a directory.
//
- // The child returned by Lookup will be hashed into the VFS dentry tree. Its
- // lifetime can be controlled by the filesystem implementation with an
- // appropriate implementation of Valid.
+ // The child returned by Lookup will be hashed into the VFS dentry tree,
+ // atleast for the duration of the current FS operation.
//
- // Lookup returns the child with an extra reference and the caller owns this
- // reference.
- Lookup(ctx context.Context, name string) (*Dentry, error)
-
- // Valid should return true if this inode is still valid, or needs to
- // be resolved again by a call to Lookup.
- Valid(ctx context.Context) bool
+ // Lookup must return the child with an extra reference whose ownership is
+ // transferred to the dentry that is created to point to that inode. If
+ // Inode.Keep returns false, that new dentry will be dropped at the end of
+ // the current filesystem operation (before returning back to the VFS
+ // layer) if no other ref is picked on that dentry. If Inode.Keep returns
+ // true, then the dentry will be cached into the dentry tree until it is
+ // Unlink'd or RmDir'd.
+ Lookup(ctx context.Context, name string) (Inode, error)
// IterDirents is used to iterate over dynamically created entries. It invokes
// cb on each entry in the directory represented by the Inode.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index e413242dc..82fa19c03 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file."
// RootDentryFn is a generator function for creating the root dentry of a test
// filesystem. See newTestSystem.
-type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+type RootDentryFn func(*auth.Credentials, *filesystem) kernfs.Inode
// newTestSystem sets up a minimal environment for running a test, including an
// instance of a test filesystem. Tests can control the contents of the
@@ -72,14 +72,11 @@ type file struct {
content string
}
-func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+func (fs *filesystem) newFile(creds *auth.Credentials, content string) kernfs.Inode {
f := &file{}
f.content = content
f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
-
- d := &kernfs.Dentry{}
- d.Init(f)
- return d
+ return f
}
func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
@@ -98,27 +95,23 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
type readonlyDir struct {
readonlyDirRefs
attrs
+ kernfs.InodeAlwaysValid
kernfs.InodeDirectoryNoNewChildren
- kernfs.InodeNoDynamicLookup
kernfs.InodeNoStatFS
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
locks vfs.FileLocks
-
- dentry kernfs.Dentry
}
-func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
dir := &readonlyDir{}
dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
dir.EnableLeakCheck()
- dir.dentry.Init(dir)
-
- dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
- return &dir.dentry
+ dir.IncLinks(dir.OrderedChildren.Populate(contents))
+ return dir
}
func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
@@ -131,35 +124,33 @@ func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernf
return fd.VFSFileDescription(), nil
}
-func (d *readonlyDir) DecRef(context.Context) {
- d.readonlyDirRefs.DecRef(d.Destroy)
+func (d *readonlyDir) DecRef(ctx context.Context) {
+ d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) })
}
type dir struct {
dirRefs
attrs
- kernfs.InodeNoDynamicLookup
+ kernfs.InodeAlwaysValid
kernfs.InodeNotSymlink
- kernfs.OrderedChildren
kernfs.InodeNoStatFS
+ kernfs.InodeTemporary
+ kernfs.OrderedChildren
locks vfs.FileLocks
- fs *filesystem
- dentry kernfs.Dentry
+ fs *filesystem
}
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
dir := &dir{}
dir.fs = fs
dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
dir.EnableLeakCheck()
- dir.dentry.Init(dir)
- dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
- return &dir.dentry
+ dir.IncLinks(dir.OrderedChildren.Populate(contents))
+ return dir
}
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
@@ -172,11 +163,11 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry
return fd.VFSFileDescription(), nil
}
-func (d *dir) DecRef(context.Context) {
- d.dirRefs.DecRef(d.Destroy)
+func (d *dir) DecRef(ctx context.Context) {
+ d.dirRefs.DecRef(func() { d.Destroy(ctx) })
}
-func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) {
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
creds := auth.CredentialsFromContext(ctx)
dir := d.fs.newDir(creds, opts.Mode, nil)
if err := d.OrderedChildren.Insert(name, dir); err != nil {
@@ -187,7 +178,7 @@ func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*
return dir, nil
}
-func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) {
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
creds := auth.CredentialsFromContext(ctx)
f := d.fs.newFile(creds, "")
if err := d.OrderedChildren.Insert(name, f); err != nil {
@@ -197,15 +188,15 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*
return f, nil
}
-func (*dir) NewLink(context.Context, string, kernfs.Inode) (*kernfs.Dentry, error) {
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) {
return nil, syserror.EPERM
}
-func (*dir) NewSymlink(context.Context, string, string) (*kernfs.Dentry, error) {
+func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) {
return nil, syserror.EPERM
}
-func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*kernfs.Dentry, error) {
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) {
return nil, syserror.EPERM
}
@@ -213,18 +204,22 @@ func (fsType) Name() string {
return "kernfs"
}
+func (fsType) Release(ctx context.Context) {}
+
func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
fs := &filesystem{}
fs.VFSFilesystem().Init(vfsObj, &fst, fs)
root := fst.rootFn(creds, fs)
- return fs.VFSFilesystem(), root.VFSDentry(), nil
+ var d kernfs.Dentry
+ d.Init(&fs.Filesystem, root)
+ return fs.VFSFilesystem(), d.VFSDentry(), nil
}
// -------------------- Remainder of the file are test cases --------------------
func TestBasic(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{
"file1": fs.newFile(creds, staticFileContent),
})
})
@@ -233,8 +228,8 @@ func TestBasic(t *testing.T) {
}
func TestMkdirGetDentry(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{
"dir1": fs.newDir(creds, 0755, nil),
})
})
@@ -248,8 +243,8 @@ func TestMkdirGetDentry(t *testing.T) {
}
func TestReadStaticFile(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{
"file1": fs.newFile(creds, staticFileContent),
})
})
@@ -274,8 +269,8 @@ func TestReadStaticFile(t *testing.T) {
}
func TestCreateNewFileInStaticDir(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{
"dir1": fs.newDir(creds, 0755, nil),
})
})
@@ -301,7 +296,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
}
func TestDirFDReadWrite(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode {
return fs.newReadonlyDir(creds, 0755, nil)
})
defer sys.Destroy()
@@ -325,11 +320,11 @@ func TestDirFDReadWrite(t *testing.T) {
}
func TestDirFDIterDirents(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{
// Fill root with nodes backed by various inode implementations.
"dir1": fs.newReadonlyDir(creds, 0755, nil),
- "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
+ "dir2": fs.newDir(creds, 0755, map[string]kernfs.Inode{
"dir3": fs.newDir(creds, 0755, nil),
}),
"file1": fs.newFile(creds, staticFileContent),
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 58a93eaac..934cc6c9e 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -38,13 +38,10 @@ type StaticSymlink struct {
var _ Inode = (*StaticSymlink)(nil)
// NewStaticSymlink creates a new symlink file pointing to 'target'.
-func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
+func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode {
inode := &StaticSymlink{}
inode.Init(creds, devMajor, devMinor, ino, target)
-
- d := &Dentry{}
- d.Init(inode)
- return d
+ return inode
}
// Init initializes the instance.
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
index ea7f073eb..d0ed17b18 100644
--- a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -29,24 +29,22 @@ import (
//
// +stateify savable
type syntheticDirectory struct {
+ InodeAlwaysValid
InodeAttrs
InodeNoStatFS
- InodeNoopRefCount
- InodeNoDynamicLookup
InodeNotSymlink
OrderedChildren
+ syntheticDirectoryRefs
locks vfs.FileLocks
}
var _ Inode = (*syntheticDirectory)(nil)
-func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *Dentry {
+func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) Inode {
inode := &syntheticDirectory{}
inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
- d := &Dentry{}
- d.Init(inode)
- return d
+ return inode
}
func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
@@ -69,34 +67,46 @@ func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath,
}
// NewFile implements Inode.NewFile.
-func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) {
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) {
return nil, syserror.EPERM
}
// NewDir implements Inode.NewDir.
-func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) {
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) {
if !opts.ForSyntheticMountpoint {
return nil, syserror.EPERM
}
- subdird := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
- if err := dir.OrderedChildren.Insert(name, subdird); err != nil {
- subdird.DecRef(ctx)
+ subdirI := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+ if err := dir.OrderedChildren.Insert(name, subdirI); err != nil {
+ subdirI.DecRef(ctx)
return nil, err
}
- return subdird, nil
+ return subdirI, nil
}
// NewLink implements Inode.NewLink.
-func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) {
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) {
return nil, syserror.EPERM
}
// NewSymlink implements Inode.NewSymlink.
-func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*Dentry, error) {
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) {
return nil, syserror.EPERM
}
// NewNode implements Inode.NewNode.
-func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) {
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) {
return nil, syserror.EPERM
}
+
+// DecRef implements Inode.DecRef.
+func (dir *syntheticDirectory) DecRef(ctx context.Context) {
+ dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) })
+}
+
+// Keep implements Inode.Keep. This is redundant because inodes will never be
+// created via Lookup and inodes are always valid. Makes sense to return true
+// because these inodes are not temporary and should only be removed on RmDir.
+func (dir *syntheticDirectory) Keep() bool {
+ return true
+}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index dfbccd05f..e5f506d2e 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -60,6 +60,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem.
//
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 4e2da4810..e44b79b68 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -39,6 +39,9 @@ func (filesystemType) Name() string {
return "pipefs"
}
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("pipefs.filesystemType.GetFilesystem should never be called")
@@ -165,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf
fs := mnt.Filesystem().Impl().(*filesystem)
inode := newInode(ctx, fs)
var d kernfs.Dentry
- d.Init(inode)
+ d.Init(&fs.Filesystem, inode)
defer d.DecRef(ctx)
return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
}
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 05d7948ea..fd70a07de 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -34,13 +34,14 @@ const Name = "proc"
// +stateify savable
type FilesystemType struct{}
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -73,7 +74,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
cgroups = data.Cgroups
}
- _, dentry := procfs.newTasksInode(k, pidns, cgroups)
+ inode := procfs.newTasksInode(k, pidns, cgroups)
+ var dentry kernfs.Dentry
+ dentry.Init(&procfs.Filesystem, inode)
return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
}
@@ -94,12 +97,9 @@ type dynamicInode interface {
Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
}
-func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
- inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+func (fs *filesystem) newInode(creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode {
+ inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm)
+ return inode
}
// +stateify savable
@@ -114,8 +114,8 @@ func newStaticFile(data string) *staticFile {
return &staticFile{StaticData: vfs.StaticData{Data: data}}
}
-func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
- return kernfs.NewStaticDir(creds, devMajor, devMinor, ino, perm, children, kernfs.GenericDirectoryFDOptions{
+func (fs *filesystem) newStaticDir(creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode {
+ return kernfs.NewStaticDir(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
})
}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 47ecd941c..bad2fab4f 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -32,10 +32,11 @@ import (
// +stateify savable
type subtasksInode struct {
implStatFS
- kernfs.AlwaysValid
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
subtasksInodeRefs
@@ -49,7 +50,7 @@ type subtasksInode struct {
var _ kernfs.Inode = (*subtasksInode)(nil)
-func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry {
+func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode {
subInode := &subtasksInode{
fs: fs,
task: task,
@@ -62,14 +63,11 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
subInode.EnableLeakCheck()
inode := &taskOwnedInode{Inode: subInode, owner: task}
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
-
- return dentry
+ return inode
}
-// Lookup implements kernfs.inodeDynamicLookup.Lookup.
-func (i *subtasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
tid, err := strconv.ParseUint(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -82,10 +80,10 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry
if subTask.ThreadGroup() != i.task.ThreadGroup() {
return nil, syserror.ENOENT
}
- return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers), nil
+ return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
}
-// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
if len(tasks) == 0 {
@@ -186,6 +184,6 @@ func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credential
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *subtasksInode) DecRef(context.Context) {
- i.subtasksInodeRefs.DecRef(i.Destroy)
+func (i *subtasksInode) DecRef(ctx context.Context) {
+ i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index a7cd6f57e..b63a4eca0 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -35,8 +35,8 @@ type taskInode struct {
implStatFS
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
- kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
taskInodeRefs
@@ -47,41 +47,44 @@ type taskInode struct {
var _ kernfs.Inode = (*taskInode)(nil)
-func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
- // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
- contents := map[string]*kernfs.Dentry{
- "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
- "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) {
+ if task.ExitState() == kernel.TaskExitDead {
+ return nil, syserror.ESRCH
+ }
+
+ contents := map[string]kernfs.Inode{
+ "auxv": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}),
+ "cmdline": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
"comm": fs.newComm(task, fs.NextIno(), 0444),
"cwd": fs.newCwdSymlink(task, fs.NextIno()),
- "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+ "environ": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
"exe": fs.newExeSymlink(task, fs.NextIno()),
"fd": fs.newFDDirInode(task),
"fdinfo": fs.newFDInfoDirInode(task),
- "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
- "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
- "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}),
- "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
- "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}),
+ "gid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
+ "io": fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
+ "maps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}),
+ "mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
+ "mounts": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}),
"net": fs.newTaskNetDir(task),
- "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{
+ "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{
"net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"),
"pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"),
"user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"),
}),
- "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")),
- "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
- "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}),
- "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
- "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
- "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
- "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
+ "oom_score": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+ "oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+ "smaps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}),
+ "stat": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+ "statm": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}),
+ "status": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+ "uid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
}
if isThreadGroup {
contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
}
if len(cgroupControllers) > 0 {
- contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
+ contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
}
taskInode := &taskInode{task: task}
@@ -90,17 +93,15 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace
taskInode.EnableLeakCheck()
inode := &taskOwnedInode{Inode: taskInode, owner: task}
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- links := taskInode.OrderedChildren.Populate(dentry, contents)
+ links := taskInode.OrderedChildren.Populate(contents)
taskInode.IncLinks(links)
- return dentry
+ return inode, nil
}
-// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// Valid implements kernfs.Inode.Valid. This inode remains valid as long
// as the task is still running. When it's dead, another tasks with the same
// PID could replace it.
func (i *taskInode) Valid(ctx context.Context) bool {
@@ -124,8 +125,8 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *taskInode) DecRef(context.Context) {
- i.taskInodeRefs.DecRef(i.Destroy)
+func (i *taskInode) DecRef(ctx context.Context) {
+ i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
}
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
@@ -141,34 +142,23 @@ type taskOwnedInode struct {
var _ kernfs.Inode = (*taskOwnedInode)(nil)
-func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
// Note: credentials are overridden by taskOwnedInode.
inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
- taskInode := &taskOwnedInode{Inode: inode, owner: task}
- d := &kernfs.Dentry{}
- d.Init(taskInode)
- return d
+ return &taskOwnedInode{Inode: inode, owner: task}
}
-func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
- dir := &kernfs.StaticDirectory{}
-
+func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
// Note: credentials are overridden by taskOwnedInode.
- dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{
- SeekEnd: kernfs.SeekEndZero,
- })
- dir.EnableLeakCheck()
-
- inode := &taskOwnedInode{Inode: dir, owner: task}
- d := &kernfs.Dentry{}
- d.Init(inode)
+ fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
+ dir := kernfs.NewStaticDir(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
- dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- links := dir.OrderedChildren.Populate(d, children)
- dir.IncLinks(links)
+ return &taskOwnedInode{Inode: dir, owner: task}
+}
- return d
+func (i *taskOwnedInode) Valid(ctx context.Context) bool {
+ return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
}
// Stat implements kernfs.Inode.Stat.
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 0866cea2b..2c80ac5c2 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -63,7 +63,7 @@ type fdDir struct {
produceSymlink bool
}
-// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
var fds []int32
i.task.WithMuLocked(func(t *kernel.Task) {
@@ -109,16 +109,17 @@ type fdDirInode struct {
fdDir
fdDirInodeRefs
implStatFS
- kernfs.AlwaysValid
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
}
var _ kernfs.Inode = (*fdDirInode)(nil)
-func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode {
inode := &fdDirInode{
fdDir: fdDir{
fs: fs,
@@ -128,16 +129,17 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
}
inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
inode.EnableLeakCheck()
-
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ return inode
+}
- return dentry
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDirInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+ return i.fdDir.IterDirents(ctx, cb, offset, relOffset)
}
-// Lookup implements kernfs.inodeDynamicLookup.Lookup.
-func (i *fdDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -183,8 +185,8 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *fdDirInode) DecRef(context.Context) {
- i.fdDirInodeRefs.DecRef(i.Destroy)
+func (i *fdDirInode) DecRef(ctx context.Context) {
+ i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
}
// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
@@ -202,16 +204,13 @@ type fdSymlink struct {
var _ kernfs.Inode = (*fdSymlink)(nil)
-func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode {
inode := &fdSymlink{
task: task,
fd: fd,
}
inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ return inode
}
func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
@@ -236,6 +235,11 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
return vd, "", nil
}
+// Valid implements kernfs.Inode.Valid.
+func (s *fdSymlink) Valid(ctx context.Context) bool {
+ return taskFDExists(ctx, s.task, s.fd)
+}
+
// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
//
// +stateify savable
@@ -243,16 +247,17 @@ type fdInfoDirInode struct {
fdDir
fdInfoDirInodeRefs
implStatFS
- kernfs.AlwaysValid
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
}
var _ kernfs.Inode = (*fdInfoDirInode)(nil)
-func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode {
inode := &fdInfoDirInode{
fdDir: fdDir{
fs: fs,
@@ -261,16 +266,12 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
}
inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
inode.EnableLeakCheck()
-
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-
- return dentry
+ return inode
}
-// Lookup implements kernfs.inodeDynamicLookup.Lookup.
-func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -283,7 +284,12 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentr
task: i.task,
fd: fd,
}
- return i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data), nil
+ return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil
+}
+
+// IterDirents implements Inode.IterDirents.
+func (i *fdInfoDirInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+ return i.fdDir.IterDirents(ctx, cb, offset, relOffset)
}
// Open implements kernfs.Inode.Open.
@@ -298,8 +304,8 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *ker
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *fdInfoDirInode) DecRef(context.Context) {
- i.fdInfoDirInodeRefs.DecRef(i.Destroy)
+func (i *fdInfoDirInode) DecRef(ctx context.Context) {
+ i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
}
// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
@@ -328,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "flags:\t0%o\n", flags)
return nil
}
+
+// Valid implements kernfs.Inode.Valid.
+func (d *fdInfoData) Valid(ctx context.Context) bool {
+ return taskFDExists(ctx, d.task, d.fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 3fbf081a6..79f8b7e9f 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -247,13 +247,10 @@ type commInode struct {
task *kernel.Task
}
-func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
inode := &commInode{task: task}
inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ return inode
}
func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
@@ -658,13 +655,10 @@ type exeSymlink struct {
var _ kernfs.Inode = (*exeSymlink)(nil)
-func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
inode := &exeSymlink{task: task}
inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ return inode
}
// Readlink implements kernfs.Inode.Readlink.
@@ -737,13 +731,10 @@ type cwdSymlink struct {
var _ kernfs.Inode = (*cwdSymlink)(nil)
-func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
inode := &cwdSymlink{task: task}
inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ return inode
}
// Readlink implements kernfs.Inode.Readlink.
@@ -851,7 +842,7 @@ type namespaceSymlink struct {
task *kernel.Task
}
-func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode {
// Namespace symlinks should contain the namespace name and the inode number
// for the namespace instance, so for example user:[123456]. We currently fake
// the inode number by sticking the symlink inode in its place.
@@ -862,9 +853,7 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
taskInode := &taskOwnedInode{Inode: inode, owner: task}
- d := &kernfs.Dentry{}
- d.Init(taskInode)
- return d
+ return taskInode
}
// Readlink implements kernfs.Inode.Readlink.
@@ -882,11 +871,12 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir
}
// Create a synthetic inode to represent the namespace.
+ fs := mnt.Filesystem().Impl().(*filesystem)
dentry := &kernfs.Dentry{}
- dentry.Init(&namespaceInode{})
+ dentry.Init(&fs.Filesystem, &namespaceInode{})
vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
- vd.IncRef()
- dentry.DecRef(ctx)
+ // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
+ mnt.IncRef()
return vd, "", nil
}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index e7f748655..3425e8698 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -37,12 +37,12 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode {
k := task.Kernel()
pidns := task.PIDNamespace()
root := auth.NewRootCredentials(pidns.UserNamespace())
- var contents map[string]*kernfs.Dentry
+ var contents map[string]kernfs.Inode
if stack := task.NetworkNamespace().Stack(); stack != nil {
const (
arp = "IP address HW type Flags HW address Mask Device\n"
@@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
// network namespace.
- contents = map[string]*kernfs.Dentry{
- "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
- "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
+ contents = map[string]kernfs.Inode{
+ "dev": fs.newInode(root, 0444, &netDevData{stack: stack}),
+ "snmp": fs.newInode(root, 0444, &netSnmpData{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack, if the file contains a header the stub is just the header
// otherwise it is an empty file.
- "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)),
- "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)),
- "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}),
- "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)),
- "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)),
+ "arp": fs.newInode(root, 0444, newStaticFile(arp)),
+ "netlink": fs.newInode(root, 0444, newStaticFile(netlink)),
+ "netstat": fs.newInode(root, 0444, &netStatData{}),
+ "packet": fs.newInode(root, 0444, newStaticFile(packet)),
+ "protocols": fs.newInode(root, 0444, newStaticFile(protocols)),
// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
// high res timer ticks per sec (ClockGetres returns 1ns resolution).
- "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)),
- "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)),
- "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}),
- "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}),
- "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}),
- "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}),
+ "psched": fs.newInode(root, 0444, newStaticFile(psched)),
+ "ptype": fs.newInode(root, 0444, newStaticFile(ptype)),
+ "route": fs.newInode(root, 0444, &netRouteData{stack: stack}),
+ "tcp": fs.newInode(root, 0444, &netTCPData{kernel: k}),
+ "udp": fs.newInode(root, 0444, &netUDPData{kernel: k}),
+ "unix": fs.newInode(root, 0444, &netUnixData{kernel: k}),
}
if stack.SupportsIPv6() {
- contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack})
- contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(""))
- contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k})
- contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6))
+ contents["if_inet6"] = fs.newInode(root, 0444, &ifinet6{stack: stack})
+ contents["ipv6_route"] = fs.newInode(root, 0444, newStaticFile(""))
+ contents["tcp6"] = fs.newInode(root, 0444, &netTCP6Data{kernel: k})
+ contents["udp6"] = fs.newInode(root, 0444, newStaticFile(upd6))
}
}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index d8f5dd509..3259c3732 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -38,10 +38,11 @@ const (
// +stateify savable
type tasksInode struct {
implStatFS
- kernfs.AlwaysValid
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
kernfs.OrderedChildren
tasksInodeRefs
@@ -52,8 +53,6 @@ type tasksInode struct {
// '/proc/self' and '/proc/thread-self' have custom directory offsets in
// Linux. So handle them outside of OrderedChildren.
- selfSymlink *kernfs.Dentry
- threadSelfSymlink *kernfs.Dentry
// cgroupControllers is a map of controller name to directory in the
// cgroup hierarchy. These controllers are immutable and will be listed
@@ -63,52 +62,53 @@ type tasksInode struct {
var _ kernfs.Inode = (*tasksInode)(nil)
-func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode {
root := auth.NewRootCredentials(pidns.UserNamespace())
- contents := map[string]*kernfs.Dentry{
- "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
- "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
- "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
+ contents := map[string]kernfs.Inode{
+ "cpuinfo": fs.newInode(root, 0444, newStaticFileSetStat(cpuInfoData(k))),
+ "filesystems": fs.newInode(root, 0444, &filesystemsData{}),
+ "loadavg": fs.newInode(root, 0444, &loadavgData{}),
"sys": fs.newSysDir(root, k),
- "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
+ "meminfo": fs.newInode(root, 0444, &meminfoData{}),
"mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
"net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
- "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
- "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
- "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
+ "stat": fs.newInode(root, 0444, &statData{}),
+ "uptime": fs.newInode(root, 0444, &uptimeData{}),
+ "version": fs.newInode(root, 0444, &versionData{}),
}
inode := &tasksInode{
pidns: pidns,
fs: fs,
- selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns),
- threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns),
cgroupControllers: cgroupControllers,
}
inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
inode.EnableLeakCheck()
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
-
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- links := inode.OrderedChildren.Populate(dentry, contents)
+ links := inode.OrderedChildren.Populate(contents)
inode.IncLinks(links)
- return inode, dentry
+ return inode
}
-// Lookup implements kernfs.inodeDynamicLookup.Lookup.
-func (i *tasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
- // Try to lookup a corresponding task.
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+ // Check if a static entry was looked up.
+ if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+ return d, nil
+ }
+
+ // Not a static entry. Try to lookup a corresponding task.
tid, err := strconv.ParseUint(name, 10, 64)
if err != nil {
+ root := auth.NewRootCredentials(i.pidns.UserNamespace())
// If it failed to parse, check if it's one of the special handled files.
switch name {
case selfName:
- return i.selfSymlink, nil
+ return i.newSelfSymlink(root), nil
case threadSelfName:
- return i.threadSelfSymlink, nil
+ return i.newThreadSelfSymlink(root), nil
}
return nil, syserror.ENOENT
}
@@ -118,10 +118,10 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, e
return nil, syserror.ENOENT
}
- return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers), nil
+ return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
}
-// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
const FIRST_PROCESS_ENTRY = 256
@@ -229,8 +229,8 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
}
// DecRef implements kernfs.Inode.DecRef.
-func (i *tasksInode) DecRef(context.Context) {
- i.tasksInodeRefs.DecRef(i.Destroy)
+func (i *tasksInode) DecRef(ctx context.Context) {
+ i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
}
// staticFileSetStat implements a special static file that allows inode
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index f268c59b0..07c27cdd9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -43,13 +43,10 @@ type selfSymlink struct {
var _ kernfs.Inode = (*selfSymlink)(nil)
-func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
- inode := &selfSymlink{pidns: pidns}
- inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+func (i *tasksInode) newSelfSymlink(creds *auth.Credentials) kernfs.Inode {
+ inode := &selfSymlink{pidns: i.pidns}
+ inode.Init(creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+ return inode
}
func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
@@ -87,13 +84,10 @@ type threadSelfSymlink struct {
var _ kernfs.Inode = (*threadSelfSymlink)(nil)
-func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
- inode := &threadSelfSymlink{pidns: pidns}
- inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+func (i *tasksInode) newThreadSelfSymlink(creds *auth.Credentials) kernfs.Inode {
+ inode := &threadSelfSymlink{pidns: i.pidns}
+ inode.Init(creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+ return inode
}
func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 3312b0418..95420368d 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -40,93 +40,93 @@ const (
)
// newSysDir returns the dentry corresponding to /proc/sys directory.
-func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
- return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "kernel": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
- "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
- "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
- "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
+func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+ return fs.newStaticDir(root, map[string]kernfs.Inode{
+ "kernel": fs.newStaticDir(root, map[string]kernfs.Inode{
+ "hostname": fs.newInode(root, 0444, &hostnameData{}),
+ "shmall": fs.newInode(root, 0444, shmData(linux.SHMALL)),
+ "shmmax": fs.newInode(root, 0444, shmData(linux.SHMMAX)),
+ "shmmni": fs.newInode(root, 0444, shmData(linux.SHMMNI)),
}),
- "vm": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
- "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
+ "vm": fs.newStaticDir(root, map[string]kernfs.Inode{
+ "mmap_min_addr": fs.newInode(root, 0444, &mmapMinAddrData{k: k}),
+ "overcommit_memory": fs.newInode(root, 0444, newStaticFile("0\n")),
}),
"net": fs.newSysNetDir(root, k),
})
}
// newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
-func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
- var contents map[string]*kernfs.Dentry
+func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+ var contents map[string]kernfs.Inode
// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
// network namespace of the calling process.
if stack := k.RootNetworkNamespace().Stack(); stack != nil {
- contents = map[string]*kernfs.Dentry{
- "ipv4": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
- "tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
- "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
- "tcp_wmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
- "ip_forward": fs.newDentry(root, fs.NextIno(), 0444, &ipForwarding{stack: stack}),
+ contents = map[string]kernfs.Inode{
+ "ipv4": fs.newStaticDir(root, map[string]kernfs.Inode{
+ "tcp_recovery": fs.newInode(root, 0644, &tcpRecoveryData{stack: stack}),
+ "tcp_rmem": fs.newInode(root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
+ "tcp_sack": fs.newInode(root, 0644, &tcpSackData{stack: stack}),
+ "tcp_wmem": fs.newInode(root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+ "ip_forward": fs.newInode(root, 0444, &ipForwarding{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack, most of these files are configuration related. We use the
// value closest to the actual netstack behavior or any empty file, all
// of these files will have mode 0444 (read-only for all users).
- "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")),
- "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
- "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
- "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+ "ip_local_port_range": fs.newInode(root, 0444, newStaticFile("16000 65535")),
+ "ip_local_reserved_ports": fs.newInode(root, 0444, newStaticFile("")),
+ "ipfrag_time": fs.newInode(root, 0444, newStaticFile("30")),
+ "ip_nonlocal_bind": fs.newInode(root, 0444, newStaticFile("0")),
+ "ip_no_pmtu_disc": fs.newInode(root, 0444, newStaticFile("1")),
// tcp_allowed_congestion_control tell the user what they are able to
// do as an unprivledged process so we leave it empty.
- "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
- "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
- "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+ "tcp_allowed_congestion_control": fs.newInode(root, 0444, newStaticFile("")),
+ "tcp_available_congestion_control": fs.newInode(root, 0444, newStaticFile("reno")),
+ "tcp_congestion_control": fs.newInode(root, 0444, newStaticFile("reno")),
// Many of the following stub files are features netstack doesn't
// support. The unsupported features return "0" to indicate they are
// disabled.
- "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
- "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
- "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
- "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
- "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
- "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
- "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
- "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
- "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
- "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
- "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+ "tcp_base_mss": fs.newInode(root, 0444, newStaticFile("1280")),
+ "tcp_dsack": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_early_retrans": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_fack": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_fastopen": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_fastopen_key": fs.newInode(root, 0444, newStaticFile("")),
+ "tcp_invalid_ratelimit": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_keepalive_intvl": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_keepalive_probes": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_keepalive_time": fs.newInode(root, 0444, newStaticFile("7200")),
+ "tcp_mtu_probing": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_no_metrics_save": fs.newInode(root, 0444, newStaticFile("1")),
+ "tcp_probe_interval": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_probe_threshold": fs.newInode(root, 0444, newStaticFile("0")),
+ "tcp_retries1": fs.newInode(root, 0444, newStaticFile("3")),
+ "tcp_retries2": fs.newInode(root, 0444, newStaticFile("15")),
+ "tcp_rfc1337": fs.newInode(root, 0444, newStaticFile("1")),
+ "tcp_slow_start_after_idle": fs.newInode(root, 0444, newStaticFile("1")),
+ "tcp_synack_retries": fs.newInode(root, 0444, newStaticFile("5")),
+ "tcp_syn_retries": fs.newInode(root, 0444, newStaticFile("3")),
+ "tcp_timestamps": fs.newInode(root, 0444, newStaticFile("1")),
}),
- "core": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
- "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
- "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
- "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
- "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
- "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
- "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
- "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+ "core": fs.newStaticDir(root, map[string]kernfs.Inode{
+ "default_qdisc": fs.newInode(root, 0444, newStaticFile("pfifo_fast")),
+ "message_burst": fs.newInode(root, 0444, newStaticFile("10")),
+ "message_cost": fs.newInode(root, 0444, newStaticFile("5")),
+ "optmem_max": fs.newInode(root, 0444, newStaticFile("0")),
+ "rmem_default": fs.newInode(root, 0444, newStaticFile("212992")),
+ "rmem_max": fs.newInode(root, 0444, newStaticFile("212992")),
+ "somaxconn": fs.newInode(root, 0444, newStaticFile("128")),
+ "wmem_default": fs.newInode(root, 0444, newStaticFile("212992")),
+ "wmem_max": fs.newInode(root, 0444, newStaticFile("212992")),
}),
}
}
- return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+ return fs.newStaticDir(root, contents)
}
// mmapMinAddrData implements vfs.DynamicBytesSource for
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 6975af5a7..2582ababd 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -109,9 +109,12 @@ func setup(t *testing.T) *testutil.System {
if err != nil {
t.Fatalf("NewMountNamespace(): %v", err)
}
+ root := mntns.Root()
+ root.IncRef()
+ defer root.DecRef(ctx)
pop := &vfs.PathOperation{
- Root: mntns.Root(),
- Start: mntns.Root(),
+ Root: root,
+ Start: root,
Path: fspath.Parse("/proc"),
}
if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
@@ -119,8 +122,8 @@ func setup(t *testing.T) *testutil.System {
}
pop = &vfs.PathOperation{
- Root: mntns.Root(),
- Start: mntns.Root(),
+ Root: root,
+ Start: root,
Path: fspath.Parse("/proc"),
}
mntOpts := &vfs.MountOptions{
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index 29e5371d6..cf91ea36c 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -46,6 +46,9 @@ func (filesystemType) Name() string {
return "sockfs"
}
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -114,6 +117,6 @@ func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry {
i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
d := &kernfs.Dentry{}
- d.Init(i)
+ d.Init(&fs.Filesystem, i)
return d.VFSDentry()
}
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
index 1a6749e53..94366d429 100644
--- a/pkg/sentry/fsimpl/sys/kcov.go
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -27,12 +27,10 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) *kernfs.Dentry {
+func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
k := &kcovInode{}
k.InodeAttrs.Init(creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
- d := &kernfs.Dentry{}
- d.Init(k)
- return d
+ return k
}
// kcovInode implements kernfs.Inode.
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 1568c581f..1ad679830 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -52,6 +52,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -64,15 +67,15 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
- root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+ root := fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{
"block": fs.newDir(creds, defaultSysDirMode, nil),
"bus": fs.newDir(creds, defaultSysDirMode, nil),
- "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+ "class": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{
"power_supply": fs.newDir(creds, defaultSysDirMode, nil),
}),
"dev": fs.newDir(creds, defaultSysDirMode, nil),
- "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+ "devices": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{
+ "system": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{
"cpu": cpuDir(ctx, fs, creds),
}),
}),
@@ -82,13 +85,15 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
"module": fs.newDir(creds, defaultSysDirMode, nil),
"power": fs.newDir(creds, defaultSysDirMode, nil),
})
- return fs.VFSFilesystem(), root.VFSDentry(), nil
+ var rootD kernfs.Dentry
+ rootD.Init(&fs.Filesystem, root)
+ return fs.VFSFilesystem(), rootD.VFSDentry(), nil
}
-func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
k := kernel.KernelFromContext(ctx)
maxCPUCores := k.ApplicationCores()
- children := map[string]*kernfs.Dentry{
+ children := map[string]kernfs.Inode{
"online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
"possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
"present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
@@ -99,14 +104,14 @@ func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernf
return fs.newDir(creds, defaultSysDirMode, children)
}
-func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
// If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs
// should be mounted at debug/, but for our purposes, it is sufficient to
// keep it in sys.
- var children map[string]*kernfs.Dentry
+ var children map[string]kernfs.Inode
if coverage.KcovAvailable() {
- children = map[string]*kernfs.Dentry{
- "debug": fs.newDir(creds, linux.FileMode(0700), map[string]*kernfs.Dentry{
+ children = map[string]kernfs.Inode{
+ "debug": fs.newDir(creds, linux.FileMode(0700), map[string]kernfs.Inode{
"kcov": fs.newKcovFile(ctx, creds),
}),
}
@@ -125,27 +130,23 @@ func (fs *filesystem) Release(ctx context.Context) {
// +stateify savable
type dir struct {
dirRefs
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
- kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeTemporary
kernfs.OrderedChildren
locks vfs.FileLocks
-
- dentry kernfs.Dentry
}
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
d := &dir{}
d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
d.EnableLeakCheck()
- d.dentry.Init(d)
-
- d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
-
- return &d.dentry
+ d.IncLinks(d.OrderedChildren.Populate(contents))
+ return d
}
// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
@@ -165,8 +166,8 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry
}
// DecRef implements kernfs.Inode.DecRef.
-func (d *dir) DecRef(context.Context) {
- d.dirRefs.DecRef(d.Destroy)
+func (d *dir) DecRef(ctx context.Context) {
+ d.dirRefs.DecRef(func() { d.Destroy(ctx) })
}
// StatFS implements kernfs.Inode.StatFS.
@@ -190,12 +191,10 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
-func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
c := &cpuFile{maxCores: maxCores}
c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
- d := &kernfs.Dentry{}
- d.Init(c)
- return d
+ return c
}
// +stateify savable
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 568132121..1a8525b06 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -46,16 +46,18 @@ type System struct {
// NewSystem constructs a System.
//
-// Precondition: Caller must hold a reference on MntNs, whose ownership
+// Precondition: Caller must hold a reference on mns, whose ownership
// is transferred to the new System.
func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
+ root := mns.Root()
+ root.IncRef()
s := &System{
t: t,
Ctx: ctx,
Creds: auth.CredentialsFromContext(ctx),
VFS: v,
MntNs: mns,
- Root: mns.Root(),
+ Root: root,
}
return s
}
@@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error {
defer d.mu.Unlock()
dirent, ok := d.dirents[name]
if !ok {
- return fmt.Errorf("No dirent named %q found", name)
+ return fmt.Errorf("no dirent named %q found", name)
}
if dirent.Type != typ {
- return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
+ return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
}
return nil
}
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 5209a17af..3cc63e732 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -193,6 +193,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
// Create nested directories with given depth.
root := mntns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
vd := root
vd.IncRef()
@@ -387,6 +388,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
// Create the mount point.
root := mntns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
pop := vfs.PathOperation{
Root: root,
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index be29a2363..2f856ce36 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -165,6 +165,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
// Create the pipe.
root := mntns.Root()
+ root.IncRef()
pop := vfs.PathOperation{
Root: root,
Start: root,
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index a199eb33d..ce4e3eda7 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -293,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
optional.End = pgend
}
- cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index cefec8fde..e2a0aac69 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -74,6 +74,8 @@ type filesystem struct {
mu sync.RWMutex `state:"nosave"`
nextInoMinusOne uint64 // accessed using atomic memory operations
+
+ root *dentry
}
// Name implements vfs.FilesystemType.Name.
@@ -81,6 +83,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// FilesystemOpts is used to pass configuration data to tmpfs.
//
// +stateify savable
@@ -194,6 +199,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs.vfsfs.DecRef(ctx)
return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
}
+ fs.root = root
return &fs.vfsfs, &root.vfsd, nil
}
@@ -205,6 +211,37 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+ fs.mu.Lock()
+ if fs.root.inode.isDir() {
+ fs.root.releaseChildrenLocked(ctx)
+ }
+ fs.mu.Unlock()
+}
+
+// releaseChildrenLocked is called on the mount point by filesystem.Release() to
+// destroy all objects in the mount. It performs a depth-first walk of the
+// filesystem and "unlinks" everything by decrementing link counts
+// appropriately. There should be no open file descriptors when this is called,
+// so each inode should only have one outstanding reference that is removed once
+// its link count hits zero.
+//
+// Note that we do not update filesystem state precisely while tearing down (for
+// instance, the child maps are ignored)--we only care to remove all remaining
+// references so that every filesystem object gets destroyed. Also note that we
+// do not need to trigger DecRef on the mount point itself or any child mount;
+// these are taken care of by the destructor of the enclosing MountNamespace.
+//
+// Precondition: filesystem.mu is held.
+func (d *dentry) releaseChildrenLocked(ctx context.Context) {
+ dir := d.inode.impl.(*directory)
+ for _, child := range dir.childMap {
+ if child.inode.isDir() {
+ child.releaseChildrenLocked(ctx)
+ child.inode.decLinksLocked(ctx) // link for child/.
+ dir.inode.decLinksLocked(ctx) // link for child/..
+ }
+ child.inode.decLinksLocked(ctx) // link for child
+ }
}
// immutable
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index 99c8e3c0f..fc5323abc 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -46,6 +46,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
}
root := mntns.Root()
+ root.IncRef()
return vfsObj, root, func() {
root.DecRef(ctx)
mntns.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 7779271a9..03da505e1 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -156,11 +156,10 @@ afterSymlink:
return child, nil
}
-// verifyChild verifies the root hash of child against the already verified
-// root hash of the parent to ensure the child is expected. verifyChild
-// triggers a sentry panic if unexpected modifications to the file system are
-// detected. In noCrashOnVerificationFailure mode it returns a syserror
-// instead.
+// verifyChild verifies the hash of child against the already verified hash of
+// the parent to ensure the child is expected. verifyChild triggers a sentry
+// panic if unexpected modifications to the file system are detected. In
+// noCrashOnVerificationFailure mode it returns a syserror instead.
// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
// TODO(b/166474175): Investigate all possible errors returned in this
// function, and make sure we differentiate all errors that indicate unexpected
@@ -175,12 +174,12 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
return nil, err
}
- verityMu.RLock()
- defer verityMu.RUnlock()
+ fs.verityMu.RLock()
+ defer fs.verityMu.RUnlock()
// Read the offset of the child from the extended attributes of the
// corresponding Merkle tree file.
- // This is the offset of the root hash for child in its parent's Merkle
- // tree file.
+ // This is the offset of the hash for child in its parent's Merkle tree
+ // file.
off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
Root: child.lowerMerkleVD,
Start: child.lowerMerkleVD,
@@ -205,7 +204,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
}
- // Open parent Merkle tree file to read and verify child's root hash.
+ // Open parent Merkle tree file to read and verify child's hash.
parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
Root: parent.lowerMerkleVD,
Start: parent.lowerMerkleVD,
@@ -224,7 +223,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// dataSize is the size of raw data for the Merkle tree. For a file,
// dataSize is the size of the whole file. For a directory, dataSize is
- // the size of all its children's root hashes.
+ // the size of all its children's hashes.
dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
Size: sizeOfStringInt32,
@@ -264,7 +263,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
}
// Since we are verifying against a directory Merkle tree, buf should
- // contain the root hash of the children in the parent Merkle tree when
+ // contain the hash of the children in the parent Merkle tree when
// Verify returns with success.
var buf bytes.Buffer
if _, err := merkletree.Verify(&merkletree.VerifyParams{
@@ -278,21 +277,21 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
GID: parentStat.GID,
ReadOffset: int64(offset),
ReadSize: int64(merkletree.DigestSize()),
- ExpectedRoot: parent.rootHash,
+ Expected: parent.hash,
DataAndTreeInSameFile: true,
}); err != nil && err != io.EOF {
return nil, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification for %s failed: %v", childPath, err))
}
- // Cache child root hash when it's verified the first time.
- if len(child.rootHash) == 0 {
- child.rootHash = buf.Bytes()
+ // Cache child hash when it's verified the first time.
+ if len(child.hash) == 0 {
+ child.hash = buf.Bytes()
}
return child, nil
}
-// verifyStat verifies the stat against the verified root hash. The mode/uid/gid
-// of the file is cached after verified.
+// verifyStat verifies the stat against the verified hash. The mode/uid/gid of
+// the file is cached after verified.
func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error {
vfsObj := fs.vfsfs.VirtualFilesystem()
@@ -303,8 +302,8 @@ func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Stat
return err
}
- verityMu.RLock()
- defer verityMu.RUnlock()
+ fs.verityMu.RLock()
+ defer fs.verityMu.RUnlock()
fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
Root: d.lowerMerkleVD,
@@ -353,7 +352,7 @@ func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Stat
ReadOffset: 0,
// Set read size to 0 so only the metadata is verified.
ReadSize: 0,
- ExpectedRoot: d.rootHash,
+ Expected: d.hash,
DataAndTreeInSameFile: false,
}
if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
@@ -375,15 +374,15 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
// If enabling verification on files/directories is not allowed
// during runtime, all cached children are already verified. If
// runtime enable is allowed and the parent directory is
- // enabled, we should verify the child root hash here because
- // it may be cached before enabled.
+ // enabled, we should verify the child hash here because it may
+ // be cached before enabled.
if fs.allowRuntimeEnable {
- if isEnabled(parent) {
+ if parent.verityEnabled() {
if _, err := fs.verifyChild(ctx, parent, child); err != nil {
return nil, err
}
}
- if isEnabled(child) {
+ if child.verityEnabled() {
vfsObj := fs.vfsfs.VirtualFilesystem()
mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
@@ -481,9 +480,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
// file open and ready to use.
// This may cause empty and unused Merkle tree files in
// allowRuntimeEnable mode, if they are never enabled. This
- // does not affect verification, as we rely on cached root hash
- // to decide whether to perform verification, not the existence
- // of the Merkle tree file. Also, those Merkle tree files are
+ // does not affect verification, as we rely on cached hash to
+ // decide whether to perform verification, not the existence of
+ // the Merkle tree file. Also, those Merkle tree files are
// always hidden and cannot be accessed by verity fs users.
if fs.allowRuntimeEnable {
childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
@@ -551,16 +550,16 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
child.uid = stat.UID
child.gid = stat.GID
- // Verify child root hash. This should always be performed unless in
+ // Verify child hash. This should always be performed unless in
// allowRuntimeEnable mode and the parent directory hasn't been enabled
// yet.
- if isEnabled(parent) {
+ if parent.verityEnabled() {
if _, err := fs.verifyChild(ctx, parent, child); err != nil {
child.destroyLocked(ctx)
return nil, err
}
}
- if isEnabled(child) {
+ if child.verityEnabled() {
if err := fs.verifyStat(ctx, child, stat); err != nil {
child.destroyLocked(ctx)
return nil, err
@@ -916,7 +915,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err != nil {
return linux.Statx{}, err
}
- if isEnabled(d) {
+ if d.verityEnabled() {
if err := fs.verifyStat(ctx, d, stat); err != nil {
return linux.Statx{}, err
}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 3eb972237..8dc9e26bc 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -49,12 +49,12 @@ const Name = "verity"
const merklePrefix = ".merkle.verity."
// merkleoffsetInParentXattr is the extended attribute name specifying the
-// offset of child root hash in its parent's Merkle tree.
+// offset of child hash in its parent's Merkle tree.
const merkleOffsetInParentXattr = "user.merkle.offset"
// merkleSizeXattr is the extended attribute name specifying the size of data
// hashed by the corresponding Merkle tree. For a file, it's the size of the
-// whole file. For a directory, it's the size of all its children's root hashes.
+// whole file. For a directory, it's the size of all its children's hashes.
const merkleSizeXattr = "user.merkle.size"
// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
@@ -68,11 +68,6 @@ const sizeOfStringInt32 = 10
// flag.
var noCrashOnVerificationFailure bool
-// verityMu synchronizes enabling verity files, protects files or directories
-// from being enabled by different threads simultaneously. It also ensures that
-// verity does not access files that are being enabled.
-var verityMu sync.RWMutex
-
// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
@@ -106,6 +101,17 @@ type filesystem struct {
// to ensure consistent lock ordering between dentry.dirMu in different
// dentries.
renameMu sync.RWMutex `state:"nosave"`
+
+ // verityMu synchronizes enabling verity files, protects files or
+ // directories from being enabled by different threads simultaneously.
+ // It also ensures that verity does not access files that are being
+ // enabled.
+ //
+ // Also, the directory Merkle trees depends on the generated trees of
+ // its children. So they shouldn't be enabled the same time. This lock
+ // is for the whole file system to ensure that no more than one file is
+ // enabled the same time.
+ verityMu sync.RWMutex
}
// InternalFilesystemOptions may be passed as
@@ -142,13 +148,8 @@ func (FilesystemType) Name() string {
return Name
}
-// isEnabled checks whether the target is enabled with verity features. It
-// should always be true if runtime enable is not allowed. In runtime enable
-// mode, it returns true if the target has been enabled with
-// ioctl(FS_IOC_ENABLE_VERITY).
-func isEnabled(d *dentry) bool {
- return !d.fs.allowRuntimeEnable || len(d.rootHash) != 0
-}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
// alertIntegrityViolation alerts a violation of integrity, which usually means
// unexpected modification to the file system is detected. In
@@ -256,7 +257,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
d.mode = uint32(stat.Mode)
d.uid = stat.UID
d.gid = stat.GID
- d.rootHash = make([]byte, len(iopts.RootHash))
+ d.hash = make([]byte, len(iopts.RootHash))
if !fs.allowRuntimeEnable {
if err := fs.verifyStat(ctx, d, stat); err != nil {
@@ -264,7 +265,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
- copy(d.rootHash, iopts.RootHash)
+ copy(d.hash, iopts.RootHash)
d.vfsd.Init(d)
fs.rootDentry = d
@@ -316,8 +317,8 @@ type dentry struct {
// in the underlying file system.
lowerMerkleVD vfs.VirtualDentry
- // rootHash is the rootHash for the current file or directory.
- rootHash []byte
+ // hash is the calculated hash for the current file or directory.
+ hash []byte
}
// newDentry creates a new dentry representing the given verity file. The
@@ -439,6 +440,14 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
+func (d *dentry) verityEnabled() bool {
+ return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
func (d *dentry) readlink(ctx context.Context) (string, error) {
return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
Root: d.lowerVD,
@@ -501,7 +510,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
if err != nil {
return linux.Statx{}, err
}
- if isEnabled(fd.d) {
+ if fd.d.verityEnabled() {
if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil {
return linux.Statx{}, err
}
@@ -516,11 +525,11 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
}
// generateMerkle generates a Merkle tree file for fd. If fd points to a file
-// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The root
-// hash of the generated Merkle tree and the data size is returned.
-// If fd points to a regular file, the data is the content of the file. If fd
-// points to a directory, the data is all root hahes of its children, written
-// to the Merkle tree file.
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
+// of the generated Merkle tree and the data size is returned. If fd points to
+// a regular file, the data is the content of the file. If fd points to a
+// directory, the data is all hahes of its children, written to the Merkle tree
+// file.
func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
fdReader := vfs.FileReadWriteSeeker{
FD: fd.lowerFD,
@@ -557,9 +566,9 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64,
params.GID = stat.GID
params.DataAndTreeInSameFile = false
case linux.S_IFDIR:
- // For a directory, generate a Merkle tree based on the root
- // hashes of its children that has already been written to the
- // Merkle tree file.
+ // For a directory, generate a Merkle tree based on the hashes
+ // of its children that has already been written to the Merkle
+ // tree file.
merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
if err != nil {
return nil, 0, err
@@ -583,21 +592,19 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64,
// enable other types of file.
return nil, 0, syserror.EINVAL
}
- rootHash, err := merkletree.Generate(params)
- return rootHash, uint64(params.Size), err
+ hash, err := merkletree.Generate(params)
+ return hash, uint64(params.Size), err
}
// enableVerity enables verity features on fd by generating a Merkle tree file
-// and stores its root hash in its parent directory's Merkle tree.
+// and stores its hash in its parent directory's Merkle tree.
func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) {
if !fd.d.fs.allowRuntimeEnable {
return 0, syserror.EPERM
}
- // Lock to prevent other threads performing enable or access the file
- // while it's being enabled.
- verityMu.Lock()
- defer verityMu.Unlock()
+ fd.d.fs.verityMu.Lock()
+ defer fd.d.fs.verityMu.Unlock()
// In allowRuntimeEnable mode, the underlying fd and read/write fd for
// the Merkle tree file should have all been initialized. For any file
@@ -607,7 +614,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (ui
return 0, alertIntegrityViolation(syserror.EIO, "Unexpected verity fd: missing expected underlying fds")
}
- rootHash, dataSize, err := fd.generateMerkle(ctx)
+ hash, dataSize, err := fd.generateMerkle(ctx)
if err != nil {
return 0, err
}
@@ -618,15 +625,15 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (ui
return 0, err
}
- // Write the root hash of fd to the parent directory's Merkle
- // tree file, as it should be part of the parent Merkle tree
- // data. parentMerkleWriter is open with O_APPEND, so it
- // should write directly to the end of the file.
- if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(rootHash), vfs.WriteOptions{}); err != nil {
+ // Write the hash of fd to the parent directory's Merkle tree
+ // file, as it should be part of the parent Merkle tree data.
+ // parentMerkleWriter is open with O_APPEND, so it should write
+ // directly to the end of the file.
+ if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil {
return 0, err
}
- // Record the offset of the root hash of fd in parent directory's
+ // Record the offset of the hash of fd in parent directory's
// Merkle tree file.
if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
Name: merkleOffsetInParentXattr,
@@ -643,37 +650,37 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (ui
}); err != nil {
return 0, err
}
- fd.d.rootHash = append(fd.d.rootHash, rootHash...)
+ fd.d.hash = append(fd.d.hash, hash...)
return 0, nil
}
-// measureVerity returns the root hash of fd, saved in args[2].
+// measureVerity returns the hash of fd, saved in verityDigest.
func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) {
t := kernel.TaskFromContext(ctx)
var metadata linux.DigestMetadata
- // If allowRuntimeEnable is true, an empty fd.d.rootHash indicates that
+ // If allowRuntimeEnable is true, an empty fd.d.hash indicates that
// verity is not enabled for the file. If allowRuntimeEnable is false,
// this is an integrity violation because all files should have verity
- // enabled, in which case fd.d.rootHash should be set.
- if len(fd.d.rootHash) == 0 {
+ // enabled, in which case fd.d.hash should be set.
+ if len(fd.d.hash) == 0 {
if fd.d.fs.allowRuntimeEnable {
return 0, syserror.ENODATA
}
- return 0, alertIntegrityViolation(syserror.ENODATA, "Ioctl measureVerity: no root hash found")
+ return 0, alertIntegrityViolation(syserror.ENODATA, "Ioctl measureVerity: no hash found")
}
// The first part of VerityDigest is the metadata.
if _, err := metadata.CopyIn(t, verityDigest); err != nil {
return 0, err
}
- if metadata.DigestSize < uint16(len(fd.d.rootHash)) {
+ if metadata.DigestSize < uint16(len(fd.d.hash)) {
return 0, syserror.EOVERFLOW
}
// Populate the output digest size, since DigestSize is both input and
// output.
- metadata.DigestSize = uint16(len(fd.d.rootHash))
+ metadata.DigestSize = uint16(len(fd.d.hash))
// First copy the metadata.
if _, err := metadata.CopyOut(t, verityDigest); err != nil {
@@ -681,16 +688,16 @@ func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, ve
}
// Now copy the root hash bytes to the memory after metadata.
- _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.rootHash)
+ _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash)
return 0, err
}
func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) {
f := int32(0)
- // All enabled files should store a root hash. This flag is not settable
- // via FS_IOC_SETFLAGS.
- if len(fd.d.rootHash) != 0 {
+ // All enabled files should store a hash. This flag is not settable via
+ // FS_IOC_SETFLAGS.
+ if len(fd.d.hash) != 0 {
f |= linux.FS_VERITY_FL
}
@@ -719,10 +726,12 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
// No need to verify if the file is not enabled yet in
// allowRuntimeEnable mode.
- if !isEnabled(fd.d) {
+ if !fd.d.verityEnabled() {
return fd.lowerFD.PRead(ctx, dst, offset, opts)
}
+ fd.d.fs.verityMu.RLock()
+ defer fd.d.fs.verityMu.RUnlock()
// dataSize is the size of the whole file.
dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
@@ -767,7 +776,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
GID: fd.d.gid,
ReadOffset: offset,
ReadSize: dst.NumBytes(),
- ExpectedRoot: fd.d.rootHash,
+ Expected: fd.d.hash,
DataAndTreeInSameFile: false,
})
if err != nil {
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
index 8d0926bc4..e301d35f5 100644
--- a/pkg/sentry/fsimpl/verity/verity_test.go
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -70,6 +70,7 @@ func newVerityRoot(ctx context.Context, t *testing.T) (*vfs.VirtualFilesystem, v
return nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace: %v", err)
}
root := mntns.Root()
+ root.IncRef()
t.Helper()
t.Cleanup(func() {
root.DecRef(ctx)
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 5de70aecb..9a24c6bdb 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -97,6 +97,17 @@ go_template_instance(
)
go_template_instance(
+ name = "ipc_namespace_refs",
+ out = "ipc_namespace_refs.go",
+ package = "kernel",
+ prefix = "IPCNamespace",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "IPCNamespace",
+ },
+)
+
+go_template_instance(
name = "process_group_refs",
out = "process_group_refs.go",
package = "kernel",
@@ -137,6 +148,7 @@ go_library(
"fs_context.go",
"fs_context_refs.go",
"ipc_namespace.go",
+ "ipc_namespace_refs.go",
"kcov.go",
"kcov_unsafe.go",
"kernel.go",
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index dd5f0f5fa..bb94769c4 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
}
// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
-// or nil if there is no such IPC namespace.
+// or nil if there is no such IPC namespace. It takes a reference on the
+// namespace.
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
if v := ctx.Value(CtxIPCNamespace); v != nil {
return v.(*IPCNamespace)
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 80a070d7e..3f34ee0db 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
@@ -24,6 +25,8 @@ import (
//
// +stateify savable
type IPCNamespace struct {
+ IPCNamespaceRefs
+
// User namespace which owns this IPC namespace. Immutable.
userNS *auth.UserNamespace
@@ -33,11 +36,13 @@ type IPCNamespace struct {
// NewIPCNamespace creates a new IPC namespace.
func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
- return &IPCNamespace{
+ ns := &IPCNamespace{
userNS: userNS,
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
+ ns.EnableLeakCheck()
+ return ns
}
// SemaphoreRegistry returns the semaphore set registry for this namespace.
@@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
return i.shms
}
+// DecRef implements refs_vfs2.RefCounter.DecRef.
+func (i *IPCNamespace) DecRef(ctx context.Context) {
+ i.IPCNamespaceRefs.DecRef(func() {
+ i.shms.Release(ctx)
+ })
+}
+
// IPCNamespace returns the task's IPC namespace.
func (t *Task) IPCNamespace() *IPCNamespace {
t.mu.Lock()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index d6c21adb7..652cbb732 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -828,7 +828,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.args.UTSNamespace
case CtxIPCNamespace:
- return ctx.args.IPCNamespace
+ ipcns := ctx.args.IPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
@@ -841,14 +843,16 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
if ctx.args.MountNamespaceVFS2 == nil {
return nil
}
- // MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
- return ctx.args.MountNamespaceVFS2.Root()
+ root := ctx.args.MountNamespaceVFS2.Root()
+ root.IncRef()
+ return root
case vfs.CtxMountNamespace:
if ctx.k.globalInit == nil {
return nil
}
- // MountNamespaceVFS2 takes a reference for us.
- return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns.IncRef()
+ return mntns
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case inet.CtxStack:
@@ -904,14 +908,13 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if VFS2Enabled {
mntnsVFS2 = args.MountNamespaceVFS2
if mntnsVFS2 == nil {
- // MountNamespaceVFS2 adds a reference to the namespace, which is
- // transferred to the new process.
+ // Add a reference to the namespace, which is transferred to the new process.
mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
+ mntnsVFS2.IncRef()
}
// Get the root directory from the MountNamespace.
root := mntnsVFS2.Root()
- // The call to newFSContext below will take a reference on root, so we
- // don't need to hold this one.
+ root.IncRef()
defer root.DecRef(ctx)
// Grab the working directory.
@@ -1373,8 +1376,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace {
return k.rootUTSNamespace
}
-// RootIPCNamespace returns the root IPCNamespace.
+// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ k.rootIPCNamespace.IncRef()
return k.rootIPCNamespace
}
@@ -1635,7 +1639,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.k.rootUTSNamespace
case CtxIPCNamespace:
- return ctx.k.rootIPCNamespace
+ ipcns := ctx.k.rootIPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
@@ -1648,16 +1654,16 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
if ctx.k.globalInit == nil {
return vfs.VirtualDentry{}
}
- mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
- defer mntns.DecRef(ctx)
- // Root() takes a reference on the root dirent for us.
- return mntns.Root()
+ root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root()
+ root.IncRef()
+ return root
case vfs.CtxMountNamespace:
if ctx.k.globalInit == nil {
return nil
}
- // MountNamespaceVFS2() takes a reference for us.
- return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns.IncRef()
+ return mntns
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case inet.CtxStack:
@@ -1738,3 +1744,20 @@ func (k *Kernel) ShmMount() *vfs.Mount {
func (k *Kernel) SocketMount() *vfs.Mount {
return k.socketMount
}
+
+// Release releases resources owned by k.
+//
+// Precondition: This should only be called after the kernel is fully
+// initialized, e.g. after k.Start() has been called.
+func (k *Kernel) Release() {
+ ctx := k.SupervisorContext()
+ if VFS2Enabled {
+ k.hostMount.DecRef(ctx)
+ k.pipeMount.DecRef(ctx)
+ k.shmMount.DecRef(ctx)
+ k.socketMount.DecRef(ctx)
+ k.vfs.Release(ctx)
+ }
+ k.timekeeper.Destroy()
+ k.vdso.Release(ctx)
+}
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index b7e4b480d..f8a382fd8 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -27,6 +27,7 @@ go_library(
"//pkg/context",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refs_vfs2",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 00c03585e..ebbebf46b 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -321,9 +321,32 @@ func (r *Registry) remove(s *Shm) {
r.totalPages -= s.effectiveSize / usermem.PageSize
}
+// Release drops the self-reference of each active shm segment in the registry.
+// It is called when the kernel.IPCNamespace containing r is being destroyed.
+func (r *Registry) Release(ctx context.Context) {
+ // Because Shm.DecRef() may acquire the same locks, collect the segments to
+ // release first. Note that this should not race with any updates to r, since
+ // the IPC namespace containing it has no more references.
+ toRelease := make([]*Shm, 0)
+ r.mu.Lock()
+ for _, s := range r.keysToShms {
+ s.mu.Lock()
+ if !s.pendingDestruction {
+ toRelease = append(toRelease, s)
+ }
+ s.mu.Unlock()
+ }
+ r.mu.Unlock()
+
+ for _, s := range toRelease {
+ r.dissociateKey(s)
+ s.DecRef(ctx)
+ }
+}
+
// Shm represents a single shared memory segment.
//
-// Shm segment are backed directly by an allocation from platform memory.
+// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented; which requires special care when unmapping a
@@ -652,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) {
s.registry.dissociateKey(s)
s.mu.Lock()
- defer s.mu.Unlock()
- if !s.pendingDestruction {
- s.pendingDestruction = true
- // Drop the self-reference so destruction occurs when all
- // external references are gone.
- //
- // N.B. This cannot be the final DecRef, as the caller also
- // holds a reference.
- s.DecRef(ctx)
+ if s.pendingDestruction {
+ s.mu.Unlock()
return
}
+ s.pendingDestruction = true
+ s.mu.Unlock()
+
+ // Drop the self-reference so destruction occurs when all
+ // external references are gone.
+ //
+ // N.B. This cannot be the final DecRef, as the caller also
+ // holds a reference.
+ s.DecRef(ctx)
+ return
}
// checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f796e0fa3..037971393 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -656,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return t.utsns
case CtxIPCNamespace:
- return t.ipcns
+ ipcns := t.IPCNamespace()
+ ipcns.IncRef()
+ return ipcns
case CtxTask:
return t
case auth.CtxCredentials:
@@ -735,7 +737,6 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
func (t *Task) IsChrooted() bool {
if VFS2Enabled {
realRoot := t.mountNamespaceVFS2.Root()
- defer realRoot.DecRef(t)
root := t.fsContext.RootDirectoryVFS2()
defer root.DecRef(t)
return root != realRoot
@@ -868,7 +869,6 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
t.mu.Lock()
defer t.mu.Unlock()
- t.mountNamespaceVFS2.IncRef()
return t.mountNamespaceVFS2
}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index fce1064a7..7a053f369 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -203,6 +203,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
ipcns = NewIPCNamespace(userns)
+ } else {
+ ipcns.IncRef()
}
netns := t.NetworkNamespace()
@@ -218,6 +220,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
if err != nil {
+ ipcns.DecRef(t)
return 0, nil, err
}
// clone() returns 0 in the child.
@@ -227,6 +230,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
if opts.SetTLS {
if !tc.Arch.SetTLS(uintptr(opts.TLS)) {
+ ipcns.DecRef(t)
return 0, nil, syserror.EPERM
}
}
@@ -509,6 +513,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
+ t.ipcns.DecRef(t)
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
var oldFDTable *FDTable
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b400a8b41..239551eb6 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -280,6 +280,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.mountNamespaceVFS2.DecRef(t)
t.mountNamespaceVFS2 = nil
}
+ t.ipcns.DecRef(t)
t.mu.Unlock()
// If this is the last task to exit from the thread group, release the
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 64c1e120a..6e2ff573a 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -104,6 +104,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
cfg.TaskContext.release()
cfg.FSContext.DecRef(t)
cfg.FDTable.DecRef(t)
+ cfg.IPCNamespace.DecRef(t)
if cfg.MountNamespaceVFS2 != nil {
cfg.MountNamespaceVFS2.DecRef(t)
}
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 05a294fe6..241d87835 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -380,3 +380,9 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF)
return vdsoAddr, nil
}
+
+// Release drops references on mappings held by v.
+func (v *VDSO) Release(ctx context.Context) {
+ v.ParamPage.DecRef(ctx)
+ v.vdso.DecRef(ctx)
+}
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index dd2bbeb12..8ce411102 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -64,6 +64,7 @@ go_test(
name = "kvm_test",
srcs = [
"kvm_amd64_test.go",
+ "kvm_arm64_test.go",
"kvm_test.go",
"virtual_map_test.go",
],
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 979be5d89..eb05950cd 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -62,6 +62,9 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
//
//go:nosplit
func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+ // Increment our counter.
+ atomic.AddUint64(&c.guestExits, 1)
+
// Copy out registers.
bluepillArchExit(c, bluepillArchContext(context))
@@ -89,9 +92,6 @@ func bluepillHandler(context unsafe.Pointer) {
// Sanitize the registers; interrupts must always be disabled.
c := bluepillArchEnter(bluepillArchContext(context))
- // Increment the number of switches.
- atomic.AddUint32(&c.switches, 1)
-
// Mark this as guest mode.
switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
case vCPUUser: // Expected case.
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index 6e6b76416..17268d127 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -15,6 +15,8 @@
package kvm
import (
+ "sync/atomic"
+
pkgcontext "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -75,6 +77,9 @@ func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac a
// Clear the address space.
cpu.active.set(nil)
+ // Increment the number of user exits.
+ atomic.AddUint64(&cpu.userExits, 1)
+
// Release resources.
c.machine.Put(cpu)
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go
new file mode 100644
index 000000000..0e3d84d95
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go
@@ -0,0 +1,31 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
+)
+
+func TestKernelTLS(t *testing.T) {
+ bluepillTest(t, func(c *vCPU) {
+ if !testutil.TLSWorks() {
+ t.Errorf("tls does not work, and it should!")
+ }
+ })
+}
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 2e12470aa..e58acc071 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -412,9 +412,9 @@ func TestWrongVCPU(t *testing.T) {
// Basic test, one then the other.
bluepill(c1)
bluepill(c2)
- if c2.switches == 0 {
+ if c2.guestExits == 0 {
// Don't allow the test to proceed if this fails.
- t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ t.Fatalf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
}
// Alternate vCPUs; we expect to need to trigger the
@@ -423,11 +423,11 @@ func TestWrongVCPU(t *testing.T) {
bluepill(c1)
bluepill(c2)
}
- if count := c1.switches; count < 90 {
- t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ if count := c1.guestExits; count < 90 {
+ t.Errorf("wrong vCPU#1 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
}
- if count := c2.switches; count < 90 {
- t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ if count := c2.guestExits; count < 90 {
+ t.Errorf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
}
return false
})
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 75da253c5..61ed24d01 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -103,8 +103,11 @@ type vCPU struct {
// tid is the last set tid.
tid uint64
- // switches is a count of world switches (informational only).
- switches uint32
+ // userExits is the count of user exits.
+ userExits uint64
+
+ // guestExits is the count of guest to host world switches.
+ guestExits uint64
// faults is a count of world faults (informational only).
faults uint32
@@ -127,6 +130,7 @@ type vCPU struct {
// vCPUArchState is the architecture-specific state.
vCPUArchState
+ // dieState holds state related to vCPU death.
dieState dieState
}
@@ -468,6 +472,19 @@ func (m *machine) newDirtySet() *dirtySet {
}
}
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ // Clear from all PCIDs.
+ for _, c := range m.vCPUsByID {
+ if c != nil && c.PCIDs != nil {
+ c.PCIDs.Drop(pt)
+ }
+ }
+}
+
// lock marks the vCPU as in user mode.
//
// This should only be called directly when known to be safe, i.e. when
@@ -527,6 +544,8 @@ var pid = syscall.Getpid()
//
// This effectively unwinds the state machine.
func (c *vCPU) bounce(forceGuestExit bool) {
+ origGuestExits := atomic.LoadUint64(&c.guestExits)
+ origUserExits := atomic.LoadUint64(&c.userExits)
for {
switch state := atomic.LoadUint32(&c.state); state {
case vCPUReady, vCPUWaiter:
@@ -582,6 +601,14 @@ func (c *vCPU) bounce(forceGuestExit bool) {
// Should not happen: the above is exhaustive.
panic("invalid state")
}
+
+ // Check if we've missed the state transition, but
+ // we can safely return at this point in time.
+ newGuestExits := atomic.LoadUint64(&c.guestExits)
+ newUserExits := atomic.LoadUint64(&c.userExits)
+ if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
+ return
+ }
}
}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 451953008..c67127d95 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -87,19 +87,6 @@ const (
poolPCIDs = 8
)
-// dropPageTables drops cached page table entries.
-func (m *machine) dropPageTables(pt *pagetables.PageTables) {
- m.mu.Lock()
- defer m.mu.Unlock()
-
- // Clear from all PCIDs.
- for _, c := range m.vCPUsByID {
- if c != nil && c.PCIDs != nil {
- c.PCIDs.Drop(pt)
- }
- }
-}
-
// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
var (
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 2df762991..54837f20c 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -113,19 +113,6 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return phyRegions
}
-// dropPageTables drops cached page table entries.
-func (m *machine) dropPageTables(pt *pagetables.PageTables) {
- m.mu.Lock()
- defer m.mu.Unlock()
-
- // Clear from all PCIDs.
- for _, c := range m.vCPUsByID {
- if c.PCIDs != nil {
- c.PCIDs.Drop(pt)
- }
- }
-}
-
// nonCanonical generates a canonical address return.
//
//go:nosplit
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index 4dad877ba..c5235ca9d 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -23,6 +23,11 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
)
+// TLSWorks is a tls test.
+//
+// It returns true or false.
+func TLSWorks() bool
+
// SetTestTarget sets the rip appropriately.
func SetTestTarget(regs *arch.Registers, fn func()) {
regs.Pc = uint64(reflect.ValueOf(fn).Pointer())
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 6caf7282d..7348c29a5 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -50,6 +50,22 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0
start:
B start
+TEXT ·TLSWorks(SB),NOSPLIT,$0-8
+ NO_LOCAL_POINTERS
+ MOVD $0x6789, R5
+ MSR R5, TPIDR_EL0
+ MOVD $SYS_GETPID, R8 // getpid
+ SVC
+ MRS TPIDR_EL0, R6
+ CMP R5, R6
+ BNE isNaN
+ MOVD $1, R0
+ MOVD R0, ret+0(FP)
+ RET
+isNaN:
+ MOVD $0, ret+0(FP)
+ RET
+
TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
NO_LOCAL_POINTERS
// gc will touch fpsimd, so we should test it.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 494baaa4d..2370a9276 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -46,10 +46,11 @@
#define SCTLR_M 1 << 0
#define SCTLR_C 1 << 2
#define SCTLR_I 1 << 12
+#define SCTLR_DZE 1 << 14
#define SCTLR_UCT 1 << 15
#define SCTLR_UCI 1 << 26
-#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI)
+#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI | SCTLR_DZE)
// cntkctl_el1: counter-timer kernel control register el1.
#define CNTKCTL_EL0PCTEN 1 << 0
@@ -297,9 +298,7 @@
LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
MOVD RSV_REG, RSP; \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
- ISB $15; \
- DSB $15;
+ WORD $0xd538d092; //MRS TPIDR_EL1, R18
// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
#define SWITCH_TO_APP_PAGETABLE(from) \
@@ -382,8 +381,6 @@ TEXT ·Halt(SB),NOSPLIT,$0
BNE mmio_exit
MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
- // Flush dcache.
- WORD $0xd5087e52 // DC CISW
mmio_exit:
// Disable fpsimd.
WORD $0xd5381041 // MRS CPACR_EL1, R1
@@ -401,9 +398,6 @@ mmio_exit:
MRS VBAR_EL1, R9
MOVD R0, 0x0(R9)
- // Flush dcahce.
- WORD $0xd5087e52 // DC CISW
-
RET
// HaltAndResume halts execution and point the pointer to the resume function.
@@ -522,8 +516,6 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
// Start is the CPU entrypoint.
TEXT ·Start(SB),NOSPLIT,$0
- // Flush dcache.
- WORD $0xd5087e52 // DC CISW
// Init.
MOVD $SCTLR_EL1_DEFAULT, R1
MSR R1, SCTLR_EL1
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 5ddcd4be5..3baad098b 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -16,6 +16,7 @@
package netlink
import (
+ "io"
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -748,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t
buf := make([]byte, src.NumBytes())
n, err := src.CopyIn(ctx, buf)
+ // io.EOF can be only returned if src is a file, this means that
+ // sendMsg is called from splice and the error has to be ignored in
+ // this case.
+ if err == io.EOF {
+ err = nil
+ }
if err != nil {
// Don't partially consume messages.
return 0, syserr.FromError(err)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 87e30d742..211f07947 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -587,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
}
v := buffer.NewView(size)
if _, err := i.src.CopyIn(i.ctx, v); err != nil {
+ // EOF can be returned only if src is a file and this means it
+ // is in a splice syscall and the error has to be ignored.
+ if err == io.EOF {
+ return v, nil
+ }
return nil, tcpip.ErrBadAddress
}
return v, nil
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index f80011ce4..a4a76d0a3 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -573,13 +573,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
if dst.NumBytes() == 0 {
return 0, nil
}
- return dst.CopyOutFrom(ctx, &EndpointReader{
+ r := &EndpointReader{
Ctx: ctx,
Endpoint: s.ep,
NumRights: 0,
Peek: false,
From: nil,
- })
+ }
+ n, err := dst.CopyOutFrom(ctx, r)
+ // Drop control messages.
+ r.Control.Release(ctx)
+ return n, err
}
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 3345124cc..678355fb9 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -267,13 +267,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.
if dst.NumBytes() == 0 {
return 0, nil
}
- return dst.CopyOutFrom(ctx, &EndpointReader{
+ r := &EndpointReader{
Ctx: ctx,
Endpoint: s.ep,
NumRights: 0,
Peek: false,
From: nil,
- })
+ }
+ n, err := dst.CopyOutFrom(ctx, r)
+ // Drop control messages.
+ r.Control.Release(ctx)
+ return n, err
}
// PWrite implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 9feaca0da..9cd052c3d 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -1052,7 +1052,9 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
- if err != nil {
+ // Control messages should be released on error as well as for zero-length
+ // messages, which are discarded by the receiver.
+ if n == 0 || err != nil {
controlMessages.Release(t)
}
return uintptr(n), err
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
index 066ee0863..c8ce2aabc 100644
--- a/pkg/sentry/syscalls/linux/vfs2/execve.go
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -110,8 +110,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
}
// Load the new TaskContext.
- mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
- defer mntns.DecRef(t)
+ mntns := t.MountNamespaceVFS2()
wd := t.FSContext().WorkingDirectoryVFS2()
defer wd.DecRef(t)
remainingTraversals := uint(linux.MaxSymlinkTraversals)
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index bfae6b7e9..7b33b3f59 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -1055,7 +1055,9 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
- if err != nil {
+ // Control messages should be released on error as well as for zero-length
+ // messages, which are discarded by the receiver.
+ if n == 0 || err != nil {
controlMessages.Release(t)
}
return uintptr(n), err
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index bdfd3ca8f..7ad0eaf86 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -61,11 +61,14 @@ func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *au
panic("cannot instaniate an anon filesystem")
}
-// Name implemenents FilesystemType.Name.
+// Name implements FilesystemType.Name.
func (anonFilesystemType) Name() string {
return "none"
}
+// Release implemenents FilesystemType.Release.
+func (anonFilesystemType) Release(ctx context.Context) {}
+
// anonFilesystem is the implementation of FilesystemImpl that backs
// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
//
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 1eba0270f..183957ad8 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -827,7 +827,7 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn
}
// FileReadWriteSeeker is a helper struct to pass a FileDescription as
-// io.Reader/io.Writer/io.ReadSeeker/etc.
+// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
type FileReadWriteSeeker struct {
FD *FileDescription
Ctx context.Context
@@ -835,11 +835,18 @@ type FileReadWriteSeeker struct {
WOpts WriteOptions
}
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
+ return int(n), err
+}
+
// Read implements io.ReadWriteSeeker.Read.
func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
dst := usermem.BytesIOSequence(p)
- ret, err := f.FD.Read(f.Ctx, dst, f.ROpts)
- return int(ret), err
+ n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
+ return int(n), err
}
// Seek implements io.ReadWriteSeeker.Seek.
@@ -847,9 +854,16 @@ func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
return f.FD.Seek(f.Ctx, offset, int32(whence))
}
+// WriteAt implements io.WriterAt.WriteAt.
+func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts)
+ return int(n), err
+}
+
// Write implements io.ReadWriteSeeker.Write.
func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
buf := usermem.BytesIOSequence(p)
- ret, err := f.FD.Write(f.Ctx, buf, f.WOpts)
- return int(ret), err
+ n, err := f.FD.Write(f.Ctx, buf, f.WOpts)
+ return int(n), err
}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index bc19db1d5..9d54cc4ed 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -33,6 +33,9 @@ type FilesystemType interface {
// Name returns the name of this FilesystemType.
Name() string
+
+ // Release releases all resources held by this FilesystemType.
+ Release(ctx context.Context)
}
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index dfc3ae6c0..78f115bfa 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -46,8 +46,9 @@ import (
// +stateify savable
type Mount struct {
// vfs, fs, root are immutable. References are held on fs and root.
+ // Note that for a disconnected mount, root may be nil.
//
- // Invariant: root belongs to fs.
+ // Invariant: if not nil, root belongs to fs.
vfs *VirtualFilesystem
fs *Filesystem
root *Dentry
@@ -498,7 +499,9 @@ func (mnt *Mount) DecRef(ctx context.Context) {
mnt.vfs.mounts.seq.EndWrite()
mnt.vfs.mountMu.Unlock()
}
- mnt.root.DecRef(ctx)
+ if mnt.root != nil {
+ mnt.root.DecRef(ctx)
+ }
mnt.fs.DecRef(ctx)
if vd.Ok() {
vd.DecRef(ctx)
@@ -724,14 +727,12 @@ func (mnt *Mount) Root() *Dentry {
return mnt.root
}
-// Root returns mntns' root. A reference is taken on the returned
-// VirtualDentry.
+// Root returns mntns' root. It does not take a reference on the returned Dentry.
func (mntns *MountNamespace) Root() VirtualDentry {
vd := VirtualDentry{
mount: mntns.root,
dentry: mntns.root.root,
}
- vd.IncRef()
return vd
}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 5bd756ea5..38d2701d2 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -158,6 +158,16 @@ func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
return nil
}
+// Release drops references on filesystem objects held by vfs.
+//
+// Precondition: This must be called after VFS.Init() has succeeded.
+func (vfs *VirtualFilesystem) Release(ctx context.Context) {
+ vfs.anonMount.DecRef(ctx)
+ for _, fst := range vfs.fsTypes {
+ fst.fsType.Release(ctx)
+ }
+}
+
// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index d4d785cca..6f81b0164 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -178,6 +178,24 @@ func PayloadLen(payloadLength int) NetworkChecker {
}
}
+// IPPayload creates a checker that checks the payload.
+func IPPayload(payload []byte) NetworkChecker {
+ return func(t *testing.T, h []header.Network) {
+ t.Helper()
+
+ got := h[0].Payload()
+
+ // cmp.Diff does not consider nil slices equal to empty slices, but we do.
+ if len(got) == 0 && len(payload) == 0 {
+ return
+ }
+
+ if diff := cmp.Diff(payload, got); diff != "" {
+ t.Errorf("payload mismatch (-want +got):\n%s", diff)
+ }
+ }
+}
+
// IPv4Options returns a checker that checks the options in an IPv4 packet.
func IPv4Options(want []byte) NetworkChecker {
return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index 6be31beeb..4303fc5d5 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -49,11 +49,6 @@ const (
// neighbor advertisement packet.
ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize
- // ICMPv6NeighborAdvertSize is size of a neighbor advertisement
- // including the NDP Target Link Layer option for an Ethernet
- // address.
- ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + NDPLinkLayerAddressSize
-
// ICMPv6EchoMinimumSize is the minimum size of a valid echo packet.
ICMPv6EchoMinimumSize = 8
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index 59710352b..c118a2929 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -12,6 +12,7 @@ go_test(
"//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
+ "//pkg/tcpip/checker",
"//pkg/tcpip/header",
"//pkg/tcpip/link/channel",
"//pkg/tcpip/link/loopback",
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index d436873b6..c1848a19c 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -15,11 +15,13 @@
package ip_test
import (
+ "strings"
"testing"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/checker"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
@@ -1020,3 +1022,406 @@ func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer
_, _ = pkt.NetworkHeader().Consume(netHdrLen)
return pkt
}
+
+func TestWriteHeaderIncludedPacket(t *testing.T) {
+ const (
+ nicID = 1
+ transportProto = 5
+
+ dataLen = 4
+ optionsLen = 4
+ )
+
+ dataBuf := [dataLen]byte{1, 2, 3, 4}
+ data := dataBuf[:]
+
+ ipv4OptionsBuf := [optionsLen]byte{0, 1, 0, 1}
+ ipv4Options := ipv4OptionsBuf[:]
+
+ ipv6FragmentExtHdrBuf := [header.IPv6FragmentExtHdrLength]byte{transportProto, 0, 62, 4, 1, 2, 3, 4}
+ ipv6FragmentExtHdr := ipv6FragmentExtHdrBuf[:]
+
+ var ipv6PayloadWithExtHdrBuf [dataLen + header.IPv6FragmentExtHdrLength]byte
+ ipv6PayloadWithExtHdr := ipv6PayloadWithExtHdrBuf[:]
+ if n := copy(ipv6PayloadWithExtHdr, ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr))
+ }
+ if n := copy(ipv6PayloadWithExtHdr[header.IPv6FragmentExtHdrLength:], data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+
+ tests := []struct {
+ name string
+ protoFactory stack.NetworkProtocolFactory
+ protoNum tcpip.NetworkProtocolNumber
+ nicAddr tcpip.Address
+ remoteAddr tcpip.Address
+ pktGen func(*testing.T, tcpip.Address) buffer.View
+ checker func(*testing.T, *stack.PacketBuffer, tcpip.Address)
+ expectedErr *tcpip.Error
+ }{
+ {
+ name: "IPv4",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv4MinimumSize + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv4Any {
+ src = localIPv4Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv4MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize)
+ }
+
+ checker.IPv4(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv4Addr),
+ checker.IPv4HeaderLength(header.IPv4MinimumSize),
+ checker.IPFullLength(uint16(header.IPv4MinimumSize+len(data))),
+ checker.IPPayload(data),
+ )
+ },
+ },
+ {
+ name: "IPv4 with IHL too small",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv4MinimumSize + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize - 1,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ expectedErr: tcpip.ErrMalformedHeader,
+ },
+ {
+ name: "IPv4 too small",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv4(make([]byte, header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip[:len(ip)-1])
+ },
+ expectedErr: tcpip.ErrMalformedHeader,
+ },
+ {
+ name: "IPv4 minimum size",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv4(make([]byte, header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip)
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv4Any {
+ src = localIPv4Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv4MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize)
+ }
+
+ checker.IPv4(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv4Addr),
+ checker.IPv4HeaderLength(header.IPv4MinimumSize),
+ checker.IPFullLength(header.IPv4MinimumSize),
+ checker.IPPayload(nil),
+ )
+ },
+ },
+ {
+ name: "IPv4 with options",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ipHdrLen := header.IPv4MinimumSize + len(ipv4Options)
+ totalLen := ipHdrLen + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv4(hdr.Prepend(ipHdrLen))
+ ip.Encode(&header.IPv4Fields{
+ IHL: uint8(ipHdrLen),
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ if n := copy(ip.Options(), ipv4Options); n != len(ipv4Options) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv4Options))
+ }
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv4Any {
+ src = localIPv4Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ hdrLen := header.IPv4MinimumSize + len(ipv4Options)
+ if len(netHdr.View()) != hdrLen {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen)
+ }
+
+ checker.IPv4(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv4Addr),
+ checker.IPv4HeaderLength(hdrLen),
+ checker.IPFullLength(uint16(hdrLen+len(data))),
+ checker.IPv4Options(ipv4Options),
+ checker.IPPayload(data),
+ )
+ },
+ },
+ {
+ name: "IPv6",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv6MinimumSize + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: transportProto,
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv6Any {
+ src = localIPv6Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv6MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv6Addr),
+ checker.IPFullLength(uint16(header.IPv6MinimumSize+len(data))),
+ checker.IPPayload(data),
+ )
+ },
+ },
+ {
+ name: "IPv6 with extension header",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv6MinimumSize + len(ipv6FragmentExtHdr) + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ if n := copy(hdr.Prepend(len(ipv6FragmentExtHdr)), ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr))
+ }
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: uint8(header.IPv6FragmentExtHdrIdentifier),
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv6Any {
+ src = localIPv6Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if want := header.IPv6MinimumSize + len(ipv6FragmentExtHdr); len(netHdr.View()) != want {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), want)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv6Addr),
+ checker.IPFullLength(uint16(header.IPv6MinimumSize+len(ipv6PayloadWithExtHdr))),
+ checker.IPPayload(ipv6PayloadWithExtHdr),
+ )
+ },
+ },
+ {
+ name: "IPv6 minimum size",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv6(make([]byte, header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: transportProto,
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip)
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv6Any {
+ src = localIPv6Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv6MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv6Addr),
+ checker.IPFullLength(header.IPv6MinimumSize),
+ checker.IPPayload(nil),
+ )
+ },
+ },
+ {
+ name: "IPv6 too small",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv6(make([]byte, header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: transportProto,
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip[:len(ip)-1])
+ },
+ expectedErr: tcpip.ErrMalformedHeader,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ subTests := []struct {
+ name string
+ srcAddr tcpip.Address
+ }{
+ {
+ name: "unspecified source",
+ srcAddr: tcpip.Address(strings.Repeat("\x00", len(test.nicAddr))),
+ },
+ {
+ name: "random source",
+ srcAddr: tcpip.Address(strings.Repeat("\xab", len(test.nicAddr))),
+ },
+ }
+
+ for _, subTest := range subTests {
+ t.Run(subTest.name, func(t *testing.T) {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{test.protoFactory},
+ })
+ e := channel.New(1, 1280, "")
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+ }
+ if err := s.AddAddress(nicID, test.protoNum, test.nicAddr); err != nil {
+ t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, test.protoNum, test.nicAddr, err)
+ }
+
+ s.SetRouteTable([]tcpip.Route{{Destination: test.remoteAddr.WithPrefix().Subnet(), NIC: nicID}})
+
+ r, err := s.FindRoute(nicID, test.nicAddr, test.remoteAddr, test.protoNum, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, test.remoteAddr, test.nicAddr, test.protoNum, err)
+ }
+ defer r.Release()
+
+ if err := r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{
+ Data: test.pktGen(t, subTest.srcAddr).ToVectorisedView(),
+ })); err != test.expectedErr {
+ t.Fatalf("got r.WriteHeaderIncludedPacket(_) = %s, want = %s", err, test.expectedErr)
+ }
+
+ if test.expectedErr != nil {
+ return
+ }
+
+ pkt, ok := e.Read()
+ if !ok {
+ t.Fatal("expected a packet to be written")
+ }
+ test.checker(t, pkt.Pkt, subTest.srcAddr)
+ })
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index c5ac7b8b5..7e2e53523 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -237,7 +237,10 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
// WritePacket writes a packet to the given destination address and protocol.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
e.addIPHeader(r, pkt, params)
+ return e.writePacket(r, gso, pkt)
+}
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer) *tcpip.Error {
// iptables filtering. All packets that reach here are locally
// generated.
nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
@@ -347,30 +350,27 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return n + len(dropped), nil
}
-// WriteHeaderIncludedPacket writes a packet already containing a network
-// header through the given route.
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
// The packet already has an IP header, but there are a few required
// checks.
h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
if !ok {
- return tcpip.ErrInvalidOptionValue
+ return tcpip.ErrMalformedHeader
}
ip := header.IPv4(h)
- if !ip.IsValid(pkt.Data.Size()) {
- return tcpip.ErrInvalidOptionValue
- }
// Always set the total length.
- ip.SetTotalLength(uint16(pkt.Data.Size()))
+ pktSize := pkt.Data.Size()
+ ip.SetTotalLength(uint16(pktSize))
// Set the source address when zero.
- if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) {
+ if ip.SourceAddress() == header.IPv4Any {
ip.SetSourceAddress(r.LocalAddress)
}
- // Set the destination. If the packet already included a destination,
- // it will be part of the route.
+ // Set the destination. If the packet already included a destination, it will
+ // be part of the route anyways.
ip.SetDestinationAddress(r.RemoteAddress)
// Set the packet ID when zero.
@@ -387,19 +387,17 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
ip.SetChecksum(0)
ip.SetChecksum(^ip.CalculateChecksum())
- if r.Loop&stack.PacketLoop != 0 {
- e.HandlePacket(r, pkt.Clone())
- }
- if r.Loop&stack.PacketOut == 0 {
- return nil
+ // Populate the packet buffer's network header and don't allow an invalid
+ // packet to be sent.
+ //
+ // Note that parsing only makes sure that the packet is well formed as per the
+ // wire format. We also want to check if the header's fields are valid before
+ // sending the packet.
+ if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
+ return tcpip.ErrMalformedHeader
}
- if err := e.nic.WritePacket(r, nil /* gso */, ProtocolNumber, pkt); err != nil {
- r.Stats().IP.OutgoingPacketErrors.Increment()
- return err
- }
- r.Stats().IP.PacketsSent.Increment()
- return nil
+ return e.writePacket(r, nil /* gso */, pkt)
}
// HandlePacket is called by the link layer when new ipv4 packets arrive for
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 9916d783f..819b5d71f 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -15,9 +15,9 @@
package ipv4_test
import (
- "bytes"
"context"
"encoding/hex"
+ "fmt"
"math"
"net"
"testing"
@@ -243,7 +243,7 @@ func TestIPv4Sanity(t *testing.T) {
// Default routes for IPv4 so ICMP can find a route to the remote
// node when attempting to send the ICMP Echo Reply.
s.SetRouteTable([]tcpip.Route{
- tcpip.Route{
+ {
Destination: header.IPv4EmptySubnet,
NIC: nicID,
},
@@ -369,11 +369,10 @@ func TestIPv4Sanity(t *testing.T) {
// comparePayloads compared the contents of all the packets against the contents
// of the source packet.
-func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) {
- t.Helper()
- // Make a complete array of the sourcePacketInfo packet.
- source := header.IPv4(packets[0].NetworkHeader().View()[:header.IPv4MinimumSize])
- vv := buffer.NewVectorisedView(sourcePacketInfo.Size(), sourcePacketInfo.Views())
+func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketBuffer, mtu uint32) error {
+ // Make a complete array of the sourcePacket packet.
+ source := header.IPv4(packets[0].NetworkHeader().View())
+ vv := buffer.NewVectorisedView(sourcePacket.Size(), sourcePacket.Views())
source = append(source, vv.ToView()...)
// Make a copy of the IP header, which will be modified in some fields to make
@@ -384,46 +383,49 @@ func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketI
sourceCopy.SetTotalLength(0)
var offset uint16
// Build up an array of the bytes sent.
- var reassembledPayload []byte
+ var reassembledPayload buffer.VectorisedView
for i, packet := range packets {
// Confirm that the packet is valid.
allBytes := buffer.NewVectorisedView(packet.Size(), packet.Views())
- ip := header.IPv4(allBytes.ToView())
- if !ip.IsValid(len(ip)) {
- t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip))
+ fragmentIPHeader := header.IPv4(allBytes.ToView())
+ if !fragmentIPHeader.IsValid(len(fragmentIPHeader)) {
+ return fmt.Errorf("fragment #%d: IP packet is invalid:\n%s", i, hex.Dump(fragmentIPHeader))
}
- if got, want := ip.CalculateChecksum(), uint16(0xffff); got != want {
- t.Errorf("ip.CalculateChecksum() got %#x, want %#x", got, want)
+ if got, want := fragmentIPHeader.CalculateChecksum(), uint16(0xffff); got != want {
+ return fmt.Errorf("fragment #%d: fragmentIPHeader.CalculateChecksum() got %#x, want %#x", i, got, want)
}
- if got, want := len(ip), int(mtu); got > want {
- t.Errorf("fragment is too large, got %d want %d", got, want)
+ if got := len(fragmentIPHeader); got > int(mtu) {
+ return fmt.Errorf("fragment #%d: got len(fragmentIPHeader) = %d, want <= %d", i, got, mtu)
}
- if got, want := packet.AvailableHeaderBytes(), sourcePacketInfo.AvailableHeaderBytes()-header.IPv4MinimumSize; got != want {
- t.Errorf("fragment #%d should have the same available space for prepending as source: got %d, want %d", i, got, want)
+ if got, want := packet.AvailableHeaderBytes(), sourcePacket.AvailableHeaderBytes()-header.IPv4MinimumSize; got != want {
+ return fmt.Errorf("fragment #%d: should have the same available space for prepending as source: got %d, want %d", i, got, want)
}
- if got, want := packet.NetworkProtocolNumber, sourcePacketInfo.NetworkProtocolNumber; got != want {
- t.Errorf("fragment #%d has wrong network protocol number: got %d, want %d", i, got, want)
+ if got, want := packet.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber; got != want {
+ return fmt.Errorf("fragment #%d: has wrong network protocol number: got %d, want %d", i, got, want)
}
if i < len(packets)-1 {
sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, offset)
} else {
sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, offset)
}
- reassembledPayload = append(reassembledPayload, ip.Payload()...)
- offset += ip.TotalLength() - uint16(ip.HeaderLength())
+ reassembledPayload.AppendView(packet.TransportHeader().View())
+ reassembledPayload.Append(packet.Data)
+ offset += fragmentIPHeader.TotalLength() - uint16(fragmentIPHeader.HeaderLength())
// Clear out the checksum and length from the ip because we can't compare
// it.
- sourceCopy.SetTotalLength(uint16(len(ip)))
+ sourceCopy.SetTotalLength(uint16(len(fragmentIPHeader)))
sourceCopy.SetChecksum(0)
sourceCopy.SetChecksum(^sourceCopy.CalculateChecksum())
- if !bytes.Equal(ip[:ip.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]) {
- t.Errorf("ip[:ip.HeaderLength()] got:\n%s\nwant:\n%s", hex.Dump(ip[:ip.HeaderLength()]), hex.Dump(sourceCopy[:sourceCopy.HeaderLength()]))
+ if diff := cmp.Diff(fragmentIPHeader[:fragmentIPHeader.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]); diff != "" {
+ return fmt.Errorf("fragment #%d: fragmentIPHeader[:fragmentIPHeader.HeaderLength()] mismatch (-want +got):\n%s", i, diff)
}
}
- expected := source[source.HeaderLength():]
- if !bytes.Equal(reassembledPayload, expected) {
- t.Errorf("reassembledPayload got:\n%s\nwant:\n%s", hex.Dump(reassembledPayload), hex.Dump(expected))
+ expected := buffer.View(source[source.HeaderLength():])
+ if diff := cmp.Diff(expected, reassembledPayload.ToView()); diff != "" {
+ return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
}
+
+ return nil
}
func TestFragmentation(t *testing.T) {
@@ -477,7 +479,9 @@ func TestFragmentation(t *testing.T) {
if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 {
t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
}
- compareFragments(t, ep.WrittenPackets, source, ft.mtu)
+ if err := compareFragments(ep.WrittenPackets, source, ft.mtu); err != nil {
+ t.Error(err)
+ }
})
}
}
@@ -1633,7 +1637,7 @@ func TestPacketQueing(t *testing.T) {
}
s.SetRouteTable([]tcpip.Route{
- tcpip.Route{
+ {
Destination: host1IPv4Addr.AddressWithPrefix.Subnet(),
NIC: nicID,
},
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 7be35c78b..ead6bedcb 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -252,26 +252,29 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
return
}
- it, err := ns.Options().Iter(false /* check */)
- if err != nil {
- // Options are not valid as per the wire format, silently drop the packet.
- received.Invalid.Increment()
- return
- }
+ var sourceLinkAddr tcpip.LinkAddress
+ {
+ it, err := ns.Options().Iter(false /* check */)
+ if err != nil {
+ // Options are not valid as per the wire format, silently drop the
+ // packet.
+ received.Invalid.Increment()
+ return
+ }
- sourceLinkAddr, ok := getSourceLinkAddr(it)
- if !ok {
- received.Invalid.Increment()
- return
+ sourceLinkAddr, ok = getSourceLinkAddr(it)
+ if !ok {
+ received.Invalid.Increment()
+ return
+ }
}
- unspecifiedSource := r.RemoteAddress == header.IPv6Any
-
// As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST
// NOT be included when the source IP address is the unspecified address.
// Otherwise, on link layers that have addresses this option MUST be
// included in multicast solicitations and SHOULD be included in unicast
// solicitations.
+ unspecifiedSource := r.RemoteAddress == header.IPv6Any
if len(sourceLinkAddr) == 0 {
if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource {
received.Invalid.Increment()
@@ -297,54 +300,71 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
return
}
- // ICMPv6 Neighbor Solicit messages are always sent to
- // specially crafted IPv6 multicast addresses. As a result, the
- // route we end up with here has as its LocalAddress such a
- // multicast address. It would be nonsense to claim that our
- // source address is a multicast address, so we manually set
- // the source address to the target address requested in the
- // solicit message. Since that requires mutating the route, we
- // must first clone it.
- r := r.Clone()
- defer r.Release()
- r.LocalAddress = targetAddr
-
- // As per RFC 4861 section 7.2.4, if the the source of the solicitation is
- // the unspecified address, the node MUST set the Solicited flag to zero and
- // multicast the advertisement to the all-nodes address.
- solicited := true
+ // As per RFC 4861 section 7.2.4:
+ //
+ // If the source of the solicitation is the unspecified address, the node
+ // MUST [...] and multicast the advertisement to the all-nodes address.
+ //
+ remoteAddr := r.RemoteAddress
if unspecifiedSource {
- solicited = false
- r.RemoteAddress = header.IPv6AllNodesMulticastAddress
+ remoteAddr = header.IPv6AllNodesMulticastAddress
}
- // If the NS has a source link-layer option, use the link address it
- // specifies as the remote link address for the response instead of the
- // source link address of the packet.
+ // Even if we were able to receive a packet from some remote, we may not
+ // have a route to it - the remote may be blocked via routing rules. We must
+ // always consult our routing table and find a route to the remote before
+ // sending any packet.
+ r, err := e.protocol.stack.FindRoute(e.nic.ID(), targetAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ // If we cannot find a route to the destination, silently drop the packet.
+ return
+ }
+ defer r.Release()
+
+ // If the NS has a source link-layer option, resolve the route immediately
+ // to avoid querying the neighbor table when the neighbor entry was updated
+ // as probing the neighbor table for a link address will transition the
+ // entry's state from stale to delay.
+ //
+ // Note, if the source link address is unspecified and this is a unicast
+ // solicitation, we may need to perform neighbor discovery to send the
+ // neighbor advertisement response. This is expected as per RFC 4861 section
+ // 7.2.4:
+ //
+ // Because unicast Neighbor Solicitations are not required to include a
+ // Source Link-Layer Address, it is possible that a node sending a
+ // solicited Neighbor Advertisement does not have a corresponding link-
+ // layer address for its neighbor in its Neighbor Cache. In such
+ // situations, a node will first have to use Neighbor Discovery to
+ // determine the link-layer address of its neighbor (i.e., send out a
+ // multicast Neighbor Solicitation).
//
- // TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link
- // address cache for the right destination link address instead of manually
- // patching the route with the remote link address if one is specified in a
- // Source Link-Layer Address option.
if len(sourceLinkAddr) != 0 {
- r.RemoteLinkAddress = sourceLinkAddr
+ r.ResolveWith(sourceLinkAddr)
}
optsSerializer := header.NDPOptionsSerializer{
- header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress),
+ header.NDPTargetLinkLayerAddressOption(e.nic.LinkAddress()),
}
+ neighborAdvertSize := header.ICMPv6NeighborAdvertMinimumSize + optsSerializer.Length()
pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
- ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()),
+ ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborAdvertSize,
})
- packet := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize))
pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
+ packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize))
packet.SetType(header.ICMPv6NeighborAdvert)
na := header.NDPNeighborAdvert(packet.NDPPayload())
- na.SetSolicitedFlag(solicited)
+
+ // As per RFC 4861 section 7.2.4:
+ //
+ // If the source of the solicitation is the unspecified address, the node
+ // MUST set the Solicited flag to zero and [..]. Otherwise, the node MUST
+ // set the Solicited flag to one and [..].
+ //
+ na.SetSolicitedFlag(!unspecifiedSource)
na.SetOverrideFlag(true)
na.SetTargetAddress(targetAddr)
- opts := na.Options()
- opts.Serialize(optsSerializer)
+ na.Options().Serialize(optsSerializer)
packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
@@ -361,7 +381,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
case header.ICMPv6NeighborAdvert:
received.NeighborAdvert.Increment()
- if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborAdvertSize {
+ if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborAdvertMinimumSize {
received.Invalid.Increment()
return
}
@@ -419,19 +439,19 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
// If the NA message has the target link layer option, update the link
// address cache with the link address for the target of the message.
- if len(targetLinkAddr) != 0 {
- if e.nud == nil {
+ if e.nud == nil {
+ if len(targetLinkAddr) != 0 {
e.linkAddrCache.AddLinkAddress(e.nic.ID(), targetAddr, targetLinkAddr)
- return
}
-
- e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{
- Solicited: na.SolicitedFlag(),
- Override: na.OverrideFlag(),
- IsRouter: na.RouterFlag(),
- })
+ return
}
+ e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{
+ Solicited: na.SolicitedFlag(),
+ Override: na.OverrideFlag(),
+ IsRouter: na.RouterFlag(),
+ })
+
case header.ICMPv6EchoRequest:
received.EchoRequest.Increment()
icmpHdr, ok := pkt.TransportHeader().Consume(header.ICMPv6EchoMinimumSize)
@@ -620,18 +640,6 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
}
}
-const (
- ndpSolicitedFlag = 1 << 6
- ndpOverrideFlag = 1 << 5
-
- ndpOptSrcLinkAddr = 1
- ndpOptDstLinkAddr = 2
-
- icmpV6FlagOffset = 4
- icmpV6OptOffset = 24
- icmpV6LengthOffset = 25
-)
-
var _ stack.LinkAddressResolver = (*protocol)(nil)
// LinkAddressProtocol implements stack.LinkAddressResolver.
@@ -647,6 +655,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAdd
r := stack.Route{
LocalAddress: localAddr,
RemoteAddress: addr,
+ LocalLinkAddress: linkEP.LinkAddress(),
RemoteLinkAddress: remoteLinkAddr,
}
@@ -658,17 +667,20 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAdd
r.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(r.RemoteAddress)
}
+ optsSerializer := header.NDPOptionsSerializer{
+ header.NDPSourceLinkLayerAddressOption(linkEP.LinkAddress()),
+ }
+ neighborSolicitSize := header.ICMPv6NeighborSolicitMinimumSize + optsSerializer.Length()
pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
- ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize,
+ ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + neighborSolicitSize,
})
- icmpHdr := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize))
pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
- icmpHdr.SetType(header.ICMPv6NeighborSolicit)
- copy(icmpHdr[icmpV6OptOffset-len(addr):], addr)
- icmpHdr[icmpV6OptOffset] = ndpOptSrcLinkAddr
- icmpHdr[icmpV6LengthOffset] = 1
- copy(icmpHdr[icmpV6LengthOffset+1:], linkEP.LinkAddress())
- icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+ packet := header.ICMPv6(pkt.TransportHeader().Push(neighborSolicitSize))
+ packet.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(packet.NDPPayload())
+ ns.SetTargetAddress(addr)
+ ns.Options().Serialize(optsSerializer)
+ packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
length := uint16(pkt.Size())
ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 3affcc4e4..8dc33c560 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -101,14 +101,19 @@ func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtoco
func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) {
}
-type stubNUDHandler struct{}
+type stubNUDHandler struct {
+ probeCount int
+ confirmationCount int
+}
var _ stack.NUDHandler = (*stubNUDHandler)(nil)
-func (*stubNUDHandler) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes stack.LinkAddressResolver) {
+func (s *stubNUDHandler) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes stack.LinkAddressResolver) {
+ s.probeCount++
}
-func (*stubNUDHandler) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags stack.ReachabilityConfirmationFlags) {
+func (s *stubNUDHandler) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags stack.ReachabilityConfirmationFlags) {
+ s.confirmationCount++
}
func (*stubNUDHandler) HandleUpperLevelConfirmation(addr tcpip.Address) {
@@ -118,6 +123,12 @@ var _ stack.NetworkInterface = (*testInterface)(nil)
type testInterface struct {
stack.NetworkLinkEndpoint
+
+ linkAddr tcpip.LinkAddress
+}
+
+func (i *testInterface) LinkAddress() tcpip.LinkAddress {
+ return i.linkAddr
}
func (*testInterface) ID() tcpip.NICID {
@@ -1492,3 +1503,240 @@ func TestPacketQueing(t *testing.T) {
})
}
}
+
+func TestCallsToNeighborCache(t *testing.T) {
+ tests := []struct {
+ name string
+ createPacket func() header.ICMPv6
+ multicast bool
+ source tcpip.Address
+ destination tcpip.Address
+ wantProbeCount int
+ wantConfirmationCount int
+ }{
+ {
+ name: "Unicast Neighbor Solicitation without source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ // "The source link-layer address option SHOULD be included in unicast
+ // solicitations." - RFC 4861 section 4.3
+ //
+ // A Neighbor Advertisement needs to be sent in response, but the
+ // Neighbor Cache shouldn't be updated since we have no useful
+ // information about the sender.
+ wantProbeCount: 0,
+ },
+ {
+ name: "Unicast Neighbor Solicitation with source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ ns.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPSourceLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ wantProbeCount: 1,
+ },
+ {
+ name: "Multicast Neighbor Solicitation without source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.SolicitedNodeAddr(lladdr0),
+ // "The source link-layer address option MUST be included in multicast
+ // solicitations." - RFC 4861 section 4.3
+ wantProbeCount: 0,
+ },
+ {
+ name: "Multicast Neighbor Solicitation with source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ ns.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPSourceLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.SolicitedNodeAddr(lladdr0),
+ wantProbeCount: 1,
+ },
+ {
+ name: "Unicast Neighbor Advertisement without target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ // "When responding to unicast solicitations, the target link-layer
+ // address option can be omitted since the sender of the solicitation has
+ // the correct link-layer address; otherwise, it would not be able to
+ // send the unicast solicitation in the first place."
+ // - RFC 4861 section 4.4
+ wantConfirmationCount: 1,
+ },
+ {
+ name: "Unicast Neighbor Advertisement with target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ na.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ wantConfirmationCount: 1,
+ },
+ {
+ name: "Multicast Neighbor Advertisement without target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(false)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.IPv6AllNodesMulticastAddress,
+ // "Target link-layer address MUST be included for multicast solicitations
+ // in order to avoid infinite Neighbor Solicitation "recursion" when the
+ // peer node does not have a cache entry to return a Neighbor
+ // Advertisements message." - RFC 4861 section 4.4
+ wantConfirmationCount: 0,
+ },
+ {
+ name: "Multicast Neighbor Advertisement with target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(false)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ na.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.IPv6AllNodesMulticastAddress,
+ wantConfirmationCount: 1,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+ TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+ UseNeighborCache: true,
+ })
+ {
+ if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+ t.Fatalf("CreateNIC(_, _) = %s", err)
+ }
+ if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+ }
+ }
+ {
+ subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+ if err != nil {
+ t.Fatal(err)
+ }
+ s.SetRouteTable(
+ []tcpip.Route{{
+ Destination: subnet,
+ NIC: nicID,
+ }},
+ )
+ }
+
+ netProto := s.NetworkProtocolInstance(ProtocolNumber)
+ if netProto == nil {
+ t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+ }
+ nudHandler := &stubNUDHandler{}
+ ep := netProto.NewEndpoint(&testInterface{linkAddr: linkAddr0}, &stubLinkAddressCache{}, nudHandler, &stubDispatcher{})
+ defer ep.Close()
+
+ if err := ep.Enable(); err != nil {
+ t.Fatalf("ep.Enable(): %s", err)
+ }
+
+ r, err := s.FindRoute(nicID, lladdr0, test.source, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
+ }
+ defer r.Release()
+
+ // TODO(gvisor.dev/issue/4517): Remove the need for this manual patch.
+ r.LocalAddress = test.destination
+
+ icmp := test.createPacket()
+ icmp.SetChecksum(header.ICMPv6Checksum(icmp, r.RemoteAddress, r.LocalAddress, buffer.VectorisedView{}))
+ pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+ ReserveHeaderBytes: header.IPv6MinimumSize,
+ Data: buffer.View(icmp).ToVectorisedView(),
+ })
+ ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(len(icmp)),
+ NextHeader: uint8(header.ICMPv6ProtocolNumber),
+ HopLimit: header.NDPHopLimit,
+ SrcAddr: r.RemoteAddress,
+ DstAddr: r.LocalAddress,
+ })
+ ep.HandlePacket(&r, pkt)
+
+ // Confirm the endpoint calls the correct NUDHandler method.
+ if nudHandler.probeCount != test.wantProbeCount {
+ t.Errorf("got nudHandler.probeCount = %d, want = %d", nudHandler.probeCount, test.wantProbeCount)
+ }
+ if nudHandler.confirmationCount != test.wantConfirmationCount {
+ t.Errorf("got nudHandler.confirmationCount = %d, want = %d", nudHandler.confirmationCount, test.wantConfirmationCount)
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 2bd8f4ece..632914dd6 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -426,7 +426,10 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, mtu uint32, p
// WritePacket writes a packet to the given destination address and protocol.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
e.addIPHeader(r, pkt, params)
+ return e.writePacket(r, gso, pkt, params.Protocol)
+}
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber) *tcpip.Error {
// iptables filtering. All packets that reach here are locally
// generated.
nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
@@ -468,7 +471,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
}
if e.packetMustBeFragmented(pkt, gso) {
- sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, params.Protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+ sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
// fragment one by one using WritePacket() (current strategy) or if we
// want to create a PacketBufferList from the fragments and feed it to
@@ -569,11 +572,40 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return n + len(dropped), nil
}
-// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
-// supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
- // TODO(b/146666412): Support IPv6 header-included packets.
- return tcpip.ErrNotSupported
+// WriteHeaderIncludedPacker implements stack.NetworkEndpoint.
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+ // The packet already has an IP header, but there are a few required checks.
+ h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+ if !ok {
+ return tcpip.ErrMalformedHeader
+ }
+ ip := header.IPv6(h)
+
+ // Always set the payload length.
+ pktSize := pkt.Data.Size()
+ ip.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize))
+
+ // Set the source address when zero.
+ if ip.SourceAddress() == header.IPv6Any {
+ ip.SetSourceAddress(r.LocalAddress)
+ }
+
+ // Set the destination. If the packet already included a destination, it will
+ // be part of the route anyways.
+ ip.SetDestinationAddress(r.RemoteAddress)
+
+ // Populate the packet buffer's network header and don't allow an invalid
+ // packet to be sent.
+ //
+ // Note that parsing only makes sure that the packet is well formed as per the
+ // wire format. We also want to check if the header's fields are valid before
+ // sending the packet.
+ proto, _, _, _, ok := parse.IPv6(pkt)
+ if !ok || !header.IPv6(pkt.NetworkHeader().View()).IsValid(pktSize) {
+ return tcpip.ErrMalformedHeader
+ }
+
+ return e.writePacket(r, nil /* gso */, pkt, proto)
}
// HandlePacket is called by the link layer when new ipv6 packets arrive for
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index e792ca9e2..bee18d1a8 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -57,8 +57,8 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
t.Helper()
// Receive ICMP packet.
- hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
- pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertMinimumSize)
+ pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertMinimumSize))
pkt.SetType(header.ICMPv6NeighborAdvert)
pkt.SetChecksum(header.ICMPv6Checksum(pkt, src, dst, buffer.VectorisedView{}))
payloadLength := hdr.UsedLength()
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 9033a9ed5..ac20f217e 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -15,6 +15,7 @@
package ipv6
import (
+ "context"
"strings"
"testing"
"time"
@@ -398,16 +399,17 @@ func TestNeighorSolicitationResponse(t *testing.T) {
}
tests := []struct {
- name string
- nsOpts header.NDPOptionsSerializer
- nsSrcLinkAddr tcpip.LinkAddress
- nsSrc tcpip.Address
- nsDst tcpip.Address
- nsInvalid bool
- naDstLinkAddr tcpip.LinkAddress
- naSolicited bool
- naSrc tcpip.Address
- naDst tcpip.Address
+ name string
+ nsOpts header.NDPOptionsSerializer
+ nsSrcLinkAddr tcpip.LinkAddress
+ nsSrc tcpip.Address
+ nsDst tcpip.Address
+ nsInvalid bool
+ naDstLinkAddr tcpip.LinkAddress
+ naSolicited bool
+ naSrc tcpip.Address
+ naDst tcpip.Address
+ performsLinkResolution bool
}{
{
name: "Unspecified source to solicited-node multicast destination",
@@ -416,7 +418,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
nsSrc: header.IPv6Any,
nsDst: nicAddrSNMC,
nsInvalid: false,
- naDstLinkAddr: remoteLinkAddr0,
+ naDstLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllNodesMulticastAddress),
naSolicited: false,
naSrc: nicAddr,
naDst: header.IPv6AllNodesMulticastAddress,
@@ -449,7 +451,6 @@ func TestNeighorSolicitationResponse(t *testing.T) {
nsDst: nicAddr,
nsInvalid: true,
},
-
{
name: "Specified source with 1 source ll to multicast destination",
nsOpts: header.NDPOptionsSerializer{
@@ -509,6 +510,10 @@ func TestNeighorSolicitationResponse(t *testing.T) {
naSolicited: true,
naSrc: nicAddr,
naDst: remoteAddr,
+ // Since we send a unicast solicitations to a node without an entry for
+ // the remote, the node needs to perform neighbor discovery to get the
+ // remote's link address to send the advertisement response.
+ performsLinkResolution: true,
},
{
name: "Specified source with 1 source ll to unicast destination",
@@ -615,11 +620,78 @@ func TestNeighorSolicitationResponse(t *testing.T) {
t.Fatalf("got invalid = %d, want = 0", got)
}
- p, got := e.Read()
+ if test.performsLinkResolution {
+ p, got := e.ReadContext(context.Background())
+ if !got {
+ t.Fatal("expected an NDP NS response")
+ }
+
+ if p.Route.LocalAddress != nicAddr {
+ t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, nicAddr)
+ }
+ if p.Route.LocalLinkAddress != nicLinkAddr {
+ t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr)
+ }
+ respNSDst := header.SolicitedNodeAddr(test.nsSrc)
+ if p.Route.RemoteAddress != respNSDst {
+ t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst)
+ }
+ if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want {
+ t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+ checker.SrcAddr(nicAddr),
+ checker.DstAddr(respNSDst),
+ checker.TTL(header.NDPHopLimit),
+ checker.NDPNS(
+ checker.NDPNSTargetAddress(test.nsSrc),
+ checker.NDPNSOptions([]header.NDPOption{
+ header.NDPSourceLinkLayerAddressOption(nicLinkAddr),
+ }),
+ ))
+
+ ser := header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ }
+ ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + ser.Length()
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+ pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+ pkt.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(pkt.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(true)
+ na.SetTargetAddress(test.nsSrc)
+ na.Options().Serialize(ser)
+ pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, nicAddr, buffer.VectorisedView{}))
+ payloadLength := hdr.UsedLength()
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(payloadLength),
+ NextHeader: uint8(header.ICMPv6ProtocolNumber),
+ HopLimit: header.NDPHopLimit,
+ SrcAddr: test.nsSrc,
+ DstAddr: nicAddr,
+ })
+ e.InjectLinkAddr(ProtocolNumber, "", stack.NewPacketBuffer(stack.PacketBufferOptions{
+ Data: hdr.View().ToVectorisedView(),
+ }))
+ }
+
+ p, got := e.ReadContext(context.Background())
if !got {
t.Fatal("expected an NDP NA response")
}
+ if p.Route.LocalAddress != test.naSrc {
+ t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, test.naSrc)
+ }
+ if p.Route.LocalLinkAddress != nicLinkAddr {
+ t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr)
+ }
+ if p.Route.RemoteAddress != test.naDst {
+ t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, test.naDst)
+ }
if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
}
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index eba97334e..d09ebe7fa 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -123,6 +123,7 @@ go_test(
"//pkg/tcpip/header",
"//pkg/tcpip/link/channel",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/network/arp",
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/ports",
diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go
index 4d69a4de1..be61a21af 100644
--- a/pkg/tcpip/stack/neighbor_entry.go
+++ b/pkg/tcpip/stack/neighbor_entry.go
@@ -406,9 +406,9 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
// INCOMPLETE state." - RFC 4861 section 7.2.5
case Reachable, Stale, Delay, Probe:
- sameLinkAddr := e.neigh.LinkAddr == linkAddr
+ isLinkAddrDifferent := len(linkAddr) != 0 && e.neigh.LinkAddr != linkAddr
- if !sameLinkAddr {
+ if isLinkAddrDifferent {
if !flags.Override {
if e.neigh.State == Reachable {
e.dispatchChangeEventLocked(Stale)
@@ -431,7 +431,7 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
}
}
- if flags.Solicited && (flags.Override || sameLinkAddr) {
+ if flags.Solicited && (flags.Override || !isLinkAddrDifferent) {
if e.neigh.State != Reachable {
e.dispatchChangeEventLocked(Reachable)
}
diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go
index e79abebca..3ee2a3b31 100644
--- a/pkg/tcpip/stack/neighbor_entry_test.go
+++ b/pkg/tcpip/stack/neighbor_entry_test.go
@@ -83,15 +83,18 @@ func eventDiffOptsWithSort() []cmp.Option {
// | Reachable | Stale | Reachable timer expired | | Changed |
// | Reachable | Stale | Probe or confirmation w/ different address | | Changed |
// | Stale | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
+// | Stale | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Stale | Stale | Override confirmation | Update LinkAddr | Changed |
// | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed |
// | Stale | Delay | Packet sent | | Changed |
// | Delay | Reachable | Upper-layer confirmation | | Changed |
// | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
+// | Delay | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Delay | Stale | Probe or confirmation w/ different address | | Changed |
// | Delay | Probe | Delay timer expired | Send probe | Changed |
// | Probe | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
// | Probe | Reachable | Solicited confirmation w/ same address | Notify wakers | Changed |
+// | Probe | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Probe | Stale | Probe or confirmation w/ different address | | Changed |
// | Probe | Probe | Retransmit timer expired | Send probe | Changed |
// | Probe | Failed | Max probes sent without reply | Notify wakers | Removed |
@@ -1370,6 +1373,77 @@ func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
nudDisp.mu.Unlock()
}
+func TestEntryStaleToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ e, nudDisp, linkRes, _ := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: false,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Stale {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Stale)
+ }
+ e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Reachable {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+ }
+ if e.neigh.LinkAddr != entryTestLinkAddr1 {
+ t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1)
+ }
+ e.mu.Unlock()
+
+ wantProbes := []entryTestProbeInfo{
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) {
c := DefaultNUDConfigurations()
e, nudDisp, linkRes, _ := entryTestSetup(c)
@@ -1752,6 +1826,100 @@ func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
nudDisp.mu.Unlock()
}
+func TestEntryDelayToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ c.MaxMulticastProbes = 1
+ // Eliminate random factors from ReachableTime computation so the transition
+ // from Stale to Reachable will only take BaseReachableTime duration.
+ c.MinRandomFactor = 1
+ c.MaxRandomFactor = 1
+
+ e, nudDisp, linkRes, clock := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: false,
+ Override: false,
+ IsRouter: false,
+ })
+ e.handlePacketQueuedLocked()
+ if e.neigh.State != Delay {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Delay)
+ }
+ e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Reachable {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+ }
+ if e.neigh.LinkAddr != entryTestLinkAddr1 {
+ t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1)
+ }
+ e.mu.Unlock()
+
+ wantProbes := []entryTestProbeInfo{
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ clock.Advance(c.BaseReachableTime)
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Delay,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) {
c := DefaultNUDConfigurations()
e, nudDisp, linkRes, _ := entryTestSetup(c)
@@ -2665,6 +2833,115 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin
nudDisp.mu.Unlock()
}
+func TestEntryProbeToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ // Eliminate random factors from ReachableTime computation so the transition
+ // from Stale to Reachable will only take BaseReachableTime duration.
+ c.MinRandomFactor = 1
+ c.MaxRandomFactor = 1
+
+ e, nudDisp, linkRes, clock := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: false,
+ Override: false,
+ IsRouter: false,
+ })
+ e.handlePacketQueuedLocked()
+ e.mu.Unlock()
+
+ clock.Advance(c.DelayFirstProbeTime)
+
+ wantProbes := []entryTestProbeInfo{
+ // The first probe is caused by the Unknown-to-Incomplete transition.
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ // The second probe is caused by the Delay-to-Probe transition.
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: entryTestLinkAddr1,
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ e.mu.Lock()
+ if e.neigh.State != Probe {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Probe)
+ }
+ e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Reachable {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+ }
+ e.mu.Unlock()
+
+ clock.Advance(c.BaseReachableTime)
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Delay,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Probe,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
func TestEntryProbeToFailed(t *testing.T) {
c := DefaultNUDConfigurations()
c.MaxMulticastProbes = 3
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 25f80c1f8..b76e2d37b 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -126,6 +126,12 @@ func (r *Route) GSOMaxSize() uint32 {
return 0
}
+// ResolveWith immediately resolves a route with the specified remote link
+// address.
+func (r *Route) ResolveWith(addr tcpip.LinkAddress) {
+ r.RemoteLinkAddress = addr
+}
+
// Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in
// case address resolution requires blocking, e.g. wait for ARP reply. Waker is
// notified when address resolution is complete (success or not).
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 38994cca1..e75f58c64 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -34,6 +34,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -3498,6 +3499,52 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
}
}
+func TestResolveWith(t *testing.T) {
+ const (
+ unspecifiedNICID = 0
+ nicID = 1
+ )
+
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol},
+ })
+ ep := channel.New(0, defaultMTU, "")
+ ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+ if err := s.CreateNIC(nicID, ep); err != nil {
+ t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+ }
+ addr := tcpip.ProtocolAddress{
+ Protocol: header.IPv4ProtocolNumber,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: tcpip.Address([]byte{192, 168, 1, 58}),
+ PrefixLen: 24,
+ },
+ }
+ if err := s.AddProtocolAddress(nicID, addr); err != nil {
+ t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, addr, err)
+ }
+
+ s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}})
+
+ remoteAddr := tcpip.Address([]byte{192, 168, 1, 59})
+ r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, remoteAddr, header.IPv4ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, remoteAddr, header.IPv4ProtocolNumber, err)
+ }
+ defer r.Release()
+
+ // Should initially require resolution.
+ if !r.IsResolutionRequired() {
+ t.Fatal("got r.IsResolutionRequired() = false, want = true")
+ }
+
+ // Manually resolving the route should no longer require resolution.
+ r.ResolveWith("\x01")
+ if r.IsResolutionRequired() {
+ t.Fatal("got r.IsResolutionRequired() = true, want = false")
+ }
+}
+
// TestRouteReleaseAfterAddrRemoval tests that releasing a Route after its
// associated address is removed should not cause a panic.
func TestRouteReleaseAfterAddrRemoval(t *testing.T) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index c42bb0991..d77848d61 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -111,6 +111,7 @@ var (
ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"}
ErrNotPermitted = &Error{msg: "operation not permitted"}
ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
+ ErrMalformedHeader = &Error{msg: "header is malformed"}
)
var messageToError map[string]*Error
@@ -159,6 +160,7 @@ func StringToError(s string) *Error {
ErrBroadcastDisabled,
ErrNotPermitted,
ErrAddressFamilyNotSupported,
+ ErrMalformedHeader,
}
messageToError = make(map[string]*Error)
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index c67e0ba95..3ae6cc221 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -81,6 +81,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
ep.ID = r.id
ep.route = r.route.Clone()
ep.dstPort = r.id.RemotePort
+ ep.effectiveNetProtos = []tcpip.NetworkProtocolNumber{r.route.NetProto}
ep.RegisterNICID = r.route.NICID()
ep.boundPortFlags = ep.portFlags