summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2021-04-03 02:39:12 +0000
committergVisor bot <gvisor-bot@google.com>2021-04-03 02:39:12 +0000
commit851ddb10033a81d9f5c4a1d685c39bbc9e2cd029 (patch)
treeba3588c875cd677b5414abcb7cc0cf011264c79b
parentd70f6e164f7eb42d93ed2b5e4a15bc49a6fc5a0e (diff)
parent491b106d62ba97cafb63252bf7d5bdd4749d417a (diff)
Merge release-20210322.0-36-g491b106d6 (automated)
-rw-r--r--pkg/merkletree/merkletree.go534
-rw-r--r--pkg/merkletree/merkletree_state_autogen.go3
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go1109
-rw-r--r--pkg/sentry/fsimpl/verity/save_restore.go27
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go1402
-rw-r--r--pkg/sentry/fsimpl/verity/verity_state_autogen.go237
-rw-r--r--runsc/boot/vfs.go76
-rw-r--r--runsc/cli/main.go1
-rw-r--r--runsc/cmd/do.go108
-rw-r--r--runsc/cmd/verity_prepare.go106
-rw-r--r--runsc/specutils/fs.go18
11 files changed, 3569 insertions, 52 deletions
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
new file mode 100644
index 000000000..961bd4dcf
--- /dev/null
+++ b/pkg/merkletree/merkletree.go
@@ -0,0 +1,534 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package merkletree implements Merkle tree generating and verification.
+package merkletree
+
+import (
+ "bytes"
+ "crypto/sha256"
+ "crypto/sha512"
+ "encoding/gob"
+ "fmt"
+ "io"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+
+ "gvisor.dev/gvisor/pkg/hostarch"
+)
+
+const (
+ // sha256DigestSize specifies the digest size of a SHA256 hash.
+ sha256DigestSize = 32
+ // sha512DigestSize specifies the digest size of a SHA512 hash.
+ sha512DigestSize = 64
+)
+
+// DigestSize returns the size (in bytes) of a digest.
+// TODO(b/156980949): Allow config SHA384.
+func DigestSize(hashAlgorithm int) int {
+ switch hashAlgorithm {
+ case linux.FS_VERITY_HASH_ALG_SHA256:
+ return sha256DigestSize
+ case linux.FS_VERITY_HASH_ALG_SHA512:
+ return sha512DigestSize
+ default:
+ return -1
+ }
+}
+
+// Layout defines the scale of a Merkle tree.
+type Layout struct {
+ // blockSize is the size of a data block to be hashed.
+ blockSize int64
+ // digestSize is the size of a generated hash.
+ digestSize int64
+ // levelOffset contains the offset of the beginning of each level in
+ // bytes. The number of levels in the tree is the length of the slice.
+ // The leaf nodes (level 0) contain hashes of blocks of the input data.
+ // Each level N contains hashes of the blocks in level N-1. The highest
+ // level is the root hash.
+ levelOffset []int64
+}
+
+// InitLayout initializes and returns a new Layout object describing the structure
+// of a tree. dataSize specifies the size of input data in bytes.
+func InitLayout(dataSize int64, hashAlgorithms int, dataAndTreeInSameFile bool) (Layout, error) {
+ layout := Layout{
+ blockSize: hostarch.PageSize,
+ }
+
+ // TODO(b/156980949): Allow config SHA384.
+ switch hashAlgorithms {
+ case linux.FS_VERITY_HASH_ALG_SHA256:
+ layout.digestSize = sha256DigestSize
+ case linux.FS_VERITY_HASH_ALG_SHA512:
+ layout.digestSize = sha512DigestSize
+ default:
+ return Layout{}, fmt.Errorf("unexpected hash algorithms")
+ }
+
+ // treeStart is the offset (in bytes) of the first level of the tree in
+ // the file. If data and tree are in different files, treeStart should
+ // be zero. If data is in the same file as the tree, treeStart points
+ // to the block after the last data block (which may be zero-padded).
+ var treeStart int64
+ if dataAndTreeInSameFile {
+ treeStart = dataSize
+ if dataSize%layout.blockSize != 0 {
+ treeStart += layout.blockSize - dataSize%layout.blockSize
+ }
+ }
+
+ numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
+ level := 0
+ offset := int64(0)
+
+ // Calculate the number of levels in the Merkle tree and the beginning
+ // offset of each level. Level 0 consists of the leaf nodes that
+ // contain the hashes of the data blocks, while level numLevels - 1 is
+ // the root.
+ for numBlocks > 1 {
+ layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
+ // Round numBlocks up to fill up a block.
+ numBlocks += (layout.hashesPerBlock() - numBlocks%layout.hashesPerBlock()) % layout.hashesPerBlock()
+ offset += numBlocks / layout.hashesPerBlock()
+ numBlocks = numBlocks / layout.hashesPerBlock()
+ level++
+ }
+ layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
+
+ return layout, nil
+}
+
+// hashesPerBlock() returns the number of digests in each block. For example,
+// if blockSize is 4096 bytes, and digestSize is 32 bytes, there will be 128
+// hashesPerBlock. Therefore 128 hashes in one level will be combined in one
+// hash in the level above.
+func (layout Layout) hashesPerBlock() int64 {
+ return layout.blockSize / layout.digestSize
+}
+
+// numLevels returns the total number of levels in the Merkle tree.
+func (layout Layout) numLevels() int {
+ return len(layout.levelOffset)
+}
+
+// rootLevel returns the level of the root hash.
+func (layout Layout) rootLevel() int {
+ return layout.numLevels() - 1
+}
+
+// digestOffset finds the offset of a digest from the beginning of the tree.
+// The target digest is at level of the tree, with index from the beginning of
+// the current level.
+func (layout Layout) digestOffset(level int, index int64) int64 {
+ return layout.levelOffset[level] + index*layout.digestSize
+}
+
+// blockOffset finds the offset of a block from the beginning of the tree. The
+// target block is at level of the tree, with index from the beginning of the
+// current level.
+func (layout Layout) blockOffset(level int, index int64) int64 {
+ return layout.levelOffset[level] + index*layout.blockSize
+}
+
+// VerityDescriptor is a struct that is serialized and hashed to get a file's
+// root hash, which contains the root hash of the raw content and the file's
+// meatadata.
+type VerityDescriptor struct {
+ Name string
+ FileSize int64
+ Mode uint32
+ UID uint32
+ GID uint32
+ Children map[string]struct{}
+ SymlinkTarget string
+ RootHash []byte
+}
+
+func (d *VerityDescriptor) String() string {
+ b := new(bytes.Buffer)
+ e := gob.NewEncoder(b)
+ e.Encode(d.Children)
+ return fmt.Sprintf("Name: %s, Size: %d, Mode: %d, UID: %d, GID: %d, Children: %v, Symlink: %s, RootHash: %v", d.Name, d.FileSize, d.Mode, d.UID, d.GID, b.Bytes(), d.SymlinkTarget, d.RootHash)
+}
+
+// verify generates a hash from d, and compares it with expected.
+func (d *VerityDescriptor) verify(expected []byte, hashAlgorithms int) error {
+ h, err := hashData([]byte(d.String()), hashAlgorithms)
+ if err != nil {
+ return err
+ }
+ if !bytes.Equal(h[:], expected) {
+ return fmt.Errorf("unexpected root hash")
+ }
+ return nil
+
+}
+
+// hashData hashes data and returns the result hash based on the hash
+// algorithms.
+func hashData(data []byte, hashAlgorithms int) ([]byte, error) {
+ var digest []byte
+ switch hashAlgorithms {
+ case linux.FS_VERITY_HASH_ALG_SHA256:
+ digestArray := sha256.Sum256(data)
+ digest = digestArray[:]
+ case linux.FS_VERITY_HASH_ALG_SHA512:
+ digestArray := sha512.Sum512(data)
+ digest = digestArray[:]
+ default:
+ return nil, fmt.Errorf("unexpected hash algorithms")
+ }
+ return digest, nil
+}
+
+// GenerateParams contains the parameters used to generate a Merkle tree for a
+// given file.
+type GenerateParams struct {
+ // File is a reader of the file to be hashed.
+ File io.ReaderAt
+ // Size is the size of the file.
+ Size int64
+ // Name is the name of the target file.
+ Name string
+ // Mode is the mode of the target file.
+ Mode uint32
+ // UID is the user ID of the target file.
+ UID uint32
+ // GID is the group ID of the target file.
+ GID uint32
+ // Children is a map of children names for a directory. It should be
+ // empty for a regular file.
+ Children map[string]struct{}
+ // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink.
+ SymlinkTarget string
+ // HashAlgorithms is the algorithms used to hash data.
+ HashAlgorithms int
+ // TreeReader is a reader for the Merkle tree.
+ TreeReader io.ReaderAt
+ // TreeWriter is a writer for the Merkle tree.
+ TreeWriter io.Writer
+ // DataAndTreeInSameFile is true if data and Merkle tree are in the same
+ // file, or false if Merkle tree is a separate file from data.
+ DataAndTreeInSameFile bool
+}
+
+// Generate constructs a Merkle tree for the contents of params.File. The
+// output is written to params.TreeWriter.
+//
+// Generate returns a hash of a VerityDescriptor, which contains the file
+// metadata and the hash from file content.
+func Generate(params *GenerateParams) ([]byte, error) {
+ descriptor := VerityDescriptor{
+ FileSize: params.Size,
+ Name: params.Name,
+ Mode: params.Mode,
+ UID: params.UID,
+ GID: params.GID,
+ SymlinkTarget: params.SymlinkTarget,
+ }
+
+ // If file is a symlink do not generate root hash for file content.
+ if params.SymlinkTarget != "" {
+ return hashData([]byte(descriptor.String()), params.HashAlgorithms)
+ }
+
+ layout, err := InitLayout(params.Size, params.HashAlgorithms, params.DataAndTreeInSameFile)
+ if err != nil {
+ return nil, err
+ }
+
+ numBlocks := (params.Size + layout.blockSize - 1) / layout.blockSize
+
+ // If the data is in the same file as the tree, zero pad the last data
+ // block.
+ bytesInLastBlock := params.Size % layout.blockSize
+ if params.DataAndTreeInSameFile && bytesInLastBlock != 0 {
+ zeroBuf := make([]byte, layout.blockSize-bytesInLastBlock)
+ if _, err := params.TreeWriter.Write(zeroBuf); err != nil {
+ return nil, err
+ }
+ }
+
+ var root []byte
+ for level := 0; level < layout.numLevels(); level++ {
+ for i := int64(0); i < numBlocks; i++ {
+ buf := make([]byte, layout.blockSize)
+ var (
+ n int
+ err error
+ )
+ if level == 0 {
+ // Read data block from the target file since level 0 includes hashes
+ // of blocks in the input data.
+ n, err = params.File.ReadAt(buf, i*layout.blockSize)
+ } else {
+ // Read data block from the tree file since levels higher than 0 are
+ // hashing the lower level hashes.
+ n, err = params.TreeReader.ReadAt(buf, layout.blockOffset(level-1, i))
+ }
+
+ // err is populated as long as the bytes read is smaller than the buffer
+ // size. This could be the case if we are reading the last block, and
+ // break in that case. If this is the last block, the end of the block
+ // will be zero-padded.
+ if n == 0 && err == io.EOF {
+ break
+ } else if err != nil && err != io.EOF {
+ return nil, err
+ }
+ // Hash the bytes in buf.
+ digest, err := hashData(buf, params.HashAlgorithms)
+ if err != nil {
+ return nil, err
+ }
+
+ if level == layout.rootLevel() {
+ root = digest
+ }
+
+ // Write the generated hash to the end of the tree file.
+ if _, err = params.TreeWriter.Write(digest[:]); err != nil {
+ return nil, err
+ }
+ }
+ // If the generated digests do not round up to a block, zero-padding the
+ // remaining of the last block. But no need to do so for root.
+ if level != layout.rootLevel() && numBlocks%layout.hashesPerBlock() != 0 {
+ zeroBuf := make([]byte, layout.blockSize-(numBlocks%layout.hashesPerBlock())*layout.digestSize)
+ if _, err := params.TreeWriter.Write(zeroBuf[:]); err != nil {
+ return nil, err
+ }
+ }
+ numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock()
+ }
+ descriptor.RootHash = root
+ return hashData([]byte(descriptor.String()), params.HashAlgorithms)
+}
+
+// VerifyParams contains the params used to verify a portion of a file against
+// a Merkle tree.
+type VerifyParams struct {
+ // Out will be filled with verified data.
+ Out io.Writer
+ // File is a handler on the file to be verified.
+ File io.ReaderAt
+ // tree is a handler on the Merkle tree used to verify file.
+ Tree io.ReaderAt
+ // Size is the size of the file.
+ Size int64
+ // Name is the name of the target file.
+ Name string
+ // Mode is the mode of the target file.
+ Mode uint32
+ // UID is the user ID of the target file.
+ UID uint32
+ // GID is the group ID of the target file.
+ GID uint32
+ // Children is a map of children names for a directory. It should be
+ // empty for a regular file.
+ Children map[string]struct{}
+ // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink.
+ SymlinkTarget string
+ // HashAlgorithms is the algorithms used to hash data.
+ HashAlgorithms int
+ // ReadOffset is the offset of the data range to be verified.
+ ReadOffset int64
+ // ReadSize is the size of the data range to be verified.
+ ReadSize int64
+ // Expected is a trusted hash for the file. It is compared with the
+ // calculated root hash to verify the content.
+ Expected []byte
+ // DataAndTreeInSameFile is true if data and Merkle tree are in the same
+ // file, or false if Merkle tree is a separate file from data.
+ DataAndTreeInSameFile bool
+}
+
+// verifyMetadata verifies the metadata by hashing a descriptor that contains
+// the metadata and compare the generated hash with expected.
+//
+// For verifyMetadata, params.data is not needed. It only accesses params.tree
+// for the raw root hash.
+func verifyMetadata(params *VerifyParams, layout *Layout) error {
+ var root []byte
+ // Only read the root hash if we expect that the file is not a symlink and its
+ // Merkle tree file is non-empty.
+ if params.Size != 0 && params.SymlinkTarget == "" {
+ root = make([]byte, layout.digestSize)
+ if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil {
+ return fmt.Errorf("failed to read root hash: %w", err)
+ }
+ }
+ descriptor := VerityDescriptor{
+ Name: params.Name,
+ FileSize: params.Size,
+ Mode: params.Mode,
+ UID: params.UID,
+ GID: params.GID,
+ Children: params.Children,
+ SymlinkTarget: params.SymlinkTarget,
+ RootHash: root,
+ }
+ return descriptor.verify(params.Expected, params.HashAlgorithms)
+}
+
+// Verify verifies the content read from data with offset. The content is
+// verified against tree. If content spans across multiple blocks, each block is
+// verified. Verification fails if the hash of the data does not match the tree
+// at any level, or if the final root hash does not match expected.
+// Once the data is verified, it will be written using params.Out.
+//
+// Verify checks for both target file content and metadata. If readSize is 0,
+// only metadata is checked.
+func Verify(params *VerifyParams) (int64, error) {
+ if params.ReadSize < 0 {
+ return 0, fmt.Errorf("unexpected read size: %d", params.ReadSize)
+ }
+ layout, err := InitLayout(int64(params.Size), params.HashAlgorithms, params.DataAndTreeInSameFile)
+ if err != nil {
+ return 0, err
+ }
+ if params.ReadSize == 0 {
+ return 0, verifyMetadata(params, &layout)
+ }
+
+ // Calculate the index of blocks that includes the target range in input
+ // data.
+ firstDataBlock := params.ReadOffset / layout.blockSize
+ lastDataBlock := (params.ReadOffset + params.ReadSize - 1) / layout.blockSize
+
+ buf := make([]byte, layout.blockSize)
+ var readErr error
+ total := int64(0)
+ for i := firstDataBlock; i <= lastDataBlock; i++ {
+ // Read a block that includes all or part of target range in
+ // input data.
+ bytesRead, err := params.File.ReadAt(buf, i*layout.blockSize)
+ readErr = err
+ // If at the end of input data and all previous blocks are
+ // verified, return the verified input data and EOF.
+ if readErr == io.EOF && bytesRead == 0 {
+ break
+ }
+ if readErr != nil && readErr != io.EOF {
+ return 0, fmt.Errorf("read from data failed: %w", err)
+ }
+ // If this is the end of file, zero the remaining bytes in buf,
+ // otherwise they are still from the previous block.
+ // TODO(b/162908070): Investigate possible issues with zero
+ // padding the data.
+ if bytesRead < len(buf) {
+ for j := bytesRead; j < len(buf); j++ {
+ buf[j] = 0
+ }
+ }
+ descriptor := VerityDescriptor{
+ Name: params.Name,
+ FileSize: params.Size,
+ Mode: params.Mode,
+ UID: params.UID,
+ GID: params.GID,
+ SymlinkTarget: params.SymlinkTarget,
+ Children: params.Children,
+ }
+ if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.HashAlgorithms, params.Expected); err != nil {
+ return 0, err
+ }
+
+ // startOff is the beginning of the read range within the
+ // current data block. Note that for all blocks other than the
+ // first, startOff should be 0.
+ startOff := int64(0)
+ if i == firstDataBlock {
+ startOff = params.ReadOffset % layout.blockSize
+ }
+ // endOff is the end of the read range within the current data
+ // block. Note that for all blocks other than the last, endOff
+ // should be the block size.
+ endOff := layout.blockSize
+ if i == lastDataBlock {
+ endOff = (params.ReadOffset+params.ReadSize-1)%layout.blockSize + 1
+ }
+ // If the provided size exceeds the end of input data, we should
+ // only copy the parts in buf that's part of input data.
+ if startOff > int64(bytesRead) {
+ startOff = int64(bytesRead)
+ }
+ if endOff > int64(bytesRead) {
+ endOff = int64(bytesRead)
+ }
+ n, err := params.Out.Write(buf[startOff:endOff])
+ if err != nil {
+ return total, err
+ }
+ total += int64(n)
+
+ }
+ return total, readErr
+}
+
+// verifyBlock verifies a block against tree. index is the number of block in
+// original data. The block is verified through each level of the tree. It
+// fails if the calculated hash from block is different from any level of
+// hashes stored in tree. And the final root hash is compared with
+// expected.
+func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, hashAlgorithms int, expected []byte) error {
+ if len(dataBlock) != int(layout.blockSize) {
+ return fmt.Errorf("incorrect block size")
+ }
+
+ expectedDigest := make([]byte, layout.digestSize)
+ treeBlock := make([]byte, layout.blockSize)
+ var digest []byte
+ for level := 0; level < layout.numLevels(); level++ {
+ // Calculate hash.
+ if level == 0 {
+ h, err := hashData(dataBlock, hashAlgorithms)
+ if err != nil {
+ return err
+ }
+ digest = h
+ } else {
+ // Read a block in previous level that contains the
+ // hash we just generated, and generate a next level
+ // hash from it.
+ if _, err := tree.ReadAt(treeBlock, layout.blockOffset(level-1, blockIndex)); err != nil {
+ return err
+ }
+ h, err := hashData(treeBlock, hashAlgorithms)
+ if err != nil {
+ return err
+ }
+ digest = h
+ }
+
+ // Read the digest for the current block and store in
+ // expectedDigest.
+ if _, err := tree.ReadAt(expectedDigest, layout.digestOffset(level, blockIndex)); err != nil {
+ return err
+ }
+
+ if !bytes.Equal(digest, expectedDigest) {
+ return fmt.Errorf("verification failed")
+ }
+ blockIndex = blockIndex / layout.hashesPerBlock()
+ }
+
+ // Verification for the tree succeeded. Now hash the descriptor with
+ // the root hash and compare it with expected.
+ descriptor.RootHash = digest
+ return descriptor.verify(expected, hashAlgorithms)
+}
diff --git a/pkg/merkletree/merkletree_state_autogen.go b/pkg/merkletree/merkletree_state_autogen.go
new file mode 100644
index 000000000..e5670cde4
--- /dev/null
+++ b/pkg/merkletree/merkletree_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package merkletree
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
new file mode 100644
index 000000000..6cb1a23e0
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -0,0 +1,1109 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/merkletree"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // All files should be read-only.
+ return nil
+}
+
+var dentrySlicePool = sync.Pool{
+ New: func() interface{} {
+ ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+ return &ds
+ },
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+ if ds == nil {
+ ds = dentrySlicePool.Get().(*[]*dentry)
+ }
+ *ds = append(*ds, d)
+ return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+ // Allow dentries to be GC'd.
+ for i := range *ds {
+ (*ds)[i] = nil
+ }
+ *ds = (*ds)[:0]
+ dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ fs.renameMu.RUnlock()
+ if *ds == nil {
+ return
+ }
+ if len(**ds) != 0 {
+ fs.renameMu.Lock()
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ }
+ putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ if *ds == nil {
+ fs.renameMu.Unlock()
+ return
+ }
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ putDentrySlice(*ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+afterSymlink:
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
+ }
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return d.parent, nil
+ }
+ child, err := fs.getChildLocked(ctx, d, name, ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return child, nil
+}
+
+// verifyChildLocked verifies the hash of child against the already verified
+// hash of the parent to ensure the child is expected. verifyChild triggers a
+// sentry panic if unexpected modifications to the file system are detected. In
+// ErrorOnViolation mode it returns a syserror instead.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+//
+// TODO(b/166474175): Investigate all possible errors returned in this
+// function, and make sure we differentiate all errors that indicate unexpected
+// modifications to the file system from the ones that are not harmful.
+func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ // Get the path to the child dentry. This is only used to provide path
+ // information in failure case.
+ childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ fs.verityMu.RLock()
+ defer fs.verityMu.RUnlock()
+ // Read the offset of the child from the extended attributes of the
+ // corresponding Merkle tree file.
+ // This is the offset of the hash for child in its parent's Merkle tree
+ // file.
+ off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.lowerMerkleVD,
+ Start: child.lowerMerkleVD,
+ }, &vfs.GetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the file or the xattr does not
+ // exist, it indicates unexpected modifications to the file system.
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+ // The offset xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ offset, err := strconv.Atoi(off)
+ if err != nil {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+ }
+
+ // Open parent Merkle tree file to read and verify child's hash.
+ parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerMerkleVD,
+ Start: parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The parent Merkle tree file should have been created. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // dataSize is the size of raw data for the Merkle tree. For a file,
+ // dataSize is the size of the whole file. For a directory, dataSize is
+ // the size of all its children's hashes.
+ dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the file or the xattr does not
+ // exist, it indicates unexpected modifications to the file system.
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ parentSize, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+ }
+
+ fdReader := FileReadWriteSeeker{
+ FD: parentMerkleFD,
+ Ctx: ctx,
+ }
+
+ parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ }, &vfs.StatOptions{})
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // Since we are verifying against a directory Merkle tree, buf should
+ // contain the hash of the children in the parent Merkle tree when
+ // Verify returns with success.
+ var buf bytes.Buffer
+ parent.hashMu.RLock()
+ _, err = merkletree.Verify(&merkletree.VerifyParams{
+ Out: &buf,
+ File: &fdReader,
+ Tree: &fdReader,
+ Size: int64(parentSize),
+ Name: parent.name,
+ Mode: uint32(parentStat.Mode),
+ UID: parentStat.UID,
+ GID: parentStat.GID,
+ Children: parent.childrenNames,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: fs.alg.toLinuxHashAlg(),
+ ReadOffset: int64(offset),
+ ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())),
+ Expected: parent.hash,
+ DataAndTreeInSameFile: true,
+ })
+ parent.hashMu.RUnlock()
+ if err != nil && err != io.EOF {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+ }
+
+ // Cache child hash when it's verified the first time.
+ child.hashMu.Lock()
+ if len(child.hash) == 0 {
+ child.hash = buf.Bytes()
+ }
+ child.hashMu.Unlock()
+ return child, nil
+}
+
+// verifyStatAndChildrenLocked verifies the stat and children names against the
+// verified hash. The mode/uid/gid and childrenNames of the file is cached
+// after verified.
+//
+// Preconditions: d.dirMu must be locked.
+func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ // Get the path to the child dentry. This is only used to provide path
+ // information in failure case.
+ childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+ if err != nil {
+ return err
+ }
+
+ fs.verityMu.RLock()
+ defer fs.verityMu.RUnlock()
+
+ fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err == syserror.ENOENT {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return err
+ }
+
+ merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ if err == syserror.ENODATA {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err))
+ }
+ if err != nil {
+ return err
+ }
+
+ size, err := strconv.Atoi(merkleSize)
+ if err != nil {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+ }
+
+ if d.isDir() && len(d.childrenNames) == 0 {
+ childrenOffString, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: childrenOffsetXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ if err == syserror.ENODATA {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenOffsetXattr, childPath, err))
+ }
+ if err != nil {
+ return err
+ }
+ childrenOffset, err := strconv.Atoi(childrenOffString)
+ if err != nil {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err))
+ }
+
+ childrenSizeString, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: childrenSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ if err == syserror.ENODATA {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenSizeXattr, childPath, err))
+ }
+ if err != nil {
+ return err
+ }
+ childrenSize, err := strconv.Atoi(childrenSizeString)
+ if err != nil {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err))
+ }
+
+ childrenNames := make([]byte, childrenSize)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(childrenOffset), vfs.ReadOptions{}); err != nil {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to read children map for %s: %v", childPath, err))
+ }
+
+ if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames of %s: %v", childPath, err))
+ }
+ }
+
+ fdReader := FileReadWriteSeeker{
+ FD: fd,
+ Ctx: ctx,
+ }
+
+ var buf bytes.Buffer
+ d.hashMu.RLock()
+ params := &merkletree.VerifyParams{
+ Out: &buf,
+ Tree: &fdReader,
+ Size: int64(size),
+ Name: d.name,
+ Mode: uint32(stat.Mode),
+ UID: stat.UID,
+ GID: stat.GID,
+ Children: d.childrenNames,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: fs.alg.toLinuxHashAlg(),
+ ReadOffset: 0,
+ // Set read size to 0 so only the metadata is verified.
+ ReadSize: 0,
+ Expected: d.hash,
+ DataAndTreeInSameFile: false,
+ }
+ d.hashMu.RUnlock()
+ if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
+ params.DataAndTreeInSameFile = true
+ }
+
+ if d.isSymlink() {
+ target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ })
+ if err != nil {
+ return err
+ }
+ params.SymlinkTarget = target
+ }
+
+ if _, err := merkletree.Verify(params); err != nil && err != io.EOF {
+ return alertIntegrityViolation(fmt.Sprintf("Verification stat for %s failed: %v", childPath, err))
+ }
+ d.mode = uint32(stat.Mode)
+ d.uid = stat.UID
+ d.gid = stat.GID
+ d.size = uint32(size)
+ d.symlinkTarget = params.SymlinkTarget
+ return nil
+}
+
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+ if child, ok := parent.children[name]; ok {
+ // If verity is enabled on child, we should check again whether
+ // the file and the corresponding Merkle tree are as expected,
+ // in order to catch deletion/renaming after the last time it's
+ // accessed.
+ if child.verityEnabled() {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ // Get the path to the child dentry. This is only used
+ // to provide path information in failure case.
+ path, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ childVD, err := parent.getLowerAt(ctx, vfsObj, name)
+ if err == syserror.ENOENT {
+ // The file was previously accessed. If the
+ // file does not exist now, it indicates an
+ // unexpected modification to the file system.
+ return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", path))
+ }
+ if err != nil {
+ return nil, err
+ }
+ defer childVD.DecRef(ctx)
+
+ childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+ // The Merkle tree file was previous accessed. If it
+ // does not exist now, it indicates an unexpected
+ // modification to the file system.
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ defer childMerkleVD.DecRef(ctx)
+ }
+
+ // If enabling verification on files/directories is not allowed
+ // during runtime, all cached children are already verified. If
+ // runtime enable is allowed and the parent directory is
+ // enabled, we should verify the child hash here because it may
+ // be cached before enabled.
+ if fs.allowRuntimeEnable {
+ if parent.verityEnabled() {
+ if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil {
+ return nil, err
+ }
+ }
+ if child.verityEnabled() {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.lowerVD,
+ Start: child.lowerVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ return nil, err
+ }
+ if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil {
+ return nil, err
+ }
+ }
+ }
+ return child, nil
+ }
+ child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+ if err != nil {
+ return nil, err
+ }
+ if parent.children == nil {
+ parent.children = make(map[string]*dentry)
+ }
+ parent.children[name] = child
+ // child's refcount is initially 0, so it may be dropped after traversal.
+ *ds = appendDentry(*ds, child)
+ return child, nil
+}
+
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ if parent.verityEnabled() {
+ if _, ok := parent.childrenNames[name]; !ok {
+ return nil, syserror.ENOENT
+ }
+ }
+
+ parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ childVD, err := parent.getLowerAt(ctx, vfsObj, name)
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("file %s expected but not found", parentPath+"/"+name))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // The dentry needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity child dentry is successfully created.
+ defer childVD.DecRef(ctx)
+
+ childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+ if err == syserror.ENOENT {
+ if !fs.allowRuntimeEnable {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath+"/"+name))
+ }
+ childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ Path: fspath.Parse(merklePrefix + name),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
+ })
+ if err != nil {
+ return nil, err
+ }
+ childMerkleFD.DecRef(ctx)
+ childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+ if err != nil {
+ return nil, err
+ }
+ }
+ if err != nil && err != syserror.ENOENT {
+ return nil, err
+ }
+
+ // Clear the Merkle tree file if they are to be generated at runtime.
+ // TODO(b/182315468): Optimize the Merkle tree generate process to
+ // allow only updating certain files/directories.
+ if fs.allowRuntimeEnable {
+ childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childMerkleVD,
+ Start: childMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_TRUNC,
+ Mode: 0644,
+ })
+ if err != nil {
+ return nil, err
+ }
+ childMerkleFD.DecRef(ctx)
+ }
+
+ // The dentry needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity child dentry is successfully created.
+ defer childMerkleVD.DecRef(ctx)
+
+ mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ child := fs.newDentry()
+ child.lowerVD = childVD
+ child.lowerMerkleVD = childMerkleVD
+
+ // Increase the reference for both childVD and childMerkleVD as they are
+ // held by child. If this function fails and the child is destroyed, the
+ // references will be decreased in destroyLocked.
+ childVD.IncRef()
+ childMerkleVD.IncRef()
+
+ parent.IncRef()
+ child.parent = parent
+ child.name = name
+
+ child.mode = uint32(stat.Mode)
+ child.uid = stat.UID
+ child.gid = stat.GID
+ child.childrenNames = make(map[string]struct{})
+
+ // Verify child hash. This should always be performed unless in
+ // allowRuntimeEnable mode and the parent directory hasn't been enabled
+ // yet.
+ if parent.verityEnabled() {
+ if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil {
+ child.destroyLocked(ctx)
+ return nil, err
+ }
+ }
+ if child.verityEnabled() {
+ if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil {
+ child.destroyLocked(ctx)
+ return nil, err
+ }
+ }
+
+ return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+ for !rp.Final() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ for !rp.Done() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+ // Verity file system is read-only.
+ if ats&vfs.MayWrite != 0 {
+ return syserror.EROFS
+ }
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return d.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if opts.CheckSearchable {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Verity fs is read-only.
+ if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 {
+ return nil, syserror.EROFS
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+ start := rp.Start().Impl().(*dentry)
+ if rp.Done() {
+ return start.openLocked(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+ // Open existing child or follow symlink.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds)
+ parent.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Users should not open the Merkle tree files. Those are for verity fs
+ // use only.
+ if strings.Contains(d.name, merklePrefix) {
+ return nil, syserror.EPERM
+ }
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+
+ // Verity fs is read-only.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EROFS
+ }
+
+ // Get the path to the target file. This is only used to provide path
+ // information in failure case.
+ path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ // Open the file in the underlying file system.
+ lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, opts)
+
+ // The file should exist, as we succeeded in finding its dentry. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // lowerFD needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity FD is successfully created.
+ defer lowerFD.DecRef(ctx)
+
+ // Open the Merkle tree file corresponding to the current file/directory
+ // to be used later for verifying Read/Walk.
+ merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The Merkle tree file should exist, as we succeeded in finding its
+ // dentry. If it's missing, it indicates an unexpected modification to
+ // the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // merkleReader needs to be cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is successfully created.
+ defer merkleReader.DecRef(ctx)
+
+ lowerFlags := lowerFD.StatusFlags()
+ lowerFDOpts := lowerFD.Options()
+ var merkleWriter *vfs.FileDescription
+ var parentMerkleWriter *vfs.FileDescription
+
+ // Only open the Merkle tree files for write if in allowRuntimeEnable
+ // mode.
+ if d.fs.allowRuntimeEnable {
+ merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+ // merkleWriter is cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is created successfully.
+ defer merkleWriter.DecRef(ctx)
+
+ if d.parent != nil {
+ parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.parent.lowerMerkleVD,
+ Start: d.parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD)
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath))
+ }
+ return nil, err
+ }
+ // parentMerkleWriter is cleaned up if any error occurs. IncRef
+ // will be called if a verity FD is created successfully.
+ defer parentMerkleWriter.DecRef(ctx)
+ }
+ }
+
+ fd := &fileDescription{
+ d: d,
+ lowerFD: lowerFD,
+ merkleReader: merkleReader,
+ merkleWriter: merkleWriter,
+ parentMerkleWriter: parentMerkleWriter,
+ isDir: d.isDir(),
+ }
+
+ if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil {
+ return nil, err
+ }
+ lowerFD.IncRef()
+ merkleReader.IncRef()
+ if merkleWriter != nil {
+ merkleWriter.IncRef()
+ }
+ if parentMerkleWriter != nil {
+ parentMerkleWriter.IncRef()
+ }
+ return &fd.vfsfd, err
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ return d.readlink(ctx)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should
+// be verified.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+
+ var stat linux.Statx
+ stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, &opts)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ d.dirMu.Lock()
+ if d.verityEnabled() {
+ if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ d.dirMu.Unlock()
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ // TODO(b/159261227): Implement StatFSAt.
+ return linux.Statfs{}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+ return nil, err
+ }
+ return nil, syserror.ECONNREFUSED
+}
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ lowerVD := d.lowerVD
+ return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, size)
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ lowerVD := d.lowerVD
+ return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &opts)
+}
+
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ mnt := vd.Mount()
+ d := vd.Dentry().Impl().(*dentry)
+ for {
+ if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+ return vfs.PrependPathAtVFSRootError{}
+ }
+ if &d.vfsd == mnt.Root() {
+ return nil
+ }
+ if d.parent == nil {
+ return vfs.PrependPathAtNonMountRootError{}
+ }
+ b.PrependComponent(d.name)
+ d = d.parent
+ }
+}
diff --git a/pkg/sentry/fsimpl/verity/save_restore.go b/pkg/sentry/fsimpl/verity/save_restore.go
new file mode 100644
index 000000000..46b064342
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+ if atomic.LoadInt64(&d.refs) != -1 {
+ refsvfs2.Register(d)
+ }
+}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
new file mode 100644
index 000000000..2f97324a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -0,0 +1,1402 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package verity provides a filesystem implementation that is a wrapper of
+// another file system.
+// The verity file system provides integrity check for the underlying file
+// system by providing verification for path traversals and each read.
+// The verity file system is read-only, except for one case: when
+// allowRuntimeEnable is true, additional Merkle files can be generated using
+// the FS_IOC_ENABLE_VERITY ioctl.
+//
+// Lock order:
+//
+// filesystem.renameMu
+// dentry.dirMu
+// fileDescription.mu
+// filesystem.verityMu
+// dentry.hashMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
+package verity
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "math"
+ "strconv"
+ "strings"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/merkletree"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ // Name is the default filesystem name.
+ Name = "verity"
+
+ // merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+ // tree file for "/foo" is "/.merkle.verity.foo".
+ merklePrefix = ".merkle.verity."
+
+ // merkleRootPrefix is the prefix of the Merkle tree root file. This
+ // needs to be different from merklePrefix to avoid name collision.
+ merkleRootPrefix = ".merkleroot.verity."
+
+ // merkleOffsetInParentXattr is the extended attribute name specifying the
+ // offset of the child hash in its parent's Merkle tree.
+ merkleOffsetInParentXattr = "user.merkle.offset"
+
+ // merkleSizeXattr is the extended attribute name specifying the size of data
+ // hashed by the corresponding Merkle tree. For a regular file, this is the
+ // file size. For a directory, this is the size of all its children's hashes.
+ merkleSizeXattr = "user.merkle.size"
+
+ // childrenOffsetXattr is the extended attribute name specifying the
+ // names of the offset of the serialized children names in the Merkle
+ // tree file.
+ childrenOffsetXattr = "user.merkle.childrenOffset"
+
+ // childrenSizeXattr is the extended attribute name specifying the size
+ // of the serialized children names.
+ childrenSizeXattr = "user.merkle.childrenSize"
+
+ // sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+ // extended attributes. The maximum value of a 32 bit integer has 10 digits.
+ sizeOfStringInt32 = 10
+)
+
+var (
+ // action specifies the action towards detected violation.
+ action ViolationAction
+
+ // verityMu synchronizes concurrent operations that enable verity and perform
+ // verification checks.
+ verityMu sync.RWMutex
+)
+
+// HashAlgorithm is a type specifying the algorithm used to hash the file
+// content.
+type HashAlgorithm int
+
+// ViolationAction is a type specifying the action when an integrity violation
+// is detected.
+type ViolationAction int
+
+const (
+ // PanicOnViolation terminates the sentry on detected violation.
+ PanicOnViolation ViolationAction = 0
+ // ErrorOnViolation returns an error from the violating system call on
+ // detected violation.
+ ErrorOnViolation = 1
+)
+
+// Currently supported hashing algorithms include SHA256 and SHA512.
+const (
+ SHA256 HashAlgorithm = iota
+ SHA512
+)
+
+func (alg HashAlgorithm) toLinuxHashAlg() int {
+ switch alg {
+ case SHA256:
+ return linux.FS_VERITY_HASH_ALG_SHA256
+ case SHA512:
+ return linux.FS_VERITY_HASH_ALG_SHA512
+ default:
+ return 0
+ }
+}
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
+type filesystem struct {
+ vfsfs vfs.Filesystem
+
+ // creds is a copy of the filesystem's creator's credentials, which are
+ // used for accesses to the underlying file system. creds is immutable.
+ creds *auth.Credentials
+
+ // allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY
+ // to build Merkle trees in the verity file system is allowed. If this
+ // is false, no new Merkle trees can be built, and only the files that
+ // had Merkle trees before startup (e.g. from a host filesystem mounted
+ // with gofer fs) can be verified.
+ allowRuntimeEnable bool
+
+ // lowerMount is the underlying file system mount.
+ lowerMount *vfs.Mount
+
+ // rootDentry is the mount root Dentry for this file system, which
+ // stores the root hash of the whole file system in bytes.
+ rootDentry *dentry
+
+ // alg is the algorithms used to hash the files in the verity file
+ // system.
+ alg HashAlgorithm
+
+ // renameMu synchronizes renaming with non-renaming operations in order
+ // to ensure consistent lock ordering between dentry.dirMu in different
+ // dentries.
+ renameMu sync.RWMutex `state:"nosave"`
+
+ // verityMu synchronizes enabling verity files, protects files or
+ // directories from being enabled by different threads simultaneously.
+ // It also ensures that verity does not access files that are being
+ // enabled.
+ //
+ // Also, the directory Merkle trees depends on the generated trees of
+ // its children. So they shouldn't be enabled the same time. This lock
+ // is for the whole file system to ensure that no more than one file is
+ // enabled the same time.
+ verityMu sync.RWMutex `state:"nosave"`
+}
+
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
+type InternalFilesystemOptions struct {
+ // RootMerkleFileName is the name of the verity root Merkle tree file.
+ RootMerkleFileName string
+
+ // LowerName is the name of the filesystem wrapped by verity fs.
+ LowerName string
+
+ // Alg is the algorithms used to hash the files in the verity file
+ // system.
+ Alg HashAlgorithm
+
+ // RootHash is the root hash of the overall verity file system.
+ RootHash []byte
+
+ // AllowRuntimeEnable specifies whether the verity file system allows
+ // enabling verification for files (i.e. building Merkle trees) during
+ // runtime.
+ AllowRuntimeEnable bool
+
+ // LowerGetFSOptions is the file system option for the lower layer file
+ // system wrapped by verity file system.
+ LowerGetFSOptions vfs.GetFilesystemOptions
+
+ // Action specifies the action on an integrity violation.
+ Action ViolationAction
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// alertIntegrityViolation alerts a violation of integrity, which usually means
+// unexpected modification to the file system is detected. In ErrorOnViolation
+// mode, it returns EIO, otherwise it panic.
+func alertIntegrityViolation(msg string) error {
+ if action == ErrorOnViolation {
+ return syserror.EIO
+ }
+ panic(msg)
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+ if !ok {
+ ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+ return nil, nil, syserror.EINVAL
+ }
+ action = iopts.Action
+
+ // Mount the lower file system. The lower file system is wrapped inside
+ // verity, and should not be exposed or connected.
+ mopts := &vfs.MountOptions{
+ GetFilesystemOptions: iopts.LowerGetFSOptions,
+ InternalMount: true,
+ }
+ mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ fs := &filesystem{
+ creds: creds.Fork(),
+ alg: iopts.Alg,
+ lowerMount: mnt,
+ allowRuntimeEnable: iopts.AllowRuntimeEnable,
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+ // Construct the root dentry.
+ d := fs.newDentry()
+ d.refs = 1
+ lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+ lowerVD.IncRef()
+ d.lowerVD = lowerVD
+
+ rootMerkleName := merkleRootPrefix + iopts.RootMerkleFileName
+
+ lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.GetDentryOptions{})
+
+ // If runtime enable is allowed, the root merkle tree may be absent. We
+ // should create the tree file.
+ if err == syserror.ENOENT && fs.allowRuntimeEnable {
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
+ })
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ lowerMerkleFD.DecRef(ctx)
+ lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.GetDentryOptions{})
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ } else if err != nil {
+ // Failed to get dentry for the root Merkle file. This
+ // indicates an unexpected modification that removed/renamed
+ // the root Merkle file, or it's never generated.
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, alertIntegrityViolation("Failed to find root Merkle file")
+ }
+
+ // Clear the Merkle tree file if they are to be generated at runtime.
+ // TODO(b/182315468): Optimize the Merkle tree generate process to
+ // allow only updating certain files/directories.
+ if fs.allowRuntimeEnable {
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_TRUNC,
+ Mode: 0644,
+ })
+ if err != nil {
+ return nil, nil, err
+ }
+ lowerMerkleFD.DecRef(ctx)
+ }
+
+ d.lowerMerkleVD = lowerMerkleVD
+
+ // Get metadata from the underlying file system.
+ const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.StatOptions{
+ Mask: statMask,
+ })
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+
+ d.mode = uint32(stat.Mode)
+ d.uid = stat.UID
+ d.gid = stat.GID
+ d.hash = make([]byte, len(iopts.RootHash))
+ d.childrenNames = make(map[string]struct{})
+
+ if !d.isDir() {
+ ctx.Warningf("verity root must be a directory")
+ return nil, nil, syserror.EINVAL
+ }
+
+ if !fs.allowRuntimeEnable {
+ // Get children names from the underlying file system.
+ offString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.GetXattrOptions{
+ Name: childrenOffsetXattr,
+ Size: sizeOfStringInt32,
+ })
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenOffsetXattr, err))
+ }
+ if err != nil {
+ return nil, nil, err
+ }
+
+ off, err := strconv.Atoi(offString)
+ if err != nil {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err))
+ }
+
+ sizeString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.GetXattrOptions{
+ Name: childrenSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenSizeXattr, err))
+ }
+ if err != nil {
+ return nil, nil, err
+ }
+ size, err := strconv.Atoi(sizeString)
+ if err != nil {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err))
+ }
+
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err == syserror.ENOENT {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to open root Merkle file: %v", err))
+ }
+ if err != nil {
+ return nil, nil, err
+ }
+
+ childrenNames := make([]byte, size)
+ if _, err := lowerMerkleFD.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(off), vfs.ReadOptions{}); err != nil {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to read root children map: %v", err))
+ }
+
+ if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil {
+ return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err))
+ }
+
+ if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
+ return nil, nil, err
+ }
+ }
+
+ d.hashMu.Lock()
+ copy(d.hash, iopts.RootHash)
+ d.hashMu.Unlock()
+ d.vfsd.Init(d)
+
+ fs.rootDentry = d
+
+ return &fs.vfsfs, &d.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ fs.lowerMount.DecRef(ctx)
+}
+
+// MountOptions implements vfs.FilesystemImpl.MountOptions.
+func (fs *filesystem) MountOptions() string {
+ return ""
+}
+
+// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
+type dentry struct {
+ vfsd vfs.Dentry
+
+ refs int64
+
+ // fs is the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // mode, uid, gid and size are the file mode, owner, group, and size of
+ // the file in the underlying file system. They are set when a dentry
+ // is initialized, and never modified.
+ mode uint32
+ uid uint32
+ gid uint32
+ size uint32
+
+ // parent is the dentry corresponding to this dentry's parent directory.
+ // name is this dentry's name in parent. If this dentry is a filesystem
+ // root, parent is nil and name is the empty string. parent and name are
+ // protected by fs.renameMu.
+ parent *dentry
+ name string
+
+ // If this dentry represents a directory, children maps the names of
+ // children for which dentries have been instantiated to those dentries,
+ // and dirents (if not nil) is a cache of dirents as returned by
+ // directoryFDs representing this directory. children is protected by
+ // dirMu.
+ dirMu sync.Mutex `state:"nosave"`
+ children map[string]*dentry
+
+ // childrenNames stores the name of all children of the dentry. This is
+ // used by verity to check whether a child is expected. This is only
+ // populated by enableVerity. childrenNames is also protected by dirMu.
+ childrenNames map[string]struct{}
+
+ // lowerVD is the VirtualDentry in the underlying file system. It is
+ // never modified after initialized.
+ lowerVD vfs.VirtualDentry
+
+ // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
+ // in the underlying file system. It is never modified after
+ // initialized.
+ lowerMerkleVD vfs.VirtualDentry
+
+ // symlinkTarget is the target path of a symlink file in the underlying filesystem.
+ symlinkTarget string
+
+ // hash is the calculated hash for the current file or directory. hash
+ // is protected by hashMu.
+ hashMu sync.RWMutex `state:"nosave"`
+ hash []byte
+}
+
+// newDentry creates a new dentry representing the given verity file. The
+// dentry initially has no references; it is the caller's responsibility to set
+// the dentry's reference count and/or call dentry.destroy() as appropriate.
+// The dentry is initially invalid in that it contains no underlying dentry;
+// the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+ d := &dentry{
+ fs: fs,
+ }
+ d.vfsd.Init(d)
+ refsvfs2.Register(d)
+ return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+ r := atomic.AddInt64(&d.refs, 1)
+ if d.LogRefs() {
+ refsvfs2.LogIncRef(d, r)
+ }
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+ for {
+ r := atomic.LoadInt64(&d.refs)
+ if r <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
+ if d.LogRefs() {
+ refsvfs2.LogTryIncRef(d, r+1)
+ }
+ return true
+ }
+ }
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+ r := atomic.AddInt64(&d.refs, -1)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
+ if r == 0 {
+ d.fs.renameMu.Lock()
+ d.checkDropLocked(ctx)
+ d.fs.renameMu.Unlock()
+ } else if r < 0 {
+ panic("verity.dentry.DecRef() called without holding a reference")
+ }
+}
+
+func (d *dentry) decRefLocked(ctx context.Context) {
+ r := atomic.AddInt64(&d.refs, -1)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
+ if r == 0 {
+ d.checkDropLocked(ctx)
+ } else if r < 0 {
+ panic("verity.dentry.decRefLocked() called without holding a reference")
+ }
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+ // Dentries with a positive reference count must be retained. Dentries
+ // with a negative reference count have already been destroyed.
+ if atomic.LoadInt64(&d.refs) != 0 {
+ return
+ }
+ // Refs is still zero; destroy it.
+ d.destroyLocked(ctx)
+ return
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("verity.dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("verity.dentry.destroyLocked() called with references on the dentry")
+ }
+
+ if d.lowerVD.Ok() {
+ d.lowerVD.DecRef(ctx)
+ }
+ if d.lowerMerkleVD.Ok() {
+ d.lowerMerkleVD.DecRef(ctx)
+ }
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ if !d.vfsd.IsDead() {
+ delete(d.parent.children, d.name)
+ }
+ d.parent.dirMu.Unlock()
+ d.parent.decRefLocked(ctx)
+ }
+ refsvfs2.Unregister(d)
+}
+
+// RefType implements refsvfs2.CheckedObject.Type.
+func (d *dentry) RefType() string {
+ return "verity.dentry"
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+ return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
+// LogRefs implements refsvfs2.CheckedObject.LogRefs.
+//
+// This should only be set to true for debugging purposes, as it can generate an
+// extremely large amount of output and drastically degrade performance.
+func (d *dentry) LogRefs() bool {
+ return false
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+ //TODO(b/159261227): Implement InotifyWithParent.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ //TODO(b/159261227): Implement Watches.
+ return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {
+ //TODO(b/159261227): Implement OnZeroWatches.
+}
+
+func (d *dentry) isSymlink() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) isDir() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
+func (d *dentry) verityEnabled() bool {
+ d.hashMu.RLock()
+ defer d.hashMu.RUnlock()
+ return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
+// getLowerAt returns the dentry in the underlying file system, which is
+// represented by filename relative to d.
+func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) {
+ return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ Path: fspath.Parse(filename),
+ }, &vfs.GetDentryOptions{})
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ if d.verityEnabled() {
+ stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, &vfs.StatOptions{})
+ if err != nil {
+ return "", err
+ }
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ if err := d.fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
+ return "", err
+ }
+ return d.symlinkTarget, nil
+ }
+
+ return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ })
+}
+
+// FileDescription implements vfs.FileDescriptionImpl for verity fds.
+// FileDescription is a wrapper of the underlying lowerFD, with support to build
+// Merkle trees through the Linux fs-verity API to verify contents read from
+// lowerFD.
+//
+// +stateify savable
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+
+ // d is the corresponding dentry to the fileDescription.
+ d *dentry
+
+ // isDir specifies whehter the fileDescription points to a directory.
+ isDir bool
+
+ // lowerFD is the FileDescription corresponding to the file in the
+ // underlying file system.
+ lowerFD *vfs.FileDescription
+
+ // lowerMappable is the memmap.Mappable corresponding to this file in the
+ // underlying file system.
+ lowerMappable memmap.Mappable
+
+ // merkleReader is the read-only FileDescription corresponding to the
+ // Merkle tree file in the underlying file system.
+ merkleReader *vfs.FileDescription
+
+ // merkleWriter is the FileDescription corresponding to the Merkle tree
+ // file in the underlying file system for writing. This should only be
+ // used when allowRuntimeEnable is set to true.
+ merkleWriter *vfs.FileDescription
+
+ // parentMerkleWriter is the FileDescription of the Merkle tree for the
+ // directory that contains the current file/directory. This is only used
+ // if allowRuntimeEnable is set to true.
+ parentMerkleWriter *vfs.FileDescription
+
+ // off is the file offset. off is protected by mu.
+ mu sync.Mutex `state:"nosave"`
+ off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+ fd.lowerFD.DecRef(ctx)
+ fd.merkleReader.DecRef(ctx)
+ if fd.merkleWriter != nil {
+ fd.merkleWriter.DecRef(ctx)
+ }
+ if fd.parentMerkleWriter != nil {
+ fd.parentMerkleWriter.DecRef(ctx)
+ }
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ // TODO(b/162788573): Add integrity check for metadata.
+ stat, err := fd.lowerFD.Stat(ctx, opts)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ fd.d.dirMu.Lock()
+ if fd.d.verityEnabled() {
+ if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ fd.d.dirMu.Unlock()
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ // Verity files are read-only.
+ return syserror.EPERM
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ if !fd.d.isDir() {
+ return syserror.ENOTDIR
+ }
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ var ds []vfs.Dirent
+ err := fd.lowerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ // Do not include the Merkle tree files.
+ if strings.Contains(dirent.Name, merklePrefix) || strings.Contains(dirent.Name, merkleRootPrefix) {
+ return nil
+ }
+ if fd.d.verityEnabled() {
+ // Verify that the child is expected.
+ if dirent.Name != "." && dirent.Name != ".." {
+ if _, ok := fd.d.childrenNames[dirent.Name]; !ok {
+ return alertIntegrityViolation(fmt.Sprintf("Unexpected children %s", dirent.Name))
+ }
+ }
+ }
+ ds = append(ds, dirent)
+ return nil
+ }))
+
+ if err != nil {
+ return err
+ }
+
+ // The result should contain all children plus "." and "..".
+ if fd.d.verityEnabled() && len(ds) != len(fd.d.childrenNames)+2 {
+ return alertIntegrityViolation(fmt.Sprintf("Unexpected children number %d", len(ds)))
+ }
+
+ for fd.off < int64(len(ds)) {
+ if err := cb.Handle(ds[fd.off]); err != nil {
+ return err
+ }
+ fd.off++
+ }
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ n := int64(0)
+ switch whence {
+ case linux.SEEK_SET:
+ // use offset as specified
+ case linux.SEEK_CUR:
+ n = fd.off
+ case linux.SEEK_END:
+ n = int64(fd.d.size)
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset > math.MaxInt64-n {
+ return 0, syserror.EINVAL
+ }
+ offset += n
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a
+// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The
+// hash of the generated Merkle tree and the data size is returned. If fd
+// points to a regular file, the data is the content of the file. If fd points
+// to a directory, the data is all hashes of its children, written to the Merkle
+// tree file. If fd represents a symlink, the data is empty and nothing is written
+// to the Merkle tree file.
+//
+// Preconditions: fd.d.fs.verityMu must be locked.
+func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) {
+ fdReader := FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+ merkleReader := FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+ merkleWriter := FileReadWriteSeeker{
+ FD: fd.merkleWriter,
+ Ctx: ctx,
+ }
+
+ stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ params := &merkletree.GenerateParams{
+ TreeReader: &merkleReader,
+ TreeWriter: &merkleWriter,
+ Children: fd.d.childrenNames,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
+ Name: fd.d.name,
+ Mode: uint32(stat.Mode),
+ UID: stat.UID,
+ GID: stat.GID,
+ }
+
+ switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+ case linux.S_IFREG:
+ // For a regular file, generate a Merkle tree based on its
+ // content.
+ params.File = &fdReader
+ params.Size = int64(stat.Size)
+ params.DataAndTreeInSameFile = false
+ case linux.S_IFDIR:
+ // For a directory, generate a Merkle tree based on the hashes
+ // of its children that has already been written to the Merkle
+ // tree file.
+ merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ params.Size = int64(merkleStat.Size)
+ params.File = &merkleReader
+ params.DataAndTreeInSameFile = true
+ case linux.S_IFLNK:
+ // For a symlink, generate a Merkle tree file but do not write the root hash
+ // of the target file content to it. Return a hash of a VerityDescriptor object
+ // which includes the symlink target name.
+ target, err := fd.d.readlink(ctx)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ params.Size = int64(stat.Size)
+ params.DataAndTreeInSameFile = false
+ params.SymlinkTarget = target
+ default:
+ // TODO(b/167728857): Investigate whether and how we should
+ // enable other types of file.
+ return nil, 0, syserror.EINVAL
+ }
+ hash, err := merkletree.Generate(params)
+ return hash, uint64(params.Size), err
+}
+
+// recordChildrenLocked writes the names of fd's children into the
+// corresponding Merkle tree file, and saves the offset/size of the map into
+// xattrs.
+//
+// Preconditions:
+// * fd.d.fs.verityMu must be locked.
+// * fd.d.isDir() == true.
+func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error {
+ // Record the children names in the Merkle tree file.
+ childrenNames, err := json.Marshal(fd.d.childrenNames)
+ if err != nil {
+ return err
+ }
+
+ stat, err := fd.merkleWriter.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return err
+ }
+
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: childrenOffsetXattr,
+ Value: strconv.Itoa(int(stat.Size)),
+ }); err != nil {
+ return err
+ }
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: childrenSizeXattr,
+ Value: strconv.Itoa(len(childrenNames)),
+ }); err != nil {
+ return err
+ }
+
+ if _, err = fd.merkleWriter.Write(ctx, usermem.BytesIOSequence(childrenNames), vfs.WriteOptions{}); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and stores its hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
+ if !fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.EPERM
+ }
+
+ fd.d.fs.verityMu.Lock()
+ defer fd.d.fs.verityMu.Unlock()
+
+ // In allowRuntimeEnable mode, the underlying fd and read/write fd for
+ // the Merkle tree file should have all been initialized. For any file
+ // or directory other than the root, the parent Merkle tree file should
+ // have also been initialized.
+ if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) {
+ return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
+ }
+
+ hash, dataSize, err := fd.generateMerkleLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+
+ if fd.parentMerkleWriter != nil {
+ stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return 0, err
+ }
+
+ // Write the hash of fd to the parent directory's Merkle tree
+ // file, as it should be part of the parent Merkle tree data.
+ // parentMerkleWriter is open with O_APPEND, so it should write
+ // directly to the end of the file.
+ if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil {
+ return 0, err
+ }
+
+ // Record the offset of the hash of fd in parent directory's
+ // Merkle tree file.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Value: strconv.Itoa(int(stat.Size)),
+ }); err != nil {
+ return 0, err
+ }
+
+ // Add the current child's name to parent's childrenNames.
+ fd.d.parent.childrenNames[fd.d.name] = struct{}{}
+ }
+
+ // Record the size of the data being hashed for fd.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleSizeXattr,
+ Value: strconv.Itoa(int(dataSize)),
+ }); err != nil {
+ return 0, err
+ }
+
+ if fd.d.isDir() {
+ if err := fd.recordChildrenLocked(ctx); err != nil {
+ return 0, err
+ }
+ }
+ fd.d.hashMu.Lock()
+ fd.d.hash = hash
+ fd.d.hashMu.Unlock()
+ return 0, nil
+}
+
+// measureVerity returns the hash of fd, saved in verityDigest.
+func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hostarch.Addr) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ return 0, syserror.EINVAL
+ }
+ var metadata linux.DigestMetadata
+
+ fd.d.hashMu.RLock()
+ defer fd.d.hashMu.RUnlock()
+
+ // If allowRuntimeEnable is true, an empty fd.d.hash indicates that
+ // verity is not enabled for the file. If allowRuntimeEnable is false,
+ // this is an integrity violation because all files should have verity
+ // enabled, in which case fd.d.hash should be set.
+ if len(fd.d.hash) == 0 {
+ if fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.ENODATA
+ }
+ return 0, alertIntegrityViolation("Ioctl measureVerity: no hash found")
+ }
+
+ // The first part of VerityDigest is the metadata.
+ if _, err := metadata.CopyIn(t, verityDigest); err != nil {
+ return 0, err
+ }
+ if metadata.DigestSize < uint16(len(fd.d.hash)) {
+ return 0, syserror.EOVERFLOW
+ }
+
+ // Populate the output digest size, since DigestSize is both input and
+ // output.
+ metadata.DigestSize = uint16(len(fd.d.hash))
+
+ // First copy the metadata.
+ if _, err := metadata.CopyOut(t, verityDigest); err != nil {
+ return 0, err
+ }
+
+ // Now copy the root hash bytes to the memory after metadata.
+ _, err := t.CopyOutBytes(hostarch.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash)
+ return 0, err
+}
+
+func (fd *fileDescription) verityFlags(ctx context.Context, flags hostarch.Addr) (uintptr, error) {
+ f := int32(0)
+
+ fd.d.hashMu.RLock()
+ // All enabled files should store a hash. This flag is not settable via
+ // FS_IOC_SETFLAGS.
+ if len(fd.d.hash) != 0 {
+ f |= linux.FS_VERITY_FL
+ }
+ fd.d.hashMu.RUnlock()
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ return 0, syserror.EINVAL
+ }
+ _, err := primitive.CopyInt32Out(t, flags, f)
+ return 0, err
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch cmd := args[1].Uint(); cmd {
+ case linux.FS_IOC_ENABLE_VERITY:
+ return fd.enableVerity(ctx)
+ case linux.FS_IOC_MEASURE_VERITY:
+ return fd.measureVerity(ctx, args[2].Pointer())
+ case linux.FS_IOC_GETFLAGS:
+ return fd.verityFlags(ctx, args[2].Pointer())
+ default:
+ // TODO(b/169682228): Investigate which ioctl commands should
+ // be allowed.
+ return 0, syserror.ENOSYS
+ }
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Implement Read with PRead by setting offset.
+ fd.mu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // No need to verify if the file is not enabled yet in
+ // allowRuntimeEnable mode.
+ if !fd.d.verityEnabled() {
+ return fd.lowerFD.PRead(ctx, dst, offset, opts)
+ }
+
+ fd.d.fs.verityMu.RLock()
+ defer fd.d.fs.verityMu.RUnlock()
+ // dataSize is the size of the whole file.
+ dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the xattr does not exist, it
+ // indicates unexpected modifications to the file system.
+ if err == syserror.ENODATA {
+ return 0, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ size, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+ }
+
+ dataReader := FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+
+ merkleReader := FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+
+ fd.d.hashMu.RLock()
+ n, err := merkletree.Verify(&merkletree.VerifyParams{
+ Out: dst.Writer(ctx),
+ File: &dataReader,
+ Tree: &merkleReader,
+ Size: int64(size),
+ Name: fd.d.name,
+ Mode: fd.d.mode,
+ UID: fd.d.uid,
+ GID: fd.d.gid,
+ Children: fd.d.childrenNames,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
+ ReadOffset: offset,
+ ReadSize: dst.NumBytes(),
+ Expected: fd.d.hash,
+ DataAndTreeInSameFile: false,
+ })
+ fd.d.hashMu.RUnlock()
+ if err != nil {
+ return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
+ }
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EROFS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EROFS
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ if err := fd.lowerFD.ConfigureMMap(ctx, opts); err != nil {
+ return err
+ }
+ fd.lowerMappable = opts.Mappable
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef(ctx)
+ opts.MappingIdentity = nil
+ }
+
+ // Check if mmap is allowed on the lower filesystem.
+ if !opts.SentryOwnedContent {
+ return syserror.ENODEV
+ }
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts)
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error {
+ return fd.lowerFD.LockBSD(ctx, ownerPID, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *fileDescription) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+ return fd.lowerFD.UnlockBSD(ctx)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error {
+ return fd.lowerFD.LockPOSIX(ctx, uid, ownerPID, t, r, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
+ return fd.lowerFD.UnlockPOSIX(ctx, uid, r)
+}
+
+// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX.
+func (fd *fileDescription) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) {
+ return fd.lowerFD.TestPOSIX(ctx, uid, t, r)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (fd *fileDescription) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
+ ts, err := fd.lowerMappable.Translate(ctx, required, optional, at)
+ if err != nil {
+ return ts, err
+ }
+
+ // dataSize is the size of the whole file.
+ dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the xattr does not exist, it
+ // indicates unexpected modifications to the file system.
+ if err == syserror.ENODATA {
+ return ts, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+ }
+ if err != nil {
+ return ts, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ size, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return ts, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+ }
+
+ merkleReader := FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+
+ for _, t := range ts {
+ // Content integrity relies on sentry owning the backing data. MapInternal is guaranteed
+ // to fetch sentry owned memory because we disallow verity mmaps otherwise.
+ ims, err := t.File.MapInternal(memmap.FileRange{t.Offset, t.Offset + t.Source.Length()}, hostarch.Read)
+ if err != nil {
+ return nil, err
+ }
+ dataReader := mmapReadSeeker{ims, t.Source.Start}
+ var buf bytes.Buffer
+ _, err = merkletree.Verify(&merkletree.VerifyParams{
+ Out: &buf,
+ File: &dataReader,
+ Tree: &merkleReader,
+ Size: int64(size),
+ Name: fd.d.name,
+ Mode: fd.d.mode,
+ UID: fd.d.uid,
+ GID: fd.d.gid,
+ HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
+ ReadOffset: int64(t.Source.Start),
+ ReadSize: int64(t.Source.Length()),
+ Expected: fd.d.hash,
+ DataAndTreeInSameFile: false,
+ })
+ if err != nil {
+ return ts, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
+ }
+ }
+ return ts, err
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (fd *fileDescription) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
+ return fd.lowerMappable.AddMapping(ctx, ms, ar, offset, writable)
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (fd *fileDescription) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
+ fd.lowerMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (fd *fileDescription) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
+ return fd.lowerMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (fd *fileDescription) InvalidateUnsavable(context.Context) error {
+ return nil
+}
+
+// mmapReadSeeker is a helper struct used by fileDescription.Translate to pass
+// a safemem.BlockSeq pointing to the mapped region as io.ReaderAt.
+type mmapReadSeeker struct {
+ safemem.BlockSeq
+ Offset uint64
+}
+
+// ReadAt implements io.ReaderAt.ReadAt. off is the offset into the mapped file.
+func (r *mmapReadSeeker) ReadAt(p []byte, off int64) (int, error) {
+ bs := r.BlockSeq
+ // Adjust the offset into the mapped file to get the offset into the internally
+ // mapped region.
+ readOffset := off - int64(r.Offset)
+ if readOffset < 0 {
+ return 0, syserror.EINVAL
+ }
+ bs.DropFirst64(uint64(readOffset))
+ view := bs.TakeFirst64(uint64(len(p)))
+ dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+ n, err := safemem.CopySeq(dst, view)
+ return int(n), err
+}
+
+// FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as
+// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
+type FileReadWriteSeeker struct {
+ FD *vfs.FileDescription
+ Ctx context.Context
+ ROpts vfs.ReadOptions
+ WOpts vfs.WriteOptions
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
+ return int(n), err
+}
+
+// Read implements io.ReadWriteSeeker.Read.
+func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
+ return int(n), err
+}
+
+// Seek implements io.ReadWriteSeeker.Seek.
+func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+ return f.FD.Seek(f.Ctx, offset, int32(whence))
+}
+
+// WriteAt implements io.WriterAt.WriteAt.
+func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts)
+ return int(n), err
+}
+
+// Write implements io.ReadWriteSeeker.Write.
+func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
+ buf := usermem.BytesIOSequence(p)
+ n, err := f.FD.Write(f.Ctx, buf, f.WOpts)
+ return int(n), err
+}
diff --git a/pkg/sentry/fsimpl/verity/verity_state_autogen.go b/pkg/sentry/fsimpl/verity/verity_state_autogen.go
new file mode 100644
index 000000000..47bea46c9
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity_state_autogen.go
@@ -0,0 +1,237 @@
+// automatically generated by stateify.
+
+package verity
+
+import (
+ "gvisor.dev/gvisor/pkg/state"
+)
+
+func (fstype *FilesystemType) StateTypeName() string {
+ return "pkg/sentry/fsimpl/verity.FilesystemType"
+}
+
+func (fstype *FilesystemType) StateFields() []string {
+ return []string{}
+}
+
+func (fstype *FilesystemType) beforeSave() {}
+
+// +checklocksignore
+func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) {
+ fstype.beforeSave()
+}
+
+func (fstype *FilesystemType) afterLoad() {}
+
+// +checklocksignore
+func (fstype *FilesystemType) StateLoad(stateSourceObject state.Source) {
+}
+
+func (fs *filesystem) StateTypeName() string {
+ return "pkg/sentry/fsimpl/verity.filesystem"
+}
+
+func (fs *filesystem) StateFields() []string {
+ return []string{
+ "vfsfs",
+ "creds",
+ "allowRuntimeEnable",
+ "lowerMount",
+ "rootDentry",
+ "alg",
+ }
+}
+
+func (fs *filesystem) beforeSave() {}
+
+// +checklocksignore
+func (fs *filesystem) StateSave(stateSinkObject state.Sink) {
+ fs.beforeSave()
+ stateSinkObject.Save(0, &fs.vfsfs)
+ stateSinkObject.Save(1, &fs.creds)
+ stateSinkObject.Save(2, &fs.allowRuntimeEnable)
+ stateSinkObject.Save(3, &fs.lowerMount)
+ stateSinkObject.Save(4, &fs.rootDentry)
+ stateSinkObject.Save(5, &fs.alg)
+}
+
+func (fs *filesystem) afterLoad() {}
+
+// +checklocksignore
+func (fs *filesystem) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &fs.vfsfs)
+ stateSourceObject.Load(1, &fs.creds)
+ stateSourceObject.Load(2, &fs.allowRuntimeEnable)
+ stateSourceObject.Load(3, &fs.lowerMount)
+ stateSourceObject.Load(4, &fs.rootDentry)
+ stateSourceObject.Load(5, &fs.alg)
+}
+
+func (i *InternalFilesystemOptions) StateTypeName() string {
+ return "pkg/sentry/fsimpl/verity.InternalFilesystemOptions"
+}
+
+func (i *InternalFilesystemOptions) StateFields() []string {
+ return []string{
+ "RootMerkleFileName",
+ "LowerName",
+ "Alg",
+ "RootHash",
+ "AllowRuntimeEnable",
+ "LowerGetFSOptions",
+ "Action",
+ }
+}
+
+func (i *InternalFilesystemOptions) beforeSave() {}
+
+// +checklocksignore
+func (i *InternalFilesystemOptions) StateSave(stateSinkObject state.Sink) {
+ i.beforeSave()
+ stateSinkObject.Save(0, &i.RootMerkleFileName)
+ stateSinkObject.Save(1, &i.LowerName)
+ stateSinkObject.Save(2, &i.Alg)
+ stateSinkObject.Save(3, &i.RootHash)
+ stateSinkObject.Save(4, &i.AllowRuntimeEnable)
+ stateSinkObject.Save(5, &i.LowerGetFSOptions)
+ stateSinkObject.Save(6, &i.Action)
+}
+
+func (i *InternalFilesystemOptions) afterLoad() {}
+
+// +checklocksignore
+func (i *InternalFilesystemOptions) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &i.RootMerkleFileName)
+ stateSourceObject.Load(1, &i.LowerName)
+ stateSourceObject.Load(2, &i.Alg)
+ stateSourceObject.Load(3, &i.RootHash)
+ stateSourceObject.Load(4, &i.AllowRuntimeEnable)
+ stateSourceObject.Load(5, &i.LowerGetFSOptions)
+ stateSourceObject.Load(6, &i.Action)
+}
+
+func (d *dentry) StateTypeName() string {
+ return "pkg/sentry/fsimpl/verity.dentry"
+}
+
+func (d *dentry) StateFields() []string {
+ return []string{
+ "vfsd",
+ "refs",
+ "fs",
+ "mode",
+ "uid",
+ "gid",
+ "size",
+ "parent",
+ "name",
+ "children",
+ "childrenNames",
+ "lowerVD",
+ "lowerMerkleVD",
+ "symlinkTarget",
+ "hash",
+ }
+}
+
+func (d *dentry) beforeSave() {}
+
+// +checklocksignore
+func (d *dentry) StateSave(stateSinkObject state.Sink) {
+ d.beforeSave()
+ stateSinkObject.Save(0, &d.vfsd)
+ stateSinkObject.Save(1, &d.refs)
+ stateSinkObject.Save(2, &d.fs)
+ stateSinkObject.Save(3, &d.mode)
+ stateSinkObject.Save(4, &d.uid)
+ stateSinkObject.Save(5, &d.gid)
+ stateSinkObject.Save(6, &d.size)
+ stateSinkObject.Save(7, &d.parent)
+ stateSinkObject.Save(8, &d.name)
+ stateSinkObject.Save(9, &d.children)
+ stateSinkObject.Save(10, &d.childrenNames)
+ stateSinkObject.Save(11, &d.lowerVD)
+ stateSinkObject.Save(12, &d.lowerMerkleVD)
+ stateSinkObject.Save(13, &d.symlinkTarget)
+ stateSinkObject.Save(14, &d.hash)
+}
+
+// +checklocksignore
+func (d *dentry) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &d.vfsd)
+ stateSourceObject.Load(1, &d.refs)
+ stateSourceObject.Load(2, &d.fs)
+ stateSourceObject.Load(3, &d.mode)
+ stateSourceObject.Load(4, &d.uid)
+ stateSourceObject.Load(5, &d.gid)
+ stateSourceObject.Load(6, &d.size)
+ stateSourceObject.Load(7, &d.parent)
+ stateSourceObject.Load(8, &d.name)
+ stateSourceObject.Load(9, &d.children)
+ stateSourceObject.Load(10, &d.childrenNames)
+ stateSourceObject.Load(11, &d.lowerVD)
+ stateSourceObject.Load(12, &d.lowerMerkleVD)
+ stateSourceObject.Load(13, &d.symlinkTarget)
+ stateSourceObject.Load(14, &d.hash)
+ stateSourceObject.AfterLoad(d.afterLoad)
+}
+
+func (fd *fileDescription) StateTypeName() string {
+ return "pkg/sentry/fsimpl/verity.fileDescription"
+}
+
+func (fd *fileDescription) StateFields() []string {
+ return []string{
+ "vfsfd",
+ "FileDescriptionDefaultImpl",
+ "d",
+ "isDir",
+ "lowerFD",
+ "lowerMappable",
+ "merkleReader",
+ "merkleWriter",
+ "parentMerkleWriter",
+ "off",
+ }
+}
+
+func (fd *fileDescription) beforeSave() {}
+
+// +checklocksignore
+func (fd *fileDescription) StateSave(stateSinkObject state.Sink) {
+ fd.beforeSave()
+ stateSinkObject.Save(0, &fd.vfsfd)
+ stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl)
+ stateSinkObject.Save(2, &fd.d)
+ stateSinkObject.Save(3, &fd.isDir)
+ stateSinkObject.Save(4, &fd.lowerFD)
+ stateSinkObject.Save(5, &fd.lowerMappable)
+ stateSinkObject.Save(6, &fd.merkleReader)
+ stateSinkObject.Save(7, &fd.merkleWriter)
+ stateSinkObject.Save(8, &fd.parentMerkleWriter)
+ stateSinkObject.Save(9, &fd.off)
+}
+
+func (fd *fileDescription) afterLoad() {}
+
+// +checklocksignore
+func (fd *fileDescription) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &fd.vfsfd)
+ stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl)
+ stateSourceObject.Load(2, &fd.d)
+ stateSourceObject.Load(3, &fd.isDir)
+ stateSourceObject.Load(4, &fd.lowerFD)
+ stateSourceObject.Load(5, &fd.lowerMappable)
+ stateSourceObject.Load(6, &fd.merkleReader)
+ stateSourceObject.Load(7, &fd.merkleWriter)
+ stateSourceObject.Load(8, &fd.parentMerkleWriter)
+ stateSourceObject.Load(9, &fd.off)
+}
+
+func init() {
+ state.Register((*FilesystemType)(nil))
+ state.Register((*filesystem)(nil))
+ state.Register((*InternalFilesystemOptions)(nil))
+ state.Register((*dentry)(nil))
+ state.Register((*fileDescription)(nil))
+}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 9b3dacf46..5ef8cc039 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,6 +16,7 @@ package boot
import (
"fmt"
+ "path"
"sort"
"strings"
@@ -37,6 +38,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/verity"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -83,6 +85,10 @@ func registerFilesystems(k *kernel.Kernel) error {
AllowUserMount: true,
AllowUserList: true,
})
+ vfsObj.MustRegisterFilesystemType(verity.Name, &verity.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ AllowUserMount: false,
+ })
// Setup files in devtmpfs.
if err := memdev.Register(vfsObj); err != nil {
@@ -472,6 +478,12 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
var data []string
var iopts interface{}
+ verityOpts, verityRequested, remainingMOpts, err := parseVerityMountOptions(m.Options)
+ if err != nil {
+ return "", nil, false, err
+ }
+ m.Options = remainingMOpts
+
// Find filesystem name and FS specific data field.
switch m.Type {
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
@@ -530,9 +542,73 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
}
}
+ if verityRequested {
+ verityOpts.RootMerkleFileName = path.Base(m.Mount.Destination)
+ verityOpts.LowerName = fsName
+ verityOpts.LowerGetFSOptions = opts.GetFilesystemOptions
+ fsName = verity.Name
+ opts = &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ Data: strings.Join(data, ","),
+ InternalData: verityOpts,
+ },
+ InternalMount: true,
+ }
+ }
+
return fsName, opts, useOverlay, nil
}
+func parseKeyValue(s string) (string, string, bool) {
+ tokens := strings.SplitN(s, "=", 2)
+ if len(tokens) < 2 {
+ return "", "", false
+ }
+ return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
+}
+
+// parseAndFilterOptions scans the provided mount options for verity-related
+// mount options. It returns the parsed set of verity mount options, as well as
+// the filtered set of mount options unrelated to verity.
+func parseVerityMountOptions(mopts []string) (verity.InternalFilesystemOptions, bool, []string, error) {
+ nonVerity := []string{}
+ found := false
+ verityOpts := verity.InternalFilesystemOptions{
+ Action: verity.PanicOnViolation,
+ }
+ for _, o := range mopts {
+ if !strings.HasPrefix(o, "verity.") {
+ nonVerity = append(nonVerity, o)
+ continue
+ }
+
+ k, v, ok := parseKeyValue(o)
+ if !ok {
+ return verityOpts, found, nonVerity, fmt.Errorf("invalid verity mount option with no value: %q", o)
+ }
+
+ found = true
+ switch k {
+ case "verity.roothash":
+ verityOpts.RootHash = []byte(v)
+ case "verity.action":
+ switch v {
+ case "error":
+ verityOpts.Action = verity.ErrorOnViolation
+ case "panic":
+ verityOpts.Action = verity.PanicOnViolation
+ default:
+ log.Warningf("Invalid verity action %q", v)
+ verityOpts.Action = verity.PanicOnViolation
+ }
+ default:
+ return verityOpts, found, nonVerity, fmt.Errorf("unknown verity mount option: %q", k)
+ }
+ }
+ verityOpts.AllowRuntimeEnable = len(verityOpts.RootHash) == 0
+ return verityOpts, found, nonVerity, nil
+}
+
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
diff --git a/runsc/cli/main.go b/runsc/cli/main.go
index a3c515f4b..6db6614cc 100644
--- a/runsc/cli/main.go
+++ b/runsc/cli/main.go
@@ -86,6 +86,7 @@ func Main(version string) {
subcommands.Register(new(cmd.Symbolize), "")
subcommands.Register(new(cmd.Wait), "")
subcommands.Register(new(cmd.Mitigate), "")
+ subcommands.Register(new(cmd.VerityPrepare), "")
// Register internal commands with the internal group name. This causes
// them to be sorted below the user-facing commands with empty group.
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 455c57692..5485db149 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -126,9 +126,8 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
Hostname: hostname,
}
- specutils.LogSpec(spec)
-
cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
+
if conf.Network == config.NetworkNone {
addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
@@ -154,55 +153,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
}
}
- out, err := json.Marshal(spec)
- if err != nil {
- return Errorf("Error to marshal spec: %v", err)
- }
- tmpDir, err := ioutil.TempDir("", "runsc-do")
- if err != nil {
- return Errorf("Error to create tmp dir: %v", err)
- }
- defer os.RemoveAll(tmpDir)
-
- log.Infof("Changing configuration RootDir to %q", tmpDir)
- conf.RootDir = tmpDir
-
- cfgPath := filepath.Join(tmpDir, "config.json")
- if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil {
- return Errorf("Error write spec: %v", err)
- }
-
- containerArgs := container.Args{
- ID: cid,
- Spec: spec,
- BundleDir: tmpDir,
- Attached: true,
- }
- ct, err := container.New(conf, containerArgs)
- if err != nil {
- return Errorf("creating container: %v", err)
- }
- defer ct.Destroy()
-
- if err := ct.Start(conf); err != nil {
- return Errorf("starting container: %v", err)
- }
-
- // Forward signals to init in the container. Thus if we get SIGINT from
- // ^C, the container gracefully exit, and we can clean up.
- //
- // N.B. There is a still a window before this where a signal may kill
- // this process, skipping cleanup.
- stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
- defer stopForwarding()
-
- ws, err := ct.Wait()
- if err != nil {
- return Errorf("waiting for container: %v", err)
- }
-
- *waitStatus = ws
- return subcommands.ExitSuccess
+ return startContainerAndWait(spec, conf, cid, waitStatus)
}
func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) {
@@ -397,3 +348,58 @@ func calculatePeerIP(ip string) (string, error) {
}
return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil
}
+
+func startContainerAndWait(spec *specs.Spec, conf *config.Config, cid string, waitStatus *unix.WaitStatus) subcommands.ExitStatus {
+ specutils.LogSpec(spec)
+
+ out, err := json.Marshal(spec)
+ if err != nil {
+ return Errorf("Error to marshal spec: %v", err)
+ }
+ tmpDir, err := ioutil.TempDir("", "runsc-do")
+ if err != nil {
+ return Errorf("Error to create tmp dir: %v", err)
+ }
+ defer os.RemoveAll(tmpDir)
+
+ log.Infof("Changing configuration RootDir to %q", tmpDir)
+ conf.RootDir = tmpDir
+
+ cfgPath := filepath.Join(tmpDir, "config.json")
+ if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil {
+ return Errorf("Error write spec: %v", err)
+ }
+
+ containerArgs := container.Args{
+ ID: cid,
+ Spec: spec,
+ BundleDir: tmpDir,
+ Attached: true,
+ }
+
+ ct, err := container.New(conf, containerArgs)
+ if err != nil {
+ return Errorf("creating container: %v", err)
+ }
+ defer ct.Destroy()
+
+ if err := ct.Start(conf); err != nil {
+ return Errorf("starting container: %v", err)
+ }
+
+ // Forward signals to init in the container. Thus if we get SIGINT from
+ // ^C, the container gracefully exit, and we can clean up.
+ //
+ // N.B. There is a still a window before this where a signal may kill
+ // this process, skipping cleanup.
+ stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
+ defer stopForwarding()
+
+ ws, err := ct.Wait()
+ if err != nil {
+ return Errorf("waiting for container: %v", err)
+ }
+
+ *waitStatus = ws
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/verity_prepare.go b/runsc/cmd/verity_prepare.go
new file mode 100644
index 000000000..2197cd3f8
--- /dev/null
+++ b/runsc/cmd/verity_prepare.go
@@ -0,0 +1,106 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+ "math/rand"
+ "os"
+
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/runsc/config"
+ "gvisor.dev/gvisor/runsc/flag"
+ "gvisor.dev/gvisor/runsc/specutils"
+)
+
+// VerityPrepare implements subcommands.Commands for the "verity-prepare"
+// command. It sets up a sandbox with a writable verity mount mapped to "--dir",
+// and executes the verity measure tool specified by "--tool" in the sandbox. It
+// is intended to prepare --dir to be mounted as a verity filesystem.
+type VerityPrepare struct {
+ root string
+ tool string
+ dir string
+}
+
+// Name implements subcommands.Command.Name.
+func (*VerityPrepare) Name() string {
+ return "verity-prepare"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*VerityPrepare) Synopsis() string {
+ return "Generates the data structures necessary to enable verityfs on a filesystem."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*VerityPrepare) Usage() string {
+ return "verity-prepare --tool=<measure_tool> --dir=<path>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (c *VerityPrepare) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
+ f.StringVar(&c.tool, "tool", "", "path to the verity measure_tool")
+ f.StringVar(&c.dir, "dir", "", "path to the directory to be hashed")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ conf := args[0].(*config.Config)
+ waitStatus := args[1].(*unix.WaitStatus)
+
+ hostname, err := os.Hostname()
+ if err != nil {
+ return Errorf("Error to retrieve hostname: %v", err)
+ }
+
+ // Map the entire host file system.
+ absRoot, err := resolvePath(c.root)
+ if err != nil {
+ return Errorf("Error resolving root: %v", err)
+ }
+
+ spec := &specs.Spec{
+ Root: &specs.Root{
+ Path: absRoot,
+ },
+ Process: &specs.Process{
+ Cwd: absRoot,
+ Args: []string{c.tool, "--path", "/verityroot"},
+ Env: os.Environ(),
+ Capabilities: specutils.AllCapabilities(),
+ },
+ Hostname: hostname,
+ Mounts: []specs.Mount{
+ specs.Mount{
+ Source: c.dir,
+ Destination: "/verityroot",
+ Type: "bind",
+ Options: []string{"verity.roothash="},
+ },
+ },
+ }
+
+ cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
+
+ // Force no networking, it is not necessary to run the verity measure tool.
+ conf.Network = config.NetworkNone
+
+ return startContainerAndWait(spec, conf, cid, waitStatus)
+}
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index b62504a8c..9ecd0fde6 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -18,6 +18,7 @@ import (
"fmt"
"math/bits"
"path"
+ "strings"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
@@ -64,6 +65,12 @@ var optionsMap = map[string]mapping{
"sync": {set: true, val: unix.MS_SYNCHRONOUS},
}
+// verityMountOptions is the set of valid verity mount option keys.
+var verityMountOptions = map[string]struct{}{
+ "verity.roothash": struct{}{},
+ "verity.action": struct{}{},
+}
+
// propOptionsMap is similar to optionsMap, but it lists propagation options
// that cannot be used together with other flags.
var propOptionsMap = map[string]mapping{
@@ -117,6 +124,14 @@ func validateMount(mnt *specs.Mount) error {
return nil
}
+func moptKey(opt string) string {
+ if len(opt) == 0 {
+ return opt
+ }
+ // Guaranteed to have at least one token, since opt is not empty.
+ return strings.SplitN(opt, "=", 2)[0]
+}
+
// ValidateMountOptions validates that mount options are correct.
func ValidateMountOptions(opts []string) error {
for _, o := range opts {
@@ -125,7 +140,8 @@ func ValidateMountOptions(opts []string) error {
}
_, ok1 := optionsMap[o]
_, ok2 := propOptionsMap[o]
- if !ok1 && !ok2 {
+ _, ok3 := verityMountOptions[moptKey(o)]
+ if !ok1 && !ok2 && !ok3 {
return fmt.Errorf("unknown mount option %q", o)
}
if err := validatePropagation(o); err != nil {