diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/merkletree/merkletree.go | 534 | ||||
-rw-r--r-- | pkg/merkletree/merkletree_state_autogen.go | 3 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/verity/filesystem.go | 1109 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/verity/save_restore.go | 27 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/verity/verity.go | 1402 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/verity/verity_state_autogen.go | 237 |
6 files changed, 3312 insertions, 0 deletions
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go new file mode 100644 index 000000000..961bd4dcf --- /dev/null +++ b/pkg/merkletree/merkletree.go @@ -0,0 +1,534 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package merkletree implements Merkle tree generating and verification. +package merkletree + +import ( + "bytes" + "crypto/sha256" + "crypto/sha512" + "encoding/gob" + "fmt" + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + + "gvisor.dev/gvisor/pkg/hostarch" +) + +const ( + // sha256DigestSize specifies the digest size of a SHA256 hash. + sha256DigestSize = 32 + // sha512DigestSize specifies the digest size of a SHA512 hash. + sha512DigestSize = 64 +) + +// DigestSize returns the size (in bytes) of a digest. +// TODO(b/156980949): Allow config SHA384. +func DigestSize(hashAlgorithm int) int { + switch hashAlgorithm { + case linux.FS_VERITY_HASH_ALG_SHA256: + return sha256DigestSize + case linux.FS_VERITY_HASH_ALG_SHA512: + return sha512DigestSize + default: + return -1 + } +} + +// Layout defines the scale of a Merkle tree. +type Layout struct { + // blockSize is the size of a data block to be hashed. + blockSize int64 + // digestSize is the size of a generated hash. + digestSize int64 + // levelOffset contains the offset of the beginning of each level in + // bytes. The number of levels in the tree is the length of the slice. + // The leaf nodes (level 0) contain hashes of blocks of the input data. + // Each level N contains hashes of the blocks in level N-1. The highest + // level is the root hash. + levelOffset []int64 +} + +// InitLayout initializes and returns a new Layout object describing the structure +// of a tree. dataSize specifies the size of input data in bytes. +func InitLayout(dataSize int64, hashAlgorithms int, dataAndTreeInSameFile bool) (Layout, error) { + layout := Layout{ + blockSize: hostarch.PageSize, + } + + // TODO(b/156980949): Allow config SHA384. + switch hashAlgorithms { + case linux.FS_VERITY_HASH_ALG_SHA256: + layout.digestSize = sha256DigestSize + case linux.FS_VERITY_HASH_ALG_SHA512: + layout.digestSize = sha512DigestSize + default: + return Layout{}, fmt.Errorf("unexpected hash algorithms") + } + + // treeStart is the offset (in bytes) of the first level of the tree in + // the file. If data and tree are in different files, treeStart should + // be zero. If data is in the same file as the tree, treeStart points + // to the block after the last data block (which may be zero-padded). + var treeStart int64 + if dataAndTreeInSameFile { + treeStart = dataSize + if dataSize%layout.blockSize != 0 { + treeStart += layout.blockSize - dataSize%layout.blockSize + } + } + + numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize + level := 0 + offset := int64(0) + + // Calculate the number of levels in the Merkle tree and the beginning + // offset of each level. Level 0 consists of the leaf nodes that + // contain the hashes of the data blocks, while level numLevels - 1 is + // the root. + for numBlocks > 1 { + layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize) + // Round numBlocks up to fill up a block. + numBlocks += (layout.hashesPerBlock() - numBlocks%layout.hashesPerBlock()) % layout.hashesPerBlock() + offset += numBlocks / layout.hashesPerBlock() + numBlocks = numBlocks / layout.hashesPerBlock() + level++ + } + layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize) + + return layout, nil +} + +// hashesPerBlock() returns the number of digests in each block. For example, +// if blockSize is 4096 bytes, and digestSize is 32 bytes, there will be 128 +// hashesPerBlock. Therefore 128 hashes in one level will be combined in one +// hash in the level above. +func (layout Layout) hashesPerBlock() int64 { + return layout.blockSize / layout.digestSize +} + +// numLevels returns the total number of levels in the Merkle tree. +func (layout Layout) numLevels() int { + return len(layout.levelOffset) +} + +// rootLevel returns the level of the root hash. +func (layout Layout) rootLevel() int { + return layout.numLevels() - 1 +} + +// digestOffset finds the offset of a digest from the beginning of the tree. +// The target digest is at level of the tree, with index from the beginning of +// the current level. +func (layout Layout) digestOffset(level int, index int64) int64 { + return layout.levelOffset[level] + index*layout.digestSize +} + +// blockOffset finds the offset of a block from the beginning of the tree. The +// target block is at level of the tree, with index from the beginning of the +// current level. +func (layout Layout) blockOffset(level int, index int64) int64 { + return layout.levelOffset[level] + index*layout.blockSize +} + +// VerityDescriptor is a struct that is serialized and hashed to get a file's +// root hash, which contains the root hash of the raw content and the file's +// meatadata. +type VerityDescriptor struct { + Name string + FileSize int64 + Mode uint32 + UID uint32 + GID uint32 + Children map[string]struct{} + SymlinkTarget string + RootHash []byte +} + +func (d *VerityDescriptor) String() string { + b := new(bytes.Buffer) + e := gob.NewEncoder(b) + e.Encode(d.Children) + return fmt.Sprintf("Name: %s, Size: %d, Mode: %d, UID: %d, GID: %d, Children: %v, Symlink: %s, RootHash: %v", d.Name, d.FileSize, d.Mode, d.UID, d.GID, b.Bytes(), d.SymlinkTarget, d.RootHash) +} + +// verify generates a hash from d, and compares it with expected. +func (d *VerityDescriptor) verify(expected []byte, hashAlgorithms int) error { + h, err := hashData([]byte(d.String()), hashAlgorithms) + if err != nil { + return err + } + if !bytes.Equal(h[:], expected) { + return fmt.Errorf("unexpected root hash") + } + return nil + +} + +// hashData hashes data and returns the result hash based on the hash +// algorithms. +func hashData(data []byte, hashAlgorithms int) ([]byte, error) { + var digest []byte + switch hashAlgorithms { + case linux.FS_VERITY_HASH_ALG_SHA256: + digestArray := sha256.Sum256(data) + digest = digestArray[:] + case linux.FS_VERITY_HASH_ALG_SHA512: + digestArray := sha512.Sum512(data) + digest = digestArray[:] + default: + return nil, fmt.Errorf("unexpected hash algorithms") + } + return digest, nil +} + +// GenerateParams contains the parameters used to generate a Merkle tree for a +// given file. +type GenerateParams struct { + // File is a reader of the file to be hashed. + File io.ReaderAt + // Size is the size of the file. + Size int64 + // Name is the name of the target file. + Name string + // Mode is the mode of the target file. + Mode uint32 + // UID is the user ID of the target file. + UID uint32 + // GID is the group ID of the target file. + GID uint32 + // Children is a map of children names for a directory. It should be + // empty for a regular file. + Children map[string]struct{} + // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink. + SymlinkTarget string + // HashAlgorithms is the algorithms used to hash data. + HashAlgorithms int + // TreeReader is a reader for the Merkle tree. + TreeReader io.ReaderAt + // TreeWriter is a writer for the Merkle tree. + TreeWriter io.Writer + // DataAndTreeInSameFile is true if data and Merkle tree are in the same + // file, or false if Merkle tree is a separate file from data. + DataAndTreeInSameFile bool +} + +// Generate constructs a Merkle tree for the contents of params.File. The +// output is written to params.TreeWriter. +// +// Generate returns a hash of a VerityDescriptor, which contains the file +// metadata and the hash from file content. +func Generate(params *GenerateParams) ([]byte, error) { + descriptor := VerityDescriptor{ + FileSize: params.Size, + Name: params.Name, + Mode: params.Mode, + UID: params.UID, + GID: params.GID, + SymlinkTarget: params.SymlinkTarget, + } + + // If file is a symlink do not generate root hash for file content. + if params.SymlinkTarget != "" { + return hashData([]byte(descriptor.String()), params.HashAlgorithms) + } + + layout, err := InitLayout(params.Size, params.HashAlgorithms, params.DataAndTreeInSameFile) + if err != nil { + return nil, err + } + + numBlocks := (params.Size + layout.blockSize - 1) / layout.blockSize + + // If the data is in the same file as the tree, zero pad the last data + // block. + bytesInLastBlock := params.Size % layout.blockSize + if params.DataAndTreeInSameFile && bytesInLastBlock != 0 { + zeroBuf := make([]byte, layout.blockSize-bytesInLastBlock) + if _, err := params.TreeWriter.Write(zeroBuf); err != nil { + return nil, err + } + } + + var root []byte + for level := 0; level < layout.numLevels(); level++ { + for i := int64(0); i < numBlocks; i++ { + buf := make([]byte, layout.blockSize) + var ( + n int + err error + ) + if level == 0 { + // Read data block from the target file since level 0 includes hashes + // of blocks in the input data. + n, err = params.File.ReadAt(buf, i*layout.blockSize) + } else { + // Read data block from the tree file since levels higher than 0 are + // hashing the lower level hashes. + n, err = params.TreeReader.ReadAt(buf, layout.blockOffset(level-1, i)) + } + + // err is populated as long as the bytes read is smaller than the buffer + // size. This could be the case if we are reading the last block, and + // break in that case. If this is the last block, the end of the block + // will be zero-padded. + if n == 0 && err == io.EOF { + break + } else if err != nil && err != io.EOF { + return nil, err + } + // Hash the bytes in buf. + digest, err := hashData(buf, params.HashAlgorithms) + if err != nil { + return nil, err + } + + if level == layout.rootLevel() { + root = digest + } + + // Write the generated hash to the end of the tree file. + if _, err = params.TreeWriter.Write(digest[:]); err != nil { + return nil, err + } + } + // If the generated digests do not round up to a block, zero-padding the + // remaining of the last block. But no need to do so for root. + if level != layout.rootLevel() && numBlocks%layout.hashesPerBlock() != 0 { + zeroBuf := make([]byte, layout.blockSize-(numBlocks%layout.hashesPerBlock())*layout.digestSize) + if _, err := params.TreeWriter.Write(zeroBuf[:]); err != nil { + return nil, err + } + } + numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock() + } + descriptor.RootHash = root + return hashData([]byte(descriptor.String()), params.HashAlgorithms) +} + +// VerifyParams contains the params used to verify a portion of a file against +// a Merkle tree. +type VerifyParams struct { + // Out will be filled with verified data. + Out io.Writer + // File is a handler on the file to be verified. + File io.ReaderAt + // tree is a handler on the Merkle tree used to verify file. + Tree io.ReaderAt + // Size is the size of the file. + Size int64 + // Name is the name of the target file. + Name string + // Mode is the mode of the target file. + Mode uint32 + // UID is the user ID of the target file. + UID uint32 + // GID is the group ID of the target file. + GID uint32 + // Children is a map of children names for a directory. It should be + // empty for a regular file. + Children map[string]struct{} + // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink. + SymlinkTarget string + // HashAlgorithms is the algorithms used to hash data. + HashAlgorithms int + // ReadOffset is the offset of the data range to be verified. + ReadOffset int64 + // ReadSize is the size of the data range to be verified. + ReadSize int64 + // Expected is a trusted hash for the file. It is compared with the + // calculated root hash to verify the content. + Expected []byte + // DataAndTreeInSameFile is true if data and Merkle tree are in the same + // file, or false if Merkle tree is a separate file from data. + DataAndTreeInSameFile bool +} + +// verifyMetadata verifies the metadata by hashing a descriptor that contains +// the metadata and compare the generated hash with expected. +// +// For verifyMetadata, params.data is not needed. It only accesses params.tree +// for the raw root hash. +func verifyMetadata(params *VerifyParams, layout *Layout) error { + var root []byte + // Only read the root hash if we expect that the file is not a symlink and its + // Merkle tree file is non-empty. + if params.Size != 0 && params.SymlinkTarget == "" { + root = make([]byte, layout.digestSize) + if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil { + return fmt.Errorf("failed to read root hash: %w", err) + } + } + descriptor := VerityDescriptor{ + Name: params.Name, + FileSize: params.Size, + Mode: params.Mode, + UID: params.UID, + GID: params.GID, + Children: params.Children, + SymlinkTarget: params.SymlinkTarget, + RootHash: root, + } + return descriptor.verify(params.Expected, params.HashAlgorithms) +} + +// Verify verifies the content read from data with offset. The content is +// verified against tree. If content spans across multiple blocks, each block is +// verified. Verification fails if the hash of the data does not match the tree +// at any level, or if the final root hash does not match expected. +// Once the data is verified, it will be written using params.Out. +// +// Verify checks for both target file content and metadata. If readSize is 0, +// only metadata is checked. +func Verify(params *VerifyParams) (int64, error) { + if params.ReadSize < 0 { + return 0, fmt.Errorf("unexpected read size: %d", params.ReadSize) + } + layout, err := InitLayout(int64(params.Size), params.HashAlgorithms, params.DataAndTreeInSameFile) + if err != nil { + return 0, err + } + if params.ReadSize == 0 { + return 0, verifyMetadata(params, &layout) + } + + // Calculate the index of blocks that includes the target range in input + // data. + firstDataBlock := params.ReadOffset / layout.blockSize + lastDataBlock := (params.ReadOffset + params.ReadSize - 1) / layout.blockSize + + buf := make([]byte, layout.blockSize) + var readErr error + total := int64(0) + for i := firstDataBlock; i <= lastDataBlock; i++ { + // Read a block that includes all or part of target range in + // input data. + bytesRead, err := params.File.ReadAt(buf, i*layout.blockSize) + readErr = err + // If at the end of input data and all previous blocks are + // verified, return the verified input data and EOF. + if readErr == io.EOF && bytesRead == 0 { + break + } + if readErr != nil && readErr != io.EOF { + return 0, fmt.Errorf("read from data failed: %w", err) + } + // If this is the end of file, zero the remaining bytes in buf, + // otherwise they are still from the previous block. + // TODO(b/162908070): Investigate possible issues with zero + // padding the data. + if bytesRead < len(buf) { + for j := bytesRead; j < len(buf); j++ { + buf[j] = 0 + } + } + descriptor := VerityDescriptor{ + Name: params.Name, + FileSize: params.Size, + Mode: params.Mode, + UID: params.UID, + GID: params.GID, + SymlinkTarget: params.SymlinkTarget, + Children: params.Children, + } + if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.HashAlgorithms, params.Expected); err != nil { + return 0, err + } + + // startOff is the beginning of the read range within the + // current data block. Note that for all blocks other than the + // first, startOff should be 0. + startOff := int64(0) + if i == firstDataBlock { + startOff = params.ReadOffset % layout.blockSize + } + // endOff is the end of the read range within the current data + // block. Note that for all blocks other than the last, endOff + // should be the block size. + endOff := layout.blockSize + if i == lastDataBlock { + endOff = (params.ReadOffset+params.ReadSize-1)%layout.blockSize + 1 + } + // If the provided size exceeds the end of input data, we should + // only copy the parts in buf that's part of input data. + if startOff > int64(bytesRead) { + startOff = int64(bytesRead) + } + if endOff > int64(bytesRead) { + endOff = int64(bytesRead) + } + n, err := params.Out.Write(buf[startOff:endOff]) + if err != nil { + return total, err + } + total += int64(n) + + } + return total, readErr +} + +// verifyBlock verifies a block against tree. index is the number of block in +// original data. The block is verified through each level of the tree. It +// fails if the calculated hash from block is different from any level of +// hashes stored in tree. And the final root hash is compared with +// expected. +func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, hashAlgorithms int, expected []byte) error { + if len(dataBlock) != int(layout.blockSize) { + return fmt.Errorf("incorrect block size") + } + + expectedDigest := make([]byte, layout.digestSize) + treeBlock := make([]byte, layout.blockSize) + var digest []byte + for level := 0; level < layout.numLevels(); level++ { + // Calculate hash. + if level == 0 { + h, err := hashData(dataBlock, hashAlgorithms) + if err != nil { + return err + } + digest = h + } else { + // Read a block in previous level that contains the + // hash we just generated, and generate a next level + // hash from it. + if _, err := tree.ReadAt(treeBlock, layout.blockOffset(level-1, blockIndex)); err != nil { + return err + } + h, err := hashData(treeBlock, hashAlgorithms) + if err != nil { + return err + } + digest = h + } + + // Read the digest for the current block and store in + // expectedDigest. + if _, err := tree.ReadAt(expectedDigest, layout.digestOffset(level, blockIndex)); err != nil { + return err + } + + if !bytes.Equal(digest, expectedDigest) { + return fmt.Errorf("verification failed") + } + blockIndex = blockIndex / layout.hashesPerBlock() + } + + // Verification for the tree succeeded. Now hash the descriptor with + // the root hash and compare it with expected. + descriptor.RootHash = digest + return descriptor.verify(expected, hashAlgorithms) +} diff --git a/pkg/merkletree/merkletree_state_autogen.go b/pkg/merkletree/merkletree_state_autogen.go new file mode 100644 index 000000000..e5670cde4 --- /dev/null +++ b/pkg/merkletree/merkletree_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package merkletree diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go new file mode 100644 index 000000000..6cb1a23e0 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -0,0 +1,1109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "strconv" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/merkletree" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All files should be read-only. + return nil +} + +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls +// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may have a reference count of zero, and which therefore +// should be dropped once traversal is complete, are appended to ds. +// +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + child, err := fs.getChildLocked(ctx, d, name, ds) + if err != nil { + return nil, err + } + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { + return nil, err + } + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// verifyChildLocked verifies the hash of child against the already verified +// hash of the parent to ensure the child is expected. verifyChild triggers a +// sentry panic if unexpected modifications to the file system are detected. In +// ErrorOnViolation mode it returns a syserror instead. +// +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// +// TODO(b/166474175): Investigate all possible errors returned in this +// function, and make sure we differentiate all errors that indicate unexpected +// modifications to the file system from the ones that are not harmful. +func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { + vfsObj := fs.vfsfs.VirtualFilesystem() + + // Get the path to the child dentry. This is only used to provide path + // information in failure case. + childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD) + if err != nil { + return nil, err + } + + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() + // Read the offset of the child from the extended attributes of the + // corresponding Merkle tree file. + // This is the offset of the hash for child in its parent's Merkle tree + // file. + off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.lowerMerkleVD, + Start: child.lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: merkleOffsetInParentXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the file or the xattr does not + // exist, it indicates unexpected modifications to the file system. + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err)) + } + if err != nil { + return nil, err + } + // The offset xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + offset, err := strconv.Atoi(off) + if err != nil { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err)) + } + + // Open parent Merkle tree file to read and verify child's hash. + parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerMerkleVD, + Start: parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + + // The parent Merkle tree file should have been created. If it's + // missing, it indicates an unexpected modification to the file system. + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err)) + } + if err != nil { + return nil, err + } + + // dataSize is the size of raw data for the Merkle tree. For a file, + // dataSize is the size of the whole file. For a directory, dataSize is + // the size of all its children's hashes. + dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the file or the xattr does not + // exist, it indicates unexpected modifications to the file system. + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err)) + } + if err != nil { + return nil, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + parentSize, err := strconv.Atoi(dataSize) + if err != nil { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) + } + + fdReader := FileReadWriteSeeker{ + FD: parentMerkleFD, + Ctx: ctx, + } + + parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + }, &vfs.StatOptions{}) + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err)) + } + if err != nil { + return nil, err + } + + // Since we are verifying against a directory Merkle tree, buf should + // contain the hash of the children in the parent Merkle tree when + // Verify returns with success. + var buf bytes.Buffer + parent.hashMu.RLock() + _, err = merkletree.Verify(&merkletree.VerifyParams{ + Out: &buf, + File: &fdReader, + Tree: &fdReader, + Size: int64(parentSize), + Name: parent.name, + Mode: uint32(parentStat.Mode), + UID: parentStat.UID, + GID: parentStat.GID, + Children: parent.childrenNames, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: fs.alg.toLinuxHashAlg(), + ReadOffset: int64(offset), + ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())), + Expected: parent.hash, + DataAndTreeInSameFile: true, + }) + parent.hashMu.RUnlock() + if err != nil && err != io.EOF { + return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err)) + } + + // Cache child hash when it's verified the first time. + child.hashMu.Lock() + if len(child.hash) == 0 { + child.hash = buf.Bytes() + } + child.hashMu.Unlock() + return child, nil +} + +// verifyStatAndChildrenLocked verifies the stat and children names against the +// verified hash. The mode/uid/gid and childrenNames of the file is cached +// after verified. +// +// Preconditions: d.dirMu must be locked. +func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + + // Get the path to the child dentry. This is only used to provide path + // information in failure case. + childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return err + } + + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() + + fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err == syserror.ENOENT { + return alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err)) + } + if err != nil { + return err + } + + merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err)) + } + if err != nil { + return err + } + + size, err := strconv.Atoi(merkleSize) + if err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) + } + + if d.isDir() && len(d.childrenNames) == 0 { + childrenOffString, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: childrenOffsetXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenOffsetXattr, childPath, err)) + } + if err != nil { + return err + } + childrenOffset, err := strconv.Atoi(childrenOffString) + if err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err)) + } + + childrenSizeString, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: childrenSizeXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenSizeXattr, childPath, err)) + } + if err != nil { + return err + } + childrenSize, err := strconv.Atoi(childrenSizeString) + if err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err)) + } + + childrenNames := make([]byte, childrenSize) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(childrenOffset), vfs.ReadOptions{}); err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to read children map for %s: %v", childPath, err)) + } + + if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames of %s: %v", childPath, err)) + } + } + + fdReader := FileReadWriteSeeker{ + FD: fd, + Ctx: ctx, + } + + var buf bytes.Buffer + d.hashMu.RLock() + params := &merkletree.VerifyParams{ + Out: &buf, + Tree: &fdReader, + Size: int64(size), + Name: d.name, + Mode: uint32(stat.Mode), + UID: stat.UID, + GID: stat.GID, + Children: d.childrenNames, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: fs.alg.toLinuxHashAlg(), + ReadOffset: 0, + // Set read size to 0 so only the metadata is verified. + ReadSize: 0, + Expected: d.hash, + DataAndTreeInSameFile: false, + } + d.hashMu.RUnlock() + if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR { + params.DataAndTreeInSameFile = true + } + + if d.isSymlink() { + target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }) + if err != nil { + return err + } + params.SymlinkTarget = target + } + + if _, err := merkletree.Verify(params); err != nil && err != io.EOF { + return alertIntegrityViolation(fmt.Sprintf("Verification stat for %s failed: %v", childPath, err)) + } + d.mode = uint32(stat.Mode) + d.uid = stat.UID + d.gid = stat.GID + d.size = uint32(size) + d.symlinkTarget = params.SymlinkTarget + return nil +} + +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { + if child, ok := parent.children[name]; ok { + // If verity is enabled on child, we should check again whether + // the file and the corresponding Merkle tree are as expected, + // in order to catch deletion/renaming after the last time it's + // accessed. + if child.verityEnabled() { + vfsObj := fs.vfsfs.VirtualFilesystem() + // Get the path to the child dentry. This is only used + // to provide path information in failure case. + path, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD) + if err != nil { + return nil, err + } + + childVD, err := parent.getLowerAt(ctx, vfsObj, name) + if err == syserror.ENOENT { + // The file was previously accessed. If the + // file does not exist now, it indicates an + // unexpected modification to the file system. + return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", path)) + } + if err != nil { + return nil, err + } + defer childVD.DecRef(ctx) + + childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + // The Merkle tree file was previous accessed. If it + // does not exist now, it indicates an unexpected + // modification to the file system. + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path)) + } + if err != nil { + return nil, err + } + + defer childMerkleVD.DecRef(ctx) + } + + // If enabling verification on files/directories is not allowed + // during runtime, all cached children are already verified. If + // runtime enable is allowed and the parent directory is + // enabled, we should verify the child hash here because it may + // be cached before enabled. + if fs.allowRuntimeEnable { + if parent.verityEnabled() { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { + return nil, err + } + } + if child.verityEnabled() { + vfsObj := fs.vfsfs.VirtualFilesystem() + mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.lowerVD, + Start: child.lowerVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + return nil, err + } + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { + return nil, err + } + } + } + return child, nil + } + child, err := fs.lookupAndVerifyLocked(ctx, parent, name) + if err != nil { + return nil, err + } + if parent.children == nil { + parent.children = make(map[string]*dentry) + } + parent.children[name] = child + // child's refcount is initially 0, so it may be dropped after traversal. + *ds = appendDentry(*ds, child) + return child, nil +} + +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { + vfsObj := fs.vfsfs.VirtualFilesystem() + + if parent.verityEnabled() { + if _, ok := parent.childrenNames[name]; !ok { + return nil, syserror.ENOENT + } + } + + parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD) + if err != nil { + return nil, err + } + + childVD, err := parent.getLowerAt(ctx, vfsObj, name) + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("file %s expected but not found", parentPath+"/"+name)) + } + if err != nil { + return nil, err + } + + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + defer childVD.DecRef(ctx) + + childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + if err == syserror.ENOENT { + if !fs.allowRuntimeEnable { + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath+"/"+name)) + } + childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + Path: fspath.Parse(merklePrefix + name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, + }) + if err != nil { + return nil, err + } + childMerkleFD.DecRef(ctx) + childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + if err != nil { + return nil, err + } + } + if err != nil && err != syserror.ENOENT { + return nil, err + } + + // Clear the Merkle tree file if they are to be generated at runtime. + // TODO(b/182315468): Optimize the Merkle tree generate process to + // allow only updating certain files/directories. + if fs.allowRuntimeEnable { + childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childMerkleVD, + Start: childMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_TRUNC, + Mode: 0644, + }) + if err != nil { + return nil, err + } + childMerkleFD.DecRef(ctx) + } + + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + defer childMerkleVD.DecRef(ctx) + + mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + return nil, err + } + + child := fs.newDentry() + child.lowerVD = childVD + child.lowerMerkleVD = childMerkleVD + + // Increase the reference for both childVD and childMerkleVD as they are + // held by child. If this function fails and the child is destroyed, the + // references will be decreased in destroyLocked. + childVD.IncRef() + childMerkleVD.IncRef() + + parent.IncRef() + child.parent = parent + child.name = name + + child.mode = uint32(stat.Mode) + child.uid = stat.UID + child.gid = stat.GID + child.childrenNames = make(map[string]struct{}) + + // Verify child hash. This should always be performed unless in + // allowRuntimeEnable mode and the parent directory hasn't been enabled + // yet. + if parent.verityEnabled() { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { + child.destroyLocked(ctx) + return nil, err + } + } + if child.verityEnabled() { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { + child.destroyLocked(ctx) + return nil, err + } + } + + return child, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + // Verity file system is read-only. + if ats&vfs.MayWrite != 0 { + return syserror.EROFS + } + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Verity fs is read-only. + if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 { + return nil, syserror.EROFS + } + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + + start := rp.Start().Impl().(*dentry) + if rp.Done() { + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + + // Open existing child or follow symlink. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds) + parent.dirMu.Unlock() + if err != nil { + return nil, err + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + // Users should not open the Merkle tree files. Those are for verity fs + // use only. + if strings.Contains(d.name, merklePrefix) { + return nil, syserror.EPERM + } + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + + // Verity fs is read-only. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EROFS + } + + // Get the path to the target file. This is only used to provide path + // information in failure case. + path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return nil, err + } + + // Open the file in the underlying file system. + lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, opts) + + // The file should exist, as we succeeded in finding its dentry. If it's + // missing, it indicates an unexpected modification to the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path)) + } + return nil, err + } + + // lowerFD needs to be cleaned up if any error occurs. IncRef will be + // called if a verity FD is successfully created. + defer lowerFD.DecRef(ctx) + + // Open the Merkle tree file corresponding to the current file/directory + // to be used later for verifying Read/Walk. + merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + + // The Merkle tree file should exist, as we succeeded in finding its + // dentry. If it's missing, it indicates an unexpected modification to + // the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + + // merkleReader needs to be cleaned up if any error occurs. IncRef will + // be called if a verity FD is successfully created. + defer merkleReader.DecRef(ctx) + + lowerFlags := lowerFD.StatusFlags() + lowerFDOpts := lowerFD.Options() + var merkleWriter *vfs.FileDescription + var parentMerkleWriter *vfs.FileDescription + + // Only open the Merkle tree files for write if in allowRuntimeEnable + // mode. + if d.fs.allowRuntimeEnable { + merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + // merkleWriter is cleaned up if any error occurs. IncRef will + // be called if a verity FD is created successfully. + defer merkleWriter.DecRef(ctx) + + if d.parent != nil { + parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.parent.lowerMerkleVD, + Start: d.parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) + } + return nil, err + } + // parentMerkleWriter is cleaned up if any error occurs. IncRef + // will be called if a verity FD is created successfully. + defer parentMerkleWriter.DecRef(ctx) + } + } + + fd := &fileDescription{ + d: d, + lowerFD: lowerFD, + merkleReader: merkleReader, + merkleWriter: merkleWriter, + parentMerkleWriter: parentMerkleWriter, + isDir: d.isDir(), + } + + if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + return nil, err + } + lowerFD.IncRef() + merkleReader.IncRef() + if merkleWriter != nil { + merkleWriter.IncRef() + } + if parentMerkleWriter != nil { + parentMerkleWriter.IncRef() + } + return &fd.vfsfd, err +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + return d.readlink(ctx) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should +// be verified. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + + var stat linux.Statx + stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, &opts) + if err != nil { + return linux.Statx{}, err + } + d.dirMu.Lock() + if d.verityEnabled() { + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { + return linux.Statx{}, err + } + } + d.dirMu.Unlock() + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + // TODO(b/159261227): Implement StatFSAt. + return linux.Statfs{}, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + lowerVD := d.lowerVD + return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, size) +} + +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + lowerVD := d.lowerVD + return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &opts) +} + +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/fsimpl/verity/save_restore.go b/pkg/sentry/fsimpl/verity/save_restore.go new file mode 100644 index 000000000..46b064342 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/save_restore.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/refsvfs2" +) + +func (d *dentry) afterLoad() { + if atomic.LoadInt64(&d.refs) != -1 { + refsvfs2.Register(d) + } +} diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go new file mode 100644 index 000000000..2f97324a8 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -0,0 +1,1402 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package verity provides a filesystem implementation that is a wrapper of +// another file system. +// The verity file system provides integrity check for the underlying file +// system by providing verification for path traversals and each read. +// The verity file system is read-only, except for one case: when +// allowRuntimeEnable is true, additional Merkle files can be generated using +// the FS_IOC_ENABLE_VERITY ioctl. +// +// Lock order: +// +// filesystem.renameMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. +package verity + +import ( + "bytes" + "encoding/json" + "fmt" + "math" + "strconv" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/merkletree" + "gvisor.dev/gvisor/pkg/refsvfs2" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + // Name is the default filesystem name. + Name = "verity" + + // merklePrefix is the prefix of the Merkle tree files. For example, the Merkle + // tree file for "/foo" is "/.merkle.verity.foo". + merklePrefix = ".merkle.verity." + + // merkleRootPrefix is the prefix of the Merkle tree root file. This + // needs to be different from merklePrefix to avoid name collision. + merkleRootPrefix = ".merkleroot.verity." + + // merkleOffsetInParentXattr is the extended attribute name specifying the + // offset of the child hash in its parent's Merkle tree. + merkleOffsetInParentXattr = "user.merkle.offset" + + // merkleSizeXattr is the extended attribute name specifying the size of data + // hashed by the corresponding Merkle tree. For a regular file, this is the + // file size. For a directory, this is the size of all its children's hashes. + merkleSizeXattr = "user.merkle.size" + + // childrenOffsetXattr is the extended attribute name specifying the + // names of the offset of the serialized children names in the Merkle + // tree file. + childrenOffsetXattr = "user.merkle.childrenOffset" + + // childrenSizeXattr is the extended attribute name specifying the size + // of the serialized children names. + childrenSizeXattr = "user.merkle.childrenSize" + + // sizeOfStringInt32 is the size for a 32 bit integer stored as string in + // extended attributes. The maximum value of a 32 bit integer has 10 digits. + sizeOfStringInt32 = 10 +) + +var ( + // action specifies the action towards detected violation. + action ViolationAction + + // verityMu synchronizes concurrent operations that enable verity and perform + // verification checks. + verityMu sync.RWMutex +) + +// HashAlgorithm is a type specifying the algorithm used to hash the file +// content. +type HashAlgorithm int + +// ViolationAction is a type specifying the action when an integrity violation +// is detected. +type ViolationAction int + +const ( + // PanicOnViolation terminates the sentry on detected violation. + PanicOnViolation ViolationAction = 0 + // ErrorOnViolation returns an error from the violating system call on + // detected violation. + ErrorOnViolation = 1 +) + +// Currently supported hashing algorithms include SHA256 and SHA512. +const ( + SHA256 HashAlgorithm = iota + SHA512 +) + +func (alg HashAlgorithm) toLinuxHashAlg() int { + switch alg { + case SHA256: + return linux.FS_VERITY_HASH_ALG_SHA256 + case SHA512: + return linux.FS_VERITY_HASH_ALG_SHA512 + default: + return 0 + } +} + +// FilesystemType implements vfs.FilesystemType. +// +// +stateify savable +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +// +// +stateify savable +type filesystem struct { + vfsfs vfs.Filesystem + + // creds is a copy of the filesystem's creator's credentials, which are + // used for accesses to the underlying file system. creds is immutable. + creds *auth.Credentials + + // allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY + // to build Merkle trees in the verity file system is allowed. If this + // is false, no new Merkle trees can be built, and only the files that + // had Merkle trees before startup (e.g. from a host filesystem mounted + // with gofer fs) can be verified. + allowRuntimeEnable bool + + // lowerMount is the underlying file system mount. + lowerMount *vfs.Mount + + // rootDentry is the mount root Dentry for this file system, which + // stores the root hash of the whole file system in bytes. + rootDentry *dentry + + // alg is the algorithms used to hash the files in the verity file + // system. + alg HashAlgorithm + + // renameMu synchronizes renaming with non-renaming operations in order + // to ensure consistent lock ordering between dentry.dirMu in different + // dentries. + renameMu sync.RWMutex `state:"nosave"` + + // verityMu synchronizes enabling verity files, protects files or + // directories from being enabled by different threads simultaneously. + // It also ensures that verity does not access files that are being + // enabled. + // + // Also, the directory Merkle trees depends on the generated trees of + // its children. So they shouldn't be enabled the same time. This lock + // is for the whole file system to ensure that no more than one file is + // enabled the same time. + verityMu sync.RWMutex `state:"nosave"` +} + +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +// +// +stateify savable +type InternalFilesystemOptions struct { + // RootMerkleFileName is the name of the verity root Merkle tree file. + RootMerkleFileName string + + // LowerName is the name of the filesystem wrapped by verity fs. + LowerName string + + // Alg is the algorithms used to hash the files in the verity file + // system. + Alg HashAlgorithm + + // RootHash is the root hash of the overall verity file system. + RootHash []byte + + // AllowRuntimeEnable specifies whether the verity file system allows + // enabling verification for files (i.e. building Merkle trees) during + // runtime. + AllowRuntimeEnable bool + + // LowerGetFSOptions is the file system option for the lower layer file + // system wrapped by verity file system. + LowerGetFSOptions vfs.GetFilesystemOptions + + // Action specifies the action on an integrity violation. + Action ViolationAction +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// alertIntegrityViolation alerts a violation of integrity, which usually means +// unexpected modification to the file system is detected. In ErrorOnViolation +// mode, it returns EIO, otherwise it panic. +func alertIntegrityViolation(msg string) error { + if action == ErrorOnViolation { + return syserror.EIO + } + panic(msg) +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + iopts, ok := opts.InternalData.(InternalFilesystemOptions) + if !ok { + ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs") + return nil, nil, syserror.EINVAL + } + action = iopts.Action + + // Mount the lower file system. The lower file system is wrapped inside + // verity, and should not be exposed or connected. + mopts := &vfs.MountOptions{ + GetFilesystemOptions: iopts.LowerGetFSOptions, + InternalMount: true, + } + mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts) + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + creds: creds.Fork(), + alg: iopts.Alg, + lowerMount: mnt, + allowRuntimeEnable: iopts.AllowRuntimeEnable, + } + fs.vfsfs.Init(vfsObj, &fstype, fs) + + // Construct the root dentry. + d := fs.newDentry() + d.refs = 1 + lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root()) + lowerVD.IncRef() + d.lowerVD = lowerVD + + rootMerkleName := merkleRootPrefix + iopts.RootMerkleFileName + + lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + Path: fspath.Parse(rootMerkleName), + }, &vfs.GetDentryOptions{}) + + // If runtime enable is allowed, the root merkle tree may be absent. We + // should create the tree file. + if err == syserror.ENOENT && fs.allowRuntimeEnable { + lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + Path: fspath.Parse(rootMerkleName), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, + }) + if err != nil { + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, err + } + lowerMerkleFD.DecRef(ctx) + lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + Path: fspath.Parse(rootMerkleName), + }, &vfs.GetDentryOptions{}) + if err != nil { + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, err + } + } else if err != nil { + // Failed to get dentry for the root Merkle file. This + // indicates an unexpected modification that removed/renamed + // the root Merkle file, or it's never generated. + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, alertIntegrityViolation("Failed to find root Merkle file") + } + + // Clear the Merkle tree file if they are to be generated at runtime. + // TODO(b/182315468): Optimize the Merkle tree generate process to + // allow only updating certain files/directories. + if fs.allowRuntimeEnable { + lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_TRUNC, + Mode: 0644, + }) + if err != nil { + return nil, nil, err + } + lowerMerkleFD.DecRef(ctx) + } + + d.lowerMerkleVD = lowerMerkleVD + + // Get metadata from the underlying file system. + const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &vfs.StatOptions{ + Mask: statMask, + }) + if err != nil { + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, err + } + + d.mode = uint32(stat.Mode) + d.uid = stat.UID + d.gid = stat.GID + d.hash = make([]byte, len(iopts.RootHash)) + d.childrenNames = make(map[string]struct{}) + + if !d.isDir() { + ctx.Warningf("verity root must be a directory") + return nil, nil, syserror.EINVAL + } + + if !fs.allowRuntimeEnable { + // Get children names from the underlying file system. + offString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: childrenOffsetXattr, + Size: sizeOfStringInt32, + }) + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenOffsetXattr, err)) + } + if err != nil { + return nil, nil, err + } + + off, err := strconv.Atoi(offString) + if err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err)) + } + + sizeString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: childrenSizeXattr, + Size: sizeOfStringInt32, + }) + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenSizeXattr, err)) + } + if err != nil { + return nil, nil, err + } + size, err := strconv.Atoi(sizeString) + if err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err)) + } + + lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err == syserror.ENOENT { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to open root Merkle file: %v", err)) + } + if err != nil { + return nil, nil, err + } + + childrenNames := make([]byte, size) + if _, err := lowerMerkleFD.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(off), vfs.ReadOptions{}); err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to read root children map: %v", err)) + } + + if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err)) + } + + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { + return nil, nil, err + } + } + + d.hashMu.Lock() + copy(d.hash, iopts.RootHash) + d.hashMu.Unlock() + d.vfsd.Init(d) + + fs.rootDentry = d + + return &fs.vfsfs, &d.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release(ctx context.Context) { + fs.lowerMount.DecRef(ctx) +} + +// MountOptions implements vfs.FilesystemImpl.MountOptions. +func (fs *filesystem) MountOptions() string { + return "" +} + +// dentry implements vfs.DentryImpl. +// +// +stateify savable +type dentry struct { + vfsd vfs.Dentry + + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // mode, uid, gid and size are the file mode, owner, group, and size of + // the file in the underlying file system. They are set when a dentry + // is initialized, and never modified. + mode uint32 + uid uint32 + gid uint32 + size uint32 + + // parent is the dentry corresponding to this dentry's parent directory. + // name is this dentry's name in parent. If this dentry is a filesystem + // root, parent is nil and name is the empty string. parent and name are + // protected by fs.renameMu. + parent *dentry + name string + + // If this dentry represents a directory, children maps the names of + // children for which dentries have been instantiated to those dentries, + // and dirents (if not nil) is a cache of dirents as returned by + // directoryFDs representing this directory. children is protected by + // dirMu. + dirMu sync.Mutex `state:"nosave"` + children map[string]*dentry + + // childrenNames stores the name of all children of the dentry. This is + // used by verity to check whether a child is expected. This is only + // populated by enableVerity. childrenNames is also protected by dirMu. + childrenNames map[string]struct{} + + // lowerVD is the VirtualDentry in the underlying file system. It is + // never modified after initialized. + lowerVD vfs.VirtualDentry + + // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree + // in the underlying file system. It is never modified after + // initialized. + lowerMerkleVD vfs.VirtualDentry + + // symlinkTarget is the target path of a symlink file in the underlying filesystem. + symlinkTarget string + + // hash is the calculated hash for the current file or directory. hash + // is protected by hashMu. + hashMu sync.RWMutex `state:"nosave"` + hash []byte +} + +// newDentry creates a new dentry representing the given verity file. The +// dentry initially has no references; it is the caller's responsibility to set +// the dentry's reference count and/or call dentry.destroy() as appropriate. +// The dentry is initially invalid in that it contains no underlying dentry; +// the caller is responsible for setting them. +func (fs *filesystem) newDentry() *dentry { + d := &dentry{ + fs: fs, + } + d.vfsd.Init(d) + refsvfs2.Register(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + r := atomic.AddInt64(&d.refs, 1) + if d.LogRefs() { + refsvfs2.LogIncRef(d, r) + } +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + r := atomic.LoadInt64(&d.refs) + if r <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, r, r+1) { + if d.LogRefs() { + refsvfs2.LogTryIncRef(d, r+1) + } + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef(ctx context.Context) { + r := atomic.AddInt64(&d.refs, -1) + if d.LogRefs() { + refsvfs2.LogDecRef(d, r) + } + if r == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked(ctx) + d.fs.renameMu.Unlock() + } else if r < 0 { + panic("verity.dentry.DecRef() called without holding a reference") + } +} + +func (d *dentry) decRefLocked(ctx context.Context) { + r := atomic.AddInt64(&d.refs, -1) + if d.LogRefs() { + refsvfs2.LogDecRef(d, r) + } + if r == 0 { + d.checkDropLocked(ctx) + } else if r < 0 { + panic("verity.dentry.decRefLocked() called without holding a reference") + } +} + +// checkDropLocked should be called after d's reference count becomes 0 or it +// becomes deleted. +func (d *dentry) checkDropLocked(ctx context.Context) { + // Dentries with a positive reference count must be retained. Dentries + // with a negative reference count have already been destroyed. + if atomic.LoadInt64(&d.refs) != 0 { + return + } + // Refs is still zero; destroy it. + d.destroyLocked(ctx) + return +} + +// destroyLocked destroys the dentry. +// +// Preconditions: +// * d.fs.renameMu must be locked for writing. +// * d.refs == 0. +func (d *dentry) destroyLocked(ctx context.Context) { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("verity.dentry.destroyLocked() called on already destroyed dentry") + default: + panic("verity.dentry.destroyLocked() called with references on the dentry") + } + + if d.lowerVD.Ok() { + d.lowerVD.DecRef(ctx) + } + if d.lowerMerkleVD.Ok() { + d.lowerMerkleVD.DecRef(ctx) + } + if d.parent != nil { + d.parent.dirMu.Lock() + if !d.vfsd.IsDead() { + delete(d.parent.children, d.name) + } + d.parent.dirMu.Unlock() + d.parent.decRefLocked(ctx) + } + refsvfs2.Unregister(d) +} + +// RefType implements refsvfs2.CheckedObject.Type. +func (d *dentry) RefType() string { + return "verity.dentry" +} + +// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. +func (d *dentry) LeakMessage() string { + return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) +} + +// LogRefs implements refsvfs2.CheckedObject.LogRefs. +// +// This should only be set to true for debugging purposes, as it can generate an +// extremely large amount of output and drastically degrade performance. +func (d *dentry) LogRefs() bool { + return false +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { + //TODO(b/159261227): Implement InotifyWithParent. +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + //TODO(b/159261227): Implement Watches. + return nil +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +func (d *dentry) OnZeroWatches(context.Context) { + //TODO(b/159261227): Implement OnZeroWatches. +} + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) isDir() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// verityEnabled checks whether the file is enabled with verity features. It +// should always be true if runtime enable is not allowed. In runtime enable +// mode, it returns true if the target has been enabled with +// ioctl(FS_IOC_ENABLE_VERITY). +func (d *dentry) verityEnabled() bool { + d.hashMu.RLock() + defer d.hashMu.RUnlock() + return !d.fs.allowRuntimeEnable || len(d.hash) != 0 +} + +// getLowerAt returns the dentry in the underlying file system, which is +// represented by filename relative to d. +func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) { + return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(filename), + }, &vfs.GetDentryOptions{}) +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + if d.verityEnabled() { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, &vfs.StatOptions{}) + if err != nil { + return "", err + } + d.dirMu.Lock() + defer d.dirMu.Unlock() + if err := d.fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { + return "", err + } + return d.symlinkTarget, nil + } + + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }) +} + +// FileDescription implements vfs.FileDescriptionImpl for verity fds. +// FileDescription is a wrapper of the underlying lowerFD, with support to build +// Merkle trees through the Linux fs-verity API to verify contents read from +// lowerFD. +// +// +stateify savable +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + // d is the corresponding dentry to the fileDescription. + d *dentry + + // isDir specifies whehter the fileDescription points to a directory. + isDir bool + + // lowerFD is the FileDescription corresponding to the file in the + // underlying file system. + lowerFD *vfs.FileDescription + + // lowerMappable is the memmap.Mappable corresponding to this file in the + // underlying file system. + lowerMappable memmap.Mappable + + // merkleReader is the read-only FileDescription corresponding to the + // Merkle tree file in the underlying file system. + merkleReader *vfs.FileDescription + + // merkleWriter is the FileDescription corresponding to the Merkle tree + // file in the underlying file system for writing. This should only be + // used when allowRuntimeEnable is set to true. + merkleWriter *vfs.FileDescription + + // parentMerkleWriter is the FileDescription of the Merkle tree for the + // directory that contains the current file/directory. This is only used + // if allowRuntimeEnable is set to true. + parentMerkleWriter *vfs.FileDescription + + // off is the file offset. off is protected by mu. + mu sync.Mutex `state:"nosave"` + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fileDescription) Release(ctx context.Context) { + fd.lowerFD.DecRef(ctx) + fd.merkleReader.DecRef(ctx) + if fd.merkleWriter != nil { + fd.merkleWriter.DecRef(ctx) + } + if fd.parentMerkleWriter != nil { + fd.parentMerkleWriter.DecRef(ctx) + } +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + // TODO(b/162788573): Add integrity check for metadata. + stat, err := fd.lowerFD.Stat(ctx, opts) + if err != nil { + return linux.Statx{}, err + } + fd.d.dirMu.Lock() + if fd.d.verityEnabled() { + if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil { + return linux.Statx{}, err + } + } + fd.d.dirMu.Unlock() + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + // Verity files are read-only. + return syserror.EPERM +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + if !fd.d.isDir() { + return syserror.ENOTDIR + } + fd.mu.Lock() + defer fd.mu.Unlock() + + var ds []vfs.Dirent + err := fd.lowerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + // Do not include the Merkle tree files. + if strings.Contains(dirent.Name, merklePrefix) || strings.Contains(dirent.Name, merkleRootPrefix) { + return nil + } + if fd.d.verityEnabled() { + // Verify that the child is expected. + if dirent.Name != "." && dirent.Name != ".." { + if _, ok := fd.d.childrenNames[dirent.Name]; !ok { + return alertIntegrityViolation(fmt.Sprintf("Unexpected children %s", dirent.Name)) + } + } + } + ds = append(ds, dirent) + return nil + })) + + if err != nil { + return err + } + + // The result should contain all children plus "." and "..". + if fd.d.verityEnabled() && len(ds) != len(fd.d.childrenNames)+2 { + return alertIntegrityViolation(fmt.Sprintf("Unexpected children number %d", len(ds))) + } + + for fd.off < int64(len(ds)) { + if err := cb.Handle(ds[fd.off]); err != nil { + return err + } + fd.off++ + } + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + n := int64(0) + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + n = fd.off + case linux.SEEK_END: + n = int64(fd.d.size) + default: + return 0, syserror.EINVAL + } + if offset > math.MaxInt64-n { + return 0, syserror.EINVAL + } + offset += n + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a +// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The +// hash of the generated Merkle tree and the data size is returned. If fd +// points to a regular file, the data is the content of the file. If fd points +// to a directory, the data is all hashes of its children, written to the Merkle +// tree file. If fd represents a symlink, the data is empty and nothing is written +// to the Merkle tree file. +// +// Preconditions: fd.d.fs.verityMu must be locked. +func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) { + fdReader := FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + merkleReader := FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + merkleWriter := FileReadWriteSeeker{ + FD: fd.merkleWriter, + Ctx: ctx, + } + + stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + + params := &merkletree.GenerateParams{ + TreeReader: &merkleReader, + TreeWriter: &merkleWriter, + Children: fd.d.childrenNames, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), + Name: fd.d.name, + Mode: uint32(stat.Mode), + UID: stat.UID, + GID: stat.GID, + } + + switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT { + case linux.S_IFREG: + // For a regular file, generate a Merkle tree based on its + // content. + params.File = &fdReader + params.Size = int64(stat.Size) + params.DataAndTreeInSameFile = false + case linux.S_IFDIR: + // For a directory, generate a Merkle tree based on the hashes + // of its children that has already been written to the Merkle + // tree file. + merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + + params.Size = int64(merkleStat.Size) + params.File = &merkleReader + params.DataAndTreeInSameFile = true + case linux.S_IFLNK: + // For a symlink, generate a Merkle tree file but do not write the root hash + // of the target file content to it. Return a hash of a VerityDescriptor object + // which includes the symlink target name. + target, err := fd.d.readlink(ctx) + if err != nil { + return nil, 0, err + } + + params.Size = int64(stat.Size) + params.DataAndTreeInSameFile = false + params.SymlinkTarget = target + default: + // TODO(b/167728857): Investigate whether and how we should + // enable other types of file. + return nil, 0, syserror.EINVAL + } + hash, err := merkletree.Generate(params) + return hash, uint64(params.Size), err +} + +// recordChildrenLocked writes the names of fd's children into the +// corresponding Merkle tree file, and saves the offset/size of the map into +// xattrs. +// +// Preconditions: +// * fd.d.fs.verityMu must be locked. +// * fd.d.isDir() == true. +func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error { + // Record the children names in the Merkle tree file. + childrenNames, err := json.Marshal(fd.d.childrenNames) + if err != nil { + return err + } + + stat, err := fd.merkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return err + } + + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: childrenOffsetXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return err + } + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: childrenSizeXattr, + Value: strconv.Itoa(len(childrenNames)), + }); err != nil { + return err + } + + if _, err = fd.merkleWriter.Write(ctx, usermem.BytesIOSequence(childrenNames), vfs.WriteOptions{}); err != nil { + return err + } + + return nil +} + +// enableVerity enables verity features on fd by generating a Merkle tree file +// and stores its hash in its parent directory's Merkle tree. +func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { + if !fd.d.fs.allowRuntimeEnable { + return 0, syserror.EPERM + } + + fd.d.fs.verityMu.Lock() + defer fd.d.fs.verityMu.Unlock() + + // In allowRuntimeEnable mode, the underlying fd and read/write fd for + // the Merkle tree file should have all been initialized. For any file + // or directory other than the root, the parent Merkle tree file should + // have also been initialized. + if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) { + return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") + } + + hash, dataSize, err := fd.generateMerkleLocked(ctx) + if err != nil { + return 0, err + } + + if fd.parentMerkleWriter != nil { + stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + + // Write the hash of fd to the parent directory's Merkle tree + // file, as it should be part of the parent Merkle tree data. + // parentMerkleWriter is open with O_APPEND, so it should write + // directly to the end of the file. + if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil { + return 0, err + } + + // Record the offset of the hash of fd in parent directory's + // Merkle tree file. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleOffsetInParentXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return 0, err + } + + // Add the current child's name to parent's childrenNames. + fd.d.parent.childrenNames[fd.d.name] = struct{}{} + } + + // Record the size of the data being hashed for fd. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleSizeXattr, + Value: strconv.Itoa(int(dataSize)), + }); err != nil { + return 0, err + } + + if fd.d.isDir() { + if err := fd.recordChildrenLocked(ctx); err != nil { + return 0, err + } + } + fd.d.hashMu.Lock() + fd.d.hash = hash + fd.d.hashMu.Unlock() + return 0, nil +} + +// measureVerity returns the hash of fd, saved in verityDigest. +func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hostarch.Addr) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + return 0, syserror.EINVAL + } + var metadata linux.DigestMetadata + + fd.d.hashMu.RLock() + defer fd.d.hashMu.RUnlock() + + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that + // verity is not enabled for the file. If allowRuntimeEnable is false, + // this is an integrity violation because all files should have verity + // enabled, in which case fd.d.hash should be set. + if len(fd.d.hash) == 0 { + if fd.d.fs.allowRuntimeEnable { + return 0, syserror.ENODATA + } + return 0, alertIntegrityViolation("Ioctl measureVerity: no hash found") + } + + // The first part of VerityDigest is the metadata. + if _, err := metadata.CopyIn(t, verityDigest); err != nil { + return 0, err + } + if metadata.DigestSize < uint16(len(fd.d.hash)) { + return 0, syserror.EOVERFLOW + } + + // Populate the output digest size, since DigestSize is both input and + // output. + metadata.DigestSize = uint16(len(fd.d.hash)) + + // First copy the metadata. + if _, err := metadata.CopyOut(t, verityDigest); err != nil { + return 0, err + } + + // Now copy the root hash bytes to the memory after metadata. + _, err := t.CopyOutBytes(hostarch.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash) + return 0, err +} + +func (fd *fileDescription) verityFlags(ctx context.Context, flags hostarch.Addr) (uintptr, error) { + f := int32(0) + + fd.d.hashMu.RLock() + // All enabled files should store a hash. This flag is not settable via + // FS_IOC_SETFLAGS. + if len(fd.d.hash) != 0 { + f |= linux.FS_VERITY_FL + } + fd.d.hashMu.RUnlock() + + t := kernel.TaskFromContext(ctx) + if t == nil { + return 0, syserror.EINVAL + } + _, err := primitive.CopyInt32Out(t, flags, f) + return 0, err +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FS_IOC_ENABLE_VERITY: + return fd.enableVerity(ctx) + case linux.FS_IOC_MEASURE_VERITY: + return fd.measureVerity(ctx, args[2].Pointer()) + case linux.FS_IOC_GETFLAGS: + return fd.verityFlags(ctx, args[2].Pointer()) + default: + // TODO(b/169682228): Investigate which ioctl commands should + // be allowed. + return 0, syserror.ENOSYS + } +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Implement Read with PRead by setting offset. + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // No need to verify if the file is not enabled yet in + // allowRuntimeEnable mode. + if !fd.d.verityEnabled() { + return fd.lowerFD.PRead(ctx, dst, offset, opts) + } + + fd.d.fs.verityMu.RLock() + defer fd.d.fs.verityMu.RUnlock() + // dataSize is the size of the whole file. + dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the xattr does not exist, it + // indicates unexpected modifications to the file system. + if err == syserror.ENODATA { + return 0, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) + } + if err != nil { + return 0, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + size, err := strconv.Atoi(dataSize) + if err != nil { + return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err)) + } + + dataReader := FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + + merkleReader := FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + + fd.d.hashMu.RLock() + n, err := merkletree.Verify(&merkletree.VerifyParams{ + Out: dst.Writer(ctx), + File: &dataReader, + Tree: &merkleReader, + Size: int64(size), + Name: fd.d.name, + Mode: fd.d.mode, + UID: fd.d.uid, + GID: fd.d.gid, + Children: fd.d.childrenNames, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), + ReadOffset: offset, + ReadSize: dst.NumBytes(), + Expected: fd.d.hash, + DataAndTreeInSameFile: false, + }) + fd.d.hashMu.RUnlock() + if err != nil { + return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err)) + } + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EROFS +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EROFS +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if err := fd.lowerFD.ConfigureMMap(ctx, opts); err != nil { + return err + } + fd.lowerMappable = opts.Mappable + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef(ctx) + opts.MappingIdentity = nil + } + + // Check if mmap is allowed on the lower filesystem. + if !opts.SentryOwnedContent { + return syserror.ENODEV + } + return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { + return fd.lowerFD.LockBSD(ctx, ownerPID, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *fileDescription) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return fd.lowerFD.UnlockBSD(ctx) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + return fd.lowerFD.LockPOSIX(ctx, uid, ownerPID, t, r, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.lowerFD.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +func (fd *fileDescription) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fd.lowerFD.TestPOSIX(ctx, uid, t, r) +} + +// Translate implements memmap.Mappable.Translate. +func (fd *fileDescription) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { + ts, err := fd.lowerMappable.Translate(ctx, required, optional, at) + if err != nil { + return ts, err + } + + // dataSize is the size of the whole file. + dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the xattr does not exist, it + // indicates unexpected modifications to the file system. + if err == syserror.ENODATA { + return ts, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) + } + if err != nil { + return ts, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + size, err := strconv.Atoi(dataSize) + if err != nil { + return ts, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err)) + } + + merkleReader := FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + + for _, t := range ts { + // Content integrity relies on sentry owning the backing data. MapInternal is guaranteed + // to fetch sentry owned memory because we disallow verity mmaps otherwise. + ims, err := t.File.MapInternal(memmap.FileRange{t.Offset, t.Offset + t.Source.Length()}, hostarch.Read) + if err != nil { + return nil, err + } + dataReader := mmapReadSeeker{ims, t.Source.Start} + var buf bytes.Buffer + _, err = merkletree.Verify(&merkletree.VerifyParams{ + Out: &buf, + File: &dataReader, + Tree: &merkleReader, + Size: int64(size), + Name: fd.d.name, + Mode: fd.d.mode, + UID: fd.d.uid, + GID: fd.d.gid, + HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), + ReadOffset: int64(t.Source.Start), + ReadSize: int64(t.Source.Length()), + Expected: fd.d.hash, + DataAndTreeInSameFile: false, + }) + if err != nil { + return ts, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err)) + } + } + return ts, err +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (fd *fileDescription) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { + return fd.lowerMappable.AddMapping(ctx, ms, ar, offset, writable) +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (fd *fileDescription) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { + fd.lowerMappable.RemoveMapping(ctx, ms, ar, offset, writable) +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (fd *fileDescription) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { + return fd.lowerMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (fd *fileDescription) InvalidateUnsavable(context.Context) error { + return nil +} + +// mmapReadSeeker is a helper struct used by fileDescription.Translate to pass +// a safemem.BlockSeq pointing to the mapped region as io.ReaderAt. +type mmapReadSeeker struct { + safemem.BlockSeq + Offset uint64 +} + +// ReadAt implements io.ReaderAt.ReadAt. off is the offset into the mapped file. +func (r *mmapReadSeeker) ReadAt(p []byte, off int64) (int, error) { + bs := r.BlockSeq + // Adjust the offset into the mapped file to get the offset into the internally + // mapped region. + readOffset := off - int64(r.Offset) + if readOffset < 0 { + return 0, syserror.EINVAL + } + bs.DropFirst64(uint64(readOffset)) + view := bs.TakeFirst64(uint64(len(p))) + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p)) + n, err := safemem.CopySeq(dst, view) + return int(n), err +} + +// FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as +// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc. +type FileReadWriteSeeker struct { + FD *vfs.FileDescription + Ctx context.Context + ROpts vfs.ReadOptions + WOpts vfs.WriteOptions +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts) + return int(n), err +} + +// Read implements io.ReadWriteSeeker.Read. +func (f *FileReadWriteSeeker) Read(p []byte) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.Read(f.Ctx, dst, f.ROpts) + return int(n), err +} + +// Seek implements io.ReadWriteSeeker.Seek. +func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) { + return f.FD.Seek(f.Ctx, offset, int32(whence)) +} + +// WriteAt implements io.WriterAt.WriteAt. +func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts) + return int(n), err +} + +// Write implements io.ReadWriteSeeker.Write. +func (f *FileReadWriteSeeker) Write(p []byte) (int, error) { + buf := usermem.BytesIOSequence(p) + n, err := f.FD.Write(f.Ctx, buf, f.WOpts) + return int(n), err +} diff --git a/pkg/sentry/fsimpl/verity/verity_state_autogen.go b/pkg/sentry/fsimpl/verity/verity_state_autogen.go new file mode 100644 index 000000000..47bea46c9 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity_state_autogen.go @@ -0,0 +1,237 @@ +// automatically generated by stateify. + +package verity + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (fstype *FilesystemType) StateTypeName() string { + return "pkg/sentry/fsimpl/verity.FilesystemType" +} + +func (fstype *FilesystemType) StateFields() []string { + return []string{} +} + +func (fstype *FilesystemType) beforeSave() {} + +// +checklocksignore +func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) { + fstype.beforeSave() +} + +func (fstype *FilesystemType) afterLoad() {} + +// +checklocksignore +func (fstype *FilesystemType) StateLoad(stateSourceObject state.Source) { +} + +func (fs *filesystem) StateTypeName() string { + return "pkg/sentry/fsimpl/verity.filesystem" +} + +func (fs *filesystem) StateFields() []string { + return []string{ + "vfsfs", + "creds", + "allowRuntimeEnable", + "lowerMount", + "rootDentry", + "alg", + } +} + +func (fs *filesystem) beforeSave() {} + +// +checklocksignore +func (fs *filesystem) StateSave(stateSinkObject state.Sink) { + fs.beforeSave() + stateSinkObject.Save(0, &fs.vfsfs) + stateSinkObject.Save(1, &fs.creds) + stateSinkObject.Save(2, &fs.allowRuntimeEnable) + stateSinkObject.Save(3, &fs.lowerMount) + stateSinkObject.Save(4, &fs.rootDentry) + stateSinkObject.Save(5, &fs.alg) +} + +func (fs *filesystem) afterLoad() {} + +// +checklocksignore +func (fs *filesystem) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &fs.vfsfs) + stateSourceObject.Load(1, &fs.creds) + stateSourceObject.Load(2, &fs.allowRuntimeEnable) + stateSourceObject.Load(3, &fs.lowerMount) + stateSourceObject.Load(4, &fs.rootDentry) + stateSourceObject.Load(5, &fs.alg) +} + +func (i *InternalFilesystemOptions) StateTypeName() string { + return "pkg/sentry/fsimpl/verity.InternalFilesystemOptions" +} + +func (i *InternalFilesystemOptions) StateFields() []string { + return []string{ + "RootMerkleFileName", + "LowerName", + "Alg", + "RootHash", + "AllowRuntimeEnable", + "LowerGetFSOptions", + "Action", + } +} + +func (i *InternalFilesystemOptions) beforeSave() {} + +// +checklocksignore +func (i *InternalFilesystemOptions) StateSave(stateSinkObject state.Sink) { + i.beforeSave() + stateSinkObject.Save(0, &i.RootMerkleFileName) + stateSinkObject.Save(1, &i.LowerName) + stateSinkObject.Save(2, &i.Alg) + stateSinkObject.Save(3, &i.RootHash) + stateSinkObject.Save(4, &i.AllowRuntimeEnable) + stateSinkObject.Save(5, &i.LowerGetFSOptions) + stateSinkObject.Save(6, &i.Action) +} + +func (i *InternalFilesystemOptions) afterLoad() {} + +// +checklocksignore +func (i *InternalFilesystemOptions) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &i.RootMerkleFileName) + stateSourceObject.Load(1, &i.LowerName) + stateSourceObject.Load(2, &i.Alg) + stateSourceObject.Load(3, &i.RootHash) + stateSourceObject.Load(4, &i.AllowRuntimeEnable) + stateSourceObject.Load(5, &i.LowerGetFSOptions) + stateSourceObject.Load(6, &i.Action) +} + +func (d *dentry) StateTypeName() string { + return "pkg/sentry/fsimpl/verity.dentry" +} + +func (d *dentry) StateFields() []string { + return []string{ + "vfsd", + "refs", + "fs", + "mode", + "uid", + "gid", + "size", + "parent", + "name", + "children", + "childrenNames", + "lowerVD", + "lowerMerkleVD", + "symlinkTarget", + "hash", + } +} + +func (d *dentry) beforeSave() {} + +// +checklocksignore +func (d *dentry) StateSave(stateSinkObject state.Sink) { + d.beforeSave() + stateSinkObject.Save(0, &d.vfsd) + stateSinkObject.Save(1, &d.refs) + stateSinkObject.Save(2, &d.fs) + stateSinkObject.Save(3, &d.mode) + stateSinkObject.Save(4, &d.uid) + stateSinkObject.Save(5, &d.gid) + stateSinkObject.Save(6, &d.size) + stateSinkObject.Save(7, &d.parent) + stateSinkObject.Save(8, &d.name) + stateSinkObject.Save(9, &d.children) + stateSinkObject.Save(10, &d.childrenNames) + stateSinkObject.Save(11, &d.lowerVD) + stateSinkObject.Save(12, &d.lowerMerkleVD) + stateSinkObject.Save(13, &d.symlinkTarget) + stateSinkObject.Save(14, &d.hash) +} + +// +checklocksignore +func (d *dentry) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &d.vfsd) + stateSourceObject.Load(1, &d.refs) + stateSourceObject.Load(2, &d.fs) + stateSourceObject.Load(3, &d.mode) + stateSourceObject.Load(4, &d.uid) + stateSourceObject.Load(5, &d.gid) + stateSourceObject.Load(6, &d.size) + stateSourceObject.Load(7, &d.parent) + stateSourceObject.Load(8, &d.name) + stateSourceObject.Load(9, &d.children) + stateSourceObject.Load(10, &d.childrenNames) + stateSourceObject.Load(11, &d.lowerVD) + stateSourceObject.Load(12, &d.lowerMerkleVD) + stateSourceObject.Load(13, &d.symlinkTarget) + stateSourceObject.Load(14, &d.hash) + stateSourceObject.AfterLoad(d.afterLoad) +} + +func (fd *fileDescription) StateTypeName() string { + return "pkg/sentry/fsimpl/verity.fileDescription" +} + +func (fd *fileDescription) StateFields() []string { + return []string{ + "vfsfd", + "FileDescriptionDefaultImpl", + "d", + "isDir", + "lowerFD", + "lowerMappable", + "merkleReader", + "merkleWriter", + "parentMerkleWriter", + "off", + } +} + +func (fd *fileDescription) beforeSave() {} + +// +checklocksignore +func (fd *fileDescription) StateSave(stateSinkObject state.Sink) { + fd.beforeSave() + stateSinkObject.Save(0, &fd.vfsfd) + stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) + stateSinkObject.Save(2, &fd.d) + stateSinkObject.Save(3, &fd.isDir) + stateSinkObject.Save(4, &fd.lowerFD) + stateSinkObject.Save(5, &fd.lowerMappable) + stateSinkObject.Save(6, &fd.merkleReader) + stateSinkObject.Save(7, &fd.merkleWriter) + stateSinkObject.Save(8, &fd.parentMerkleWriter) + stateSinkObject.Save(9, &fd.off) +} + +func (fd *fileDescription) afterLoad() {} + +// +checklocksignore +func (fd *fileDescription) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &fd.vfsfd) + stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) + stateSourceObject.Load(2, &fd.d) + stateSourceObject.Load(3, &fd.isDir) + stateSourceObject.Load(4, &fd.lowerFD) + stateSourceObject.Load(5, &fd.lowerMappable) + stateSourceObject.Load(6, &fd.merkleReader) + stateSourceObject.Load(7, &fd.merkleWriter) + stateSourceObject.Load(8, &fd.parentMerkleWriter) + stateSourceObject.Load(9, &fd.off) +} + +func init() { + state.Register((*FilesystemType)(nil)) + state.Register((*filesystem)(nil)) + state.Register((*InternalFilesystemOptions)(nil)) + state.Register((*dentry)(nil)) + state.Register((*fileDescription)(nil)) +} |