| author | Ayush Ranjan <ayushranjan@google.com> | 2021-09-21 14:07:09 -0700 |
|---|---|---|
| committer | gVisor bot <gvisor-bot@google.com> | 2021-09-21 14:09:18 -0700 |
| commit | 6fccc185609e37b0e3346f8df91bdcb37bc990db (patch) | |
| tree | 073420614d4dfbc5c13617633d99b38c050e0a86 /pkg | |
| parent | e819029f3ad059bfc1635b7f2a196c332fa7532f (diff) | |
[lisa] Implement lisafs protocol methods in VFS2 gofer client and fsgofer.
Introduces RPC methods in lisafs. Makes the gofer client use lisafs RPCs
instead of p9 when lisafs is enabled.
Implements the handlers for those methods in fsgofer.
Fixes #5465
PiperOrigin-RevId: 398080310
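
The client-side flow this enables looks roughly like the following. This is a hypothetical usage sketch, not code from the commit: the helper `readFile` and its package are made up, while the `lisafs.Client`/`ClientFD` calls mirror the signatures added in pkg/lisafs/client_file.go below.

```go
package example

import (
	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/lisafs"
)

// readFile walks one path component under the mount root, opens the file
// read-only, reads up to 4 KiB via the PRead RPC, and queues both FDs for a
// batched Close RPC.
func readFile(ctx context.Context, c *lisafs.Client, root lisafs.ClientFD, name string) ([]byte, error) {
	inode, err := root.Walk(ctx, name)
	if err != nil {
		return nil, err
	}
	child := c.NewFD(inode.ControlFD)
	// Queued for closing; a Close RPC is only issued once enough FDs
	// (fdsToCloseBatchSize) have accumulated on the client.
	defer child.CloseBatched(ctx)

	// OpenAt duplicates the control FD into an open FD with the given flags.
	openFDID, hostFD, err := child.OpenAt(ctx, unix.O_RDONLY)
	if err != nil {
		return nil, err
	}
	if hostFD >= 0 {
		// The server may donate a host FD for the opened file; this sketch
		// reads through the PRead RPC instead, so release it. (Assumption: a
		// negative value means no FD was donated.)
		unix.Close(hostFD)
	}
	openFD := c.NewFD(openFDID)
	defer openFD.CloseBatched(ctx)

	buf := make([]byte, 4096)
	n, err := openFD.Read(ctx, buf, 0)
	if err != nil {
		return nil, err
	}
	return buf[:n], nil
}
```

Each of these calls is a single message identified by an MID, and several messages operate on multiple FDs at once (Close, FSync) or walk multiple components at once (Walk, WalkStat), which keeps round trips down.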
Diffstat (limited to 'pkg')
25 files changed, 4192 insertions, 336 deletions
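
On the server side, the commit adds one handler per new message ID. Below is a minimal sketch of the common handler shape, assuming it lives in package lisafs next to handlers.go; `statHandlerSketch` is a made-up name, and the body restates the pattern followed by the simpler control-FD handlers in the diff (FStatFSHandler, SetStatHandler, and friends).

```go
func statHandlerSketch(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) {
	// 1. Unmarshal the request straight out of the communicator's payload
	//    buffer; fixed-size messages use the generated UnmarshalUnsafe.
	var req StatReq
	req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen))

	// 2. Resolve the FDID to the server-side control FD and hold a ref
	//    across the call.
	fd, err := c.LookupControlFD(req.FD)
	if err != nil {
		return 0, err
	}
	defer fd.DecRef(nil)

	// 3. Dispatch to the filesystem implementation (fsgofer implements
	//    ControlFDImpl); it writes any response into comm's payload buffer
	//    and returns the response length.
	return fd.impl.Stat(c, comm)
}
```

Each such handler is registered in the `handlers` array indexed by its MID, which the diff below extends with entries for FStat, SetStat, Walk, OpenAt, and the rest of the new messages.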
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 1e23850a9..67646f837 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -242,7 +242,7 @@ const ( // Statx represents struct statx. // -// +marshal +// +marshal slice:StatxSlice type Statx struct { Mask uint32 Blksize uint32 @@ -270,6 +270,8 @@ type Statx struct { var SizeOfStatx = (*Statx)(nil).SizeBytes() // FileMode represents a mode_t. +// +// +marshal type FileMode uint16 // Permissions returns just the permission bits. diff --git a/pkg/lisafs/BUILD b/pkg/lisafs/BUILD index 9914ed2f5..313c1756d 100644 --- a/pkg/lisafs/BUILD +++ b/pkg/lisafs/BUILD @@ -57,6 +57,7 @@ go_library( srcs = [ "channel.go", "client.go", + "client_file.go", "communicator.go", "connection.go", "control_fd_list.go", diff --git a/pkg/lisafs/client.go b/pkg/lisafs/client.go index c99f8c73d..ccf1b9f72 100644 --- a/pkg/lisafs/client.go +++ b/pkg/lisafs/client.go @@ -20,12 +20,19 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) +const ( + // fdsToCloseBatchSize is the number of closed FDs batched before an Close + // RPC is made to close them all. fdsToCloseBatchSize is immutable. + fdsToCloseBatchSize = 100 +) + // Client helps manage a connection to the lisafs server and pass messages // efficiently. There is a 1:1 mapping between a Connection and a Client. type Client struct { @@ -53,6 +60,12 @@ type Client struct { // maxMessageSize is the maximum payload length (in bytes) that can be sent. // It is initialized on Mount and is immutable. maxMessageSize uint32 + + // fdsToClose tracks the FDs to close. It caches the FDs no longer being used + // by the client and closes them in one shot. It is not preserved across + // checkpoint/restore as FDIDs are not preserved. + fdsMu sync.Mutex + fdsToClose []FDID } // NewClient creates a new client for communication with the server. It mounts @@ -66,6 +79,7 @@ func NewClient(sock *unet.Socket, mountPath string) (*Client, *Inode, error) { channels: make([]*channel, 0, maxChans), availableChannels: make([]*channel, 0, maxChans), maxMessageSize: 1 << 20, // 1 MB for now. + fdsToClose: make([]FDID, 0, fdsToCloseBatchSize), } // Start a goroutine to check socket health. This goroutine is also @@ -245,6 +259,47 @@ func (c *Client) IsSupported(m MID) bool { return int(m) < len(c.supported) && c.supported[m] } +// CloseFDBatched either queues the passed FD to be closed or makes a batch +// RPC to close all the accumulated FDs-to-close. +func (c *Client) CloseFDBatched(ctx context.Context, fd FDID) { + c.fdsMu.Lock() + c.fdsToClose = append(c.fdsToClose, fd) + if len(c.fdsToClose) < fdsToCloseBatchSize { + c.fdsMu.Unlock() + return + } + + // Flush the cache. We should not hold fdsMu while making an RPC, so be sure + // to copy the fdsToClose to another buffer before unlocking fdsMu. + var toCloseArr [fdsToCloseBatchSize]FDID + toClose := toCloseArr[:len(c.fdsToClose)] + copy(toClose, c.fdsToClose) + + // Clear fdsToClose so other FDIDs can be appended. 
+ c.fdsToClose = c.fdsToClose[:0] + c.fdsMu.Unlock() + + req := CloseReq{FDs: toClose} + ctx.UninterruptibleSleepStart(false) + err := c.SndRcvMessage(Close, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + log.Warningf("lisafs: batch closing FDs returned error: %v", err) + } +} + +// SyncFDs makes a Fsync RPC to sync multiple FDs. +func (c *Client) SyncFDs(ctx context.Context, fds []FDID) error { + if len(fds) == 0 { + return nil + } + req := FsyncReq{FDs: fds} + ctx.UninterruptibleSleepStart(false) + err := c.SndRcvMessage(FSync, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + // SndRcvMessage invokes reqMarshal to marshal the request onto the payload // buffer, wakes up the server to process the request, waits for the response // and invokes respUnmarshal with the response payload. respFDs is populated diff --git a/pkg/lisafs/client_file.go b/pkg/lisafs/client_file.go new file mode 100644 index 000000000..0f8788f3b --- /dev/null +++ b/pkg/lisafs/client_file.go @@ -0,0 +1,475 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lisafs + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal/primitive" +) + +// ClientFD is a wrapper around FDID that provides client-side utilities +// so that RPC making is easier. +type ClientFD struct { + fd FDID + client *Client +} + +// ID returns the underlying FDID. +func (f *ClientFD) ID() FDID { + return f.fd +} + +// Client returns the backing Client. +func (f *ClientFD) Client() *Client { + return f.client +} + +// NewFD initializes a new ClientFD. +func (c *Client) NewFD(fd FDID) ClientFD { + return ClientFD{ + client: c, + fd: fd, + } +} + +// Ok returns true if the underlying FD is ok. +func (f *ClientFD) Ok() bool { + return f.fd.Ok() +} + +// CloseBatched queues this FD to be closed on the server and resets f.fd. +// This maybe invoke the Close RPC if the queue is full. +func (f *ClientFD) CloseBatched(ctx context.Context) { + f.client.CloseFDBatched(ctx, f.fd) + f.fd = InvalidFDID +} + +// Close closes this FD immediately (invoking a Close RPC). Consider using +// CloseBatched if closing this FD on remote right away is not critical. +func (f *ClientFD) Close(ctx context.Context) error { + fdArr := [1]FDID{f.fd} + req := CloseReq{FDs: fdArr[:]} + + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(Close, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// OpenAt makes the OpenAt RPC. 
+func (f *ClientFD) OpenAt(ctx context.Context, flags uint32) (FDID, int, error) { + req := OpenAtReq{ + FD: f.fd, + Flags: flags, + } + var respFD [1]int + var resp OpenAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(OpenAt, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.UnmarshalUnsafe, respFD[:]) + ctx.UninterruptibleSleepFinish(false) + return resp.NewFD, respFD[0], err +} + +// OpenCreateAt makes the OpenCreateAt RPC. +func (f *ClientFD) OpenCreateAt(ctx context.Context, name string, flags uint32, mode linux.FileMode, uid UID, gid GID) (Inode, FDID, int, error) { + var req OpenCreateAtReq + req.DirFD = f.fd + req.Name = SizedString(name) + req.Flags = primitive.Uint32(flags) + req.Mode = mode + req.UID = uid + req.GID = gid + + var respFD [1]int + var resp OpenCreateAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(OpenCreateAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalUnsafe, respFD[:]) + ctx.UninterruptibleSleepFinish(false) + return resp.Child, resp.NewFD, respFD[0], err +} + +// StatTo makes the Fstat RPC and populates stat with the result. +func (f *ClientFD) StatTo(ctx context.Context, stat *linux.Statx) error { + req := StatReq{FD: f.fd} + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FStat, uint32(req.SizeBytes()), req.MarshalUnsafe, stat.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// Sync makes the Fsync RPC. +func (f *ClientFD) Sync(ctx context.Context) error { + req := FsyncReq{FDs: []FDID{f.fd}} + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FSync, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// Read makes the PRead RPC. +func (f *ClientFD) Read(ctx context.Context, dst []byte, offset uint64) (uint64, error) { + req := PReadReq{ + Offset: offset, + FD: f.fd, + Count: uint32(len(dst)), + } + + resp := PReadResp{ + // This will be unmarshalled into. Already set Buf so that we don't need to + // allocate a temporary buffer during unmarshalling. + // PReadResp.UnmarshalBytes expects this to be set. + Buf: dst, + } + + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(PRead, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return uint64(resp.NumBytes), err +} + +// Write makes the PWrite RPC. +func (f *ClientFD) Write(ctx context.Context, src []byte, offset uint64) (uint64, error) { + req := PWriteReq{ + Offset: primitive.Uint64(offset), + FD: f.fd, + NumBytes: primitive.Uint32(len(src)), + Buf: src, + } + + var resp PWriteResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(PWrite, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return resp.Count, err +} + +// MkdirAt makes the MkdirAt RPC. +func (f *ClientFD) MkdirAt(ctx context.Context, name string, mode linux.FileMode, uid UID, gid GID) (*Inode, error) { + var req MkdirAtReq + req.DirFD = f.fd + req.Name = SizedString(name) + req.Mode = mode + req.UID = uid + req.GID = gid + + var resp MkdirAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(MkdirAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return &resp.ChildDir, err +} + +// SymlinkAt makes the SymlinkAt RPC. 
+func (f *ClientFD) SymlinkAt(ctx context.Context, name, target string, uid UID, gid GID) (*Inode, error) { + req := SymlinkAtReq{ + DirFD: f.fd, + Name: SizedString(name), + Target: SizedString(target), + UID: uid, + GID: gid, + } + + var resp SymlinkAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(SymlinkAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return &resp.Symlink, err +} + +// LinkAt makes the LinkAt RPC. +func (f *ClientFD) LinkAt(ctx context.Context, targetFD FDID, name string) (*Inode, error) { + req := LinkAtReq{ + DirFD: f.fd, + Target: targetFD, + Name: SizedString(name), + } + + var resp LinkAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(LinkAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return &resp.Link, err +} + +// MknodAt makes the MknodAt RPC. +func (f *ClientFD) MknodAt(ctx context.Context, name string, mode linux.FileMode, uid UID, gid GID, minor, major uint32) (*Inode, error) { + var req MknodAtReq + req.DirFD = f.fd + req.Name = SizedString(name) + req.Mode = mode + req.UID = uid + req.GID = gid + req.Minor = primitive.Uint32(minor) + req.Major = primitive.Uint32(major) + + var resp MknodAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(MknodAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return &resp.Child, err +} + +// SetStat makes the SetStat RPC. +func (f *ClientFD) SetStat(ctx context.Context, stat *linux.Statx) (uint32, error, error) { + req := SetStatReq{ + FD: f.fd, + Mask: stat.Mask, + Mode: uint32(stat.Mode), + UID: UID(stat.UID), + GID: GID(stat.GID), + Size: stat.Size, + Atime: linux.Timespec{ + Sec: stat.Atime.Sec, + Nsec: int64(stat.Atime.Nsec), + }, + Mtime: linux.Timespec{ + Sec: stat.Mtime.Sec, + Nsec: int64(stat.Mtime.Nsec), + }, + } + + var resp SetStatResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(SetStat, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return resp.FailureMask, unix.Errno(resp.FailureErrNo), err +} + +// WalkMultiple makes the Walk RPC with multiple path components. +func (f *ClientFD) WalkMultiple(ctx context.Context, names []string) (WalkStatus, []Inode, error) { + req := WalkReq{ + DirFD: f.fd, + Path: StringArray(names), + } + + var resp WalkResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(Walk, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return resp.Status, resp.Inodes, err +} + +// Walk makes the Walk RPC with just one path component to walk. +func (f *ClientFD) Walk(ctx context.Context, name string) (*Inode, error) { + req := WalkReq{ + DirFD: f.fd, + Path: []string{name}, + } + + var inode [1]Inode + resp := WalkResp{Inodes: inode[:]} + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(Walk, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return nil, err + } + + switch resp.Status { + case WalkComponentDoesNotExist: + return nil, unix.ENOENT + case WalkComponentSymlink: + // f is not a directory which can be walked on. 
+ return nil, unix.ENOTDIR + } + + if n := len(resp.Inodes); n > 1 { + for i := range resp.Inodes { + f.client.CloseFDBatched(ctx, resp.Inodes[i].ControlFD) + } + log.Warningf("requested to walk one component, but got %d results", n) + return nil, unix.EIO + } else if n == 0 { + log.Warningf("walk has success status but no results returned") + return nil, unix.ENOENT + } + return &inode[0], err +} + +// WalkStat makes the WalkStat RPC with multiple path components to walk. +func (f *ClientFD) WalkStat(ctx context.Context, names []string) ([]linux.Statx, error) { + req := WalkReq{ + DirFD: f.fd, + Path: StringArray(names), + } + + var resp WalkStatResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(WalkStat, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return resp.Stats, err +} + +// StatFSTo makes the FStatFS RPC and populates statFS with the result. +func (f *ClientFD) StatFSTo(ctx context.Context, statFS *StatFS) error { + req := FStatFSReq{FD: f.fd} + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FStatFS, uint32(req.SizeBytes()), req.MarshalUnsafe, statFS.UnmarshalUnsafe, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// Allocate makes the FAllocate RPC. +func (f *ClientFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + req := FAllocateReq{ + FD: f.fd, + Mode: mode, + Offset: offset, + Length: length, + } + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FAllocate, uint32(req.SizeBytes()), req.MarshalUnsafe, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// ReadLinkAt makes the ReadLinkAt RPC. +func (f *ClientFD) ReadLinkAt(ctx context.Context) (string, error) { + req := ReadLinkAtReq{FD: f.fd} + var resp ReadLinkAtResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(ReadLinkAt, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return string(resp.Target), err +} + +// Flush makes the Flush RPC. +func (f *ClientFD) Flush(ctx context.Context) error { + if !f.client.IsSupported(Flush) { + // If Flush is not supported, it probably means that it would be a noop. + return nil + } + req := FlushReq{FD: f.fd} + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(Flush, uint32(req.SizeBytes()), req.MarshalUnsafe, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// Connect makes the Connect RPC. +func (f *ClientFD) Connect(ctx context.Context, sockType linux.SockType) (int, error) { + req := ConnectReq{FD: f.fd, SockType: uint32(sockType)} + var sockFD [1]int + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(Connect, uint32(req.SizeBytes()), req.MarshalUnsafe, NoopUnmarshal, sockFD[:]) + ctx.UninterruptibleSleepFinish(false) + if err == nil && sockFD[0] < 0 { + err = unix.EBADF + } + return sockFD[0], err +} + +// UnlinkAt makes the UnlinkAt RPC. +func (f *ClientFD) UnlinkAt(ctx context.Context, name string, flags uint32) error { + req := UnlinkAtReq{ + DirFD: f.fd, + Name: SizedString(name), + Flags: primitive.Uint32(flags), + } + + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(UnlinkAt, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// RenameTo makes the RenameAt RPC which renames f to newDirFD directory with +// name newName. 
+func (f *ClientFD) RenameTo(ctx context.Context, newDirFD FDID, newName string) error { + req := RenameAtReq{ + Renamed: f.fd, + NewDir: newDirFD, + NewName: SizedString(newName), + } + + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(RenameAt, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// Getdents64 makes the Getdents64 RPC. +func (f *ClientFD) Getdents64(ctx context.Context, count int32) ([]Dirent64, error) { + req := Getdents64Req{ + DirFD: f.fd, + Count: count, + } + + var resp Getdents64Resp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(Getdents64, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return resp.Dirents, err +} + +// ListXattr makes the FListXattr RPC. +func (f *ClientFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { + req := FListXattrReq{ + FD: f.fd, + Size: size, + } + + var resp FListXattrResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FListXattr, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return resp.Xattrs, err +} + +// GetXattr makes the FGetXattr RPC. +func (f *ClientFD) GetXattr(ctx context.Context, name string, size uint64) (string, error) { + req := FGetXattrReq{ + FD: f.fd, + Name: SizedString(name), + BufSize: primitive.Uint32(size), + } + + var resp FGetXattrResp + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FGetXattr, uint32(req.SizeBytes()), req.MarshalBytes, resp.UnmarshalBytes, nil) + ctx.UninterruptibleSleepFinish(false) + return string(resp.Value), err +} + +// SetXattr makes the FSetXattr RPC. +func (f *ClientFD) SetXattr(ctx context.Context, name string, value string, flags uint32) error { + req := FSetXattrReq{ + FD: f.fd, + Name: SizedString(name), + Value: SizedString(value), + Flags: primitive.Uint32(flags), + } + + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FSetXattr, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} + +// RemoveXattr makes the FRemoveXattr RPC. +func (f *ClientFD) RemoveXattr(ctx context.Context, name string) error { + req := FRemoveXattrReq{ + FD: f.fd, + Name: SizedString(name), + } + + ctx.UninterruptibleSleepStart(false) + err := f.client.SndRcvMessage(FRemoveXattr, uint32(req.SizeBytes()), req.MarshalBytes, NoopUnmarshal, nil) + ctx.UninterruptibleSleepFinish(false) + return err +} diff --git a/pkg/lisafs/connection.go b/pkg/lisafs/connection.go index 8dba4805f..f6e5ecb4f 100644 --- a/pkg/lisafs/connection.go +++ b/pkg/lisafs/connection.go @@ -289,6 +289,22 @@ func (c *Connection) RemoveFD(id FDID) { } } +// RemoveControlFDLocked is the same as RemoveFD with added preconditions. +// +// Preconditions: +// * server's rename mutex must at least be read locked. +// * id must be pointing to a control FD. +func (c *Connection) RemoveControlFDLocked(id FDID) { + c.fdsMu.Lock() + fd := c.removeFDLocked(id) + c.fdsMu.Unlock() + if fd != nil { + // Drop the ref held by c. This can take arbitrarily long. So do not hold + // c.fdsMu while calling it. + fd.(*ControlFD).DecRefLocked() + } +} + // removeFDLocked makes c stop tracking the passed FDID. Note that the caller // must drop ref on the returned fd (preferably without holding c.fdsMu). 
// diff --git a/pkg/lisafs/fd.go b/pkg/lisafs/fd.go index 9dd8ba384..cc6919a1b 100644 --- a/pkg/lisafs/fd.go +++ b/pkg/lisafs/fd.go @@ -231,7 +231,7 @@ func (fd *ControlFD) FilePath() string { return fd.FilePathLocked() } -// FilePathLocked is the same as FilePath with the additonal precondition. +// FilePathLocked is the same as FilePath with the additional precondition. // // Precondition: server's rename mutex must be at least read locked. func (fd *ControlFD) FilePathLocked() string { @@ -333,6 +333,25 @@ func (fd *OpenFD) Init(cfd *ControlFD, flags uint32, impl OpenFDImpl) { type ControlFDImpl interface { FD() *ControlFD Close(c *Connection) + Stat(c *Connection, comm Communicator) (uint32, error) + SetStat(c *Connection, comm Communicator, stat SetStatReq) (uint32, error) + Walk(c *Connection, comm Communicator, path StringArray) (uint32, error) + WalkStat(c *Connection, comm Communicator, path StringArray) (uint32, error) + Open(c *Connection, comm Communicator, flags uint32) (uint32, error) + OpenCreate(c *Connection, comm Communicator, mode linux.FileMode, uid UID, gid GID, name string, flags uint32) (uint32, error) + Mkdir(c *Connection, comm Communicator, mode linux.FileMode, uid UID, gid GID, name string) (uint32, error) + Mknod(c *Connection, comm Communicator, mode linux.FileMode, uid UID, gid GID, name string, minor uint32, major uint32) (uint32, error) + Symlink(c *Connection, comm Communicator, name string, target string, uid UID, gid GID) (uint32, error) + Link(c *Connection, comm Communicator, dir ControlFDImpl, name string) (uint32, error) + StatFS(c *Connection, comm Communicator) (uint32, error) + Readlink(c *Connection, comm Communicator) (uint32, error) + Connect(c *Connection, comm Communicator, sockType uint32) error + Unlink(c *Connection, name string, flags uint32) error + RenameLocked(c *Connection, newDir ControlFDImpl, newName string) (func(ControlFDImpl), func(), error) + GetXattr(c *Connection, comm Communicator, name string, size uint32) (uint32, error) + SetXattr(c *Connection, name string, value string, flags uint32) error + ListXattr(c *Connection, comm Communicator, size uint64) (uint32, error) + RemoveXattr(c *Connection, comm Communicator, name string) error } // OpenFDImpl contains implementation details for a OpenFD. Implementations of @@ -345,4 +364,11 @@ type ControlFDImpl interface { type OpenFDImpl interface { FD() *OpenFD Close(c *Connection) + Stat(c *Connection, comm Communicator) (uint32, error) + Sync(c *Connection) error + Write(c *Connection, comm Communicator, buf []byte, off uint64) (uint32, error) + Read(c *Connection, comm Communicator, off uint64, count uint32) (uint32, error) + Allocate(c *Connection, mode, off, length uint64) error + Flush(c *Connection) error + Getdent64(c *Connection, comm Communicator, count uint32, seek0 bool) (uint32, error) } diff --git a/pkg/lisafs/handlers.go b/pkg/lisafs/handlers.go index 9b8d8164a..82807734d 100644 --- a/pkg/lisafs/handlers.go +++ b/pkg/lisafs/handlers.go @@ -15,15 +15,23 @@ package lisafs import ( + "fmt" "path" "path/filepath" + "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/flipcall" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" ) +const ( + allowedOpenFlags = unix.O_ACCMODE | unix.O_TRUNC + setStatSupportedMask = unix.STATX_MODE | unix.STATX_UID | unix.STATX_GID | unix.STATX_SIZE | unix.STATX_ATIME | unix.STATX_MTIME +) + // RPCHandler defines a handler that is invoked when the associated message is // received. 
The handler is responsible for: // @@ -34,9 +42,35 @@ import ( type RPCHandler func(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) var handlers = [...]RPCHandler{ - Error: ErrorHandler, - Mount: MountHandler, - Channel: ChannelHandler, + Error: ErrorHandler, + Mount: MountHandler, + Channel: ChannelHandler, + FStat: FStatHandler, + SetStat: SetStatHandler, + Walk: WalkHandler, + WalkStat: WalkStatHandler, + OpenAt: OpenAtHandler, + OpenCreateAt: OpenCreateAtHandler, + Close: CloseHandler, + FSync: FSyncHandler, + PWrite: PWriteHandler, + PRead: PReadHandler, + MkdirAt: MkdirAtHandler, + MknodAt: MknodAtHandler, + SymlinkAt: SymlinkAtHandler, + LinkAt: LinkAtHandler, + FStatFS: FStatFSHandler, + FAllocate: FAllocateHandler, + ReadLinkAt: ReadLinkAtHandler, + Flush: FlushHandler, + Connect: ConnectHandler, + UnlinkAt: UnlinkAtHandler, + RenameAt: RenameAtHandler, + Getdents64: Getdents64Handler, + FGetXattr: FGetXattrHandler, + FSetXattr: FSetXattrHandler, + FListXattr: FListXattrHandler, + FRemoveXattr: FRemoveXattrHandler, } // ErrorHandler handles Error message. @@ -122,3 +156,613 @@ func ChannelHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32 resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } + +// FStatHandler handles the FStat RPC. +func FStatHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req StatReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.lookupFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + + switch t := fd.(type) { + case *ControlFD: + return t.impl.Stat(c, comm) + case *OpenFD: + return t.impl.Stat(c, comm) + default: + panic(fmt.Sprintf("unknown fd type %T", t)) + } +} + +// SetStatHandler handles the SetStat RPC. +func SetStatHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + + var req SetStatReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + + if req.Mask&^setStatSupportedMask != 0 { + return 0, unix.EPERM + } + + return fd.impl.SetStat(c, comm, req) +} + +// WalkHandler handles the Walk RPC. +func WalkHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req WalkReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + for _, name := range req.Path { + if err := checkSafeName(name); err != nil { + return 0, err + } + } + + return fd.impl.Walk(c, comm, req.Path) +} + +// WalkStatHandler handles the WalkStat RPC. +func WalkStatHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req WalkReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + + // Note that this fd is allowed to not actually be a directory when the + // only path component to walk is "" (self). + if !fd.IsDir() { + if len(req.Path) > 1 || (len(req.Path) == 1 && len(req.Path[0]) > 0) { + return 0, unix.ENOTDIR + } + } + for i, name := range req.Path { + // First component is allowed to be "". 
+ if i == 0 && len(name) == 0 { + continue + } + if err := checkSafeName(name); err != nil { + return 0, err + } + } + + return fd.impl.WalkStat(c, comm, req.Path) +} + +// OpenAtHandler handles the OpenAt RPC. +func OpenAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req OpenAtReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + // Only keep allowed open flags. + if allowedFlags := req.Flags & allowedOpenFlags; allowedFlags != req.Flags { + log.Debugf("discarding open flags that are not allowed: old open flags = %d, new open flags = %d", req.Flags, allowedFlags) + req.Flags = allowedFlags + } + + accessMode := req.Flags & unix.O_ACCMODE + trunc := req.Flags&unix.O_TRUNC != 0 + if c.readonly && (accessMode != unix.O_RDONLY || trunc) { + return 0, unix.EROFS + } + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if fd.IsDir() { + // Directory is not truncatable and must be opened with O_RDONLY. + if accessMode != unix.O_RDONLY || trunc { + return 0, unix.EISDIR + } + } + + return fd.impl.Open(c, comm, req.Flags) +} + +// OpenCreateAtHandler handles the OpenCreateAt RPC. +func OpenCreateAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req OpenCreateAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + // Only keep allowed open flags. + if allowedFlags := req.Flags & allowedOpenFlags; allowedFlags != req.Flags { + log.Debugf("discarding open flags that are not allowed: old open flags = %d, new open flags = %d", req.Flags, allowedFlags) + req.Flags = allowedFlags + } + + name := string(req.Name) + if err := checkSafeName(name); err != nil { + return 0, err + } + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + + return fd.impl.OpenCreate(c, comm, req.Mode, req.UID, req.GID, name, uint32(req.Flags)) +} + +// CloseHandler handles the Close RPC. +func CloseHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req CloseReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + for _, fd := range req.FDs { + c.RemoveFD(fd) + } + + // There is no response message for this. + return 0, nil +} + +// FSyncHandler handles the FSync RPC. +func FSyncHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req FsyncReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + // Return the first error we encounter, but sync everything we can + // regardless. + var retErr error + for _, fdid := range req.FDs { + if err := c.fsyncFD(fdid); err != nil && retErr == nil { + retErr = err + } + } + + // There is no response message for this. + return 0, retErr +} + +func (c *Connection) fsyncFD(id FDID) error { + fd, err := c.LookupOpenFD(id) + if err != nil { + return err + } + return fd.impl.Sync(c) +} + +// PWriteHandler handles the PWrite RPC. +func PWriteHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req PWriteReq + // Note that it is an optimized Unmarshal operation which avoids any buffer + // allocation and copying. req.Buf just points to payload. This is safe to do + // as the handler owns payload and req's lifetime is limited to the handler. 
+ req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + fd, err := c.LookupOpenFD(req.FD) + if err != nil { + return 0, err + } + if !fd.writable { + return 0, unix.EBADF + } + return fd.impl.Write(c, comm, req.Buf, uint64(req.Offset)) +} + +// PReadHandler handles the PRead RPC. +func PReadHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req PReadReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupOpenFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.readable { + return 0, unix.EBADF + } + return fd.impl.Read(c, comm, req.Offset, req.Count) +} + +// MkdirAtHandler handles the MkdirAt RPC. +func MkdirAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req MkdirAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + name := string(req.Name) + if err := checkSafeName(name); err != nil { + return 0, err + } + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + return fd.impl.Mkdir(c, comm, req.Mode, req.UID, req.GID, name) +} + +// MknodAtHandler handles the MknodAt RPC. +func MknodAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req MknodAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + name := string(req.Name) + if err := checkSafeName(name); err != nil { + return 0, err + } + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + return fd.impl.Mknod(c, comm, req.Mode, req.UID, req.GID, name, uint32(req.Minor), uint32(req.Major)) +} + +// SymlinkAtHandler handles the SymlinkAt RPC. +func SymlinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req SymlinkAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + name := string(req.Name) + if err := checkSafeName(name); err != nil { + return 0, err + } + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + return fd.impl.Symlink(c, comm, name, string(req.Target), req.UID, req.GID) +} + +// LinkAtHandler handles the LinkAt RPC. +func LinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req LinkAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + name := string(req.Name) + if err := checkSafeName(name); err != nil { + return 0, err + } + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + + targetFD, err := c.LookupControlFD(req.Target) + if err != nil { + return 0, err + } + return targetFD.impl.Link(c, comm, fd.impl, name) +} + +// FStatFSHandler handles the FStatFS RPC. +func FStatFSHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req FStatFSReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + return fd.impl.StatFS(c, comm) +} + +// FAllocateHandler handles the FAllocate RPC. 
+func FAllocateHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req FAllocateReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupOpenFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.writable { + return 0, unix.EBADF + } + return 0, fd.impl.Allocate(c, req.Mode, req.Offset, req.Length) +} + +// ReadLinkAtHandler handles the ReadLinkAt RPC. +func ReadLinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req ReadLinkAtReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsSymlink() { + return 0, unix.EINVAL + } + return fd.impl.Readlink(c, comm) +} + +// FlushHandler handles the Flush RPC. +func FlushHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req FlushReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupOpenFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + + return 0, fd.impl.Flush(c) +} + +// ConnectHandler handles the Connect RPC. +func ConnectHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req ConnectReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsSocket() { + return 0, unix.ENOTSOCK + } + return 0, fd.impl.Connect(c, comm, req.SockType) +} + +// UnlinkAtHandler handles the UnlinkAt RPC. +func UnlinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req UnlinkAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + name := string(req.Name) + if err := checkSafeName(name); err != nil { + return 0, err + } + + fd, err := c.LookupControlFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.IsDir() { + return 0, unix.ENOTDIR + } + return 0, fd.impl.Unlink(c, name, uint32(req.Flags)) +} + +// RenameAtHandler handles the RenameAt RPC. +func RenameAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req RenameAtReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + newName := string(req.NewName) + if err := checkSafeName(newName); err != nil { + return 0, err + } + + renamed, err := c.LookupControlFD(req.Renamed) + if err != nil { + return 0, err + } + defer renamed.DecRef(nil) + + newDir, err := c.LookupControlFD(req.NewDir) + if err != nil { + return 0, err + } + defer newDir.DecRef(nil) + if !newDir.IsDir() { + return 0, unix.ENOTDIR + } + + // Hold RenameMu for writing during rename, this is important. + c.server.RenameMu.Lock() + defer c.server.RenameMu.Unlock() + + if renamed.parent == nil { + // renamed is root. + return 0, unix.EBUSY + } + + oldParentPath := renamed.parent.FilePathLocked() + oldPath := oldParentPath + "/" + renamed.name + if newName == renamed.name && oldParentPath == newDir.FilePathLocked() { + // Nothing to do. 
+ return 0, nil + } + + updateControlFD, cleanUp, err := renamed.impl.RenameLocked(c, newDir.impl, newName) + if err != nil { + return 0, err + } + + c.server.forEachMountPoint(func(root *ControlFD) { + if !strings.HasPrefix(oldPath, root.name) { + return + } + pit := fspath.Parse(oldPath[len(root.name):]).Begin + root.renameRecursiveLocked(newDir, newName, pit, updateControlFD) + }) + + if cleanUp != nil { + cleanUp() + } + return 0, nil +} + +// Precondition: rename mutex must be locked for writing. +func (fd *ControlFD) renameRecursiveLocked(newDir *ControlFD, newName string, pit fspath.Iterator, updateControlFD func(ControlFDImpl)) { + if !pit.Ok() { + // fd should be renamed. + fd.clearParentLocked() + fd.setParentLocked(newDir) + fd.name = newName + if updateControlFD != nil { + updateControlFD(fd.impl) + } + return + } + + cur := pit.String() + next := pit.Next() + // No need to hold fd.childrenMu because RenameMu is locked for writing. + for child := fd.children.Front(); child != nil; child = child.Next() { + if child.name == cur { + child.renameRecursiveLocked(newDir, newName, next, updateControlFD) + } + } +} + +// Getdents64Handler handles the Getdents64 RPC. +func Getdents64Handler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req Getdents64Req + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupOpenFD(req.DirFD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + if !fd.controlFD.IsDir() { + return 0, unix.ENOTDIR + } + + seek0 := false + if req.Count < 0 { + seek0 = true + req.Count = -req.Count + } + return fd.impl.Getdent64(c, comm, uint32(req.Count), seek0) +} + +// FGetXattrHandler handles the FGetXattr RPC. +func FGetXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req FGetXattrReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + return fd.impl.GetXattr(c, comm, string(req.Name), uint32(req.BufSize)) +} + +// FSetXattrHandler handles the FSetXattr RPC. +func FSetXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req FSetXattrReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + return 0, fd.impl.SetXattr(c, string(req.Name), string(req.Value), uint32(req.Flags)) +} + +// FListXattrHandler handles the FListXattr RPC. +func FListXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + var req FListXattrReq + req.UnmarshalUnsafe(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + return fd.impl.ListXattr(c, comm, req.Size) +} + +// FRemoveXattrHandler handles the FRemoveXattr RPC. +func FRemoveXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { + if c.readonly { + return 0, unix.EROFS + } + var req FRemoveXattrReq + req.UnmarshalBytes(comm.PayloadBuf(payloadLen)) + + fd, err := c.LookupControlFD(req.FD) + if err != nil { + return 0, err + } + defer fd.DecRef(nil) + return 0, fd.impl.RemoveXattr(c, comm, string(req.Name)) +} + +// checkSafeName validates the name and returns nil or returns an error. +func checkSafeName(name string) error { + if name != "" && !strings.Contains(name, "/") && name != "." && name != ".." 
{ + return nil + } + return unix.EINVAL +} diff --git a/pkg/lisafs/message.go b/pkg/lisafs/message.go index 55fd2c0b1..722afd0be 100644 --- a/pkg/lisafs/message.go +++ b/pkg/lisafs/message.go @@ -46,6 +46,94 @@ const ( // Channel requests to start a new communicational channel. Channel MID = 2 + + // FStat requests the stat(2) results for a specified file. + FStat MID = 3 + + // SetStat requests to change file attributes. Note that there is no one + // corresponding Linux syscall. This is a conglomeration of fchmod(2), + // fchown(2), ftruncate(2) and futimesat(2). + SetStat MID = 4 + + // Walk requests to walk the specified path starting from the specified + // directory. Server-side path traversal is terminated preemptively on + // symlinks entries because they can cause non-linear traversal. + Walk MID = 5 + + // WalkStat is the same as Walk, except the following differences: + // * If the first path component is "", then it also returns stat results + // for the directory where the walk starts. + // * Does not return Inode, just the Stat results for each path component. + WalkStat MID = 6 + + // OpenAt is analogous to openat(2). It does not perform any walk. It merely + // duplicates the control FD with the open flags passed. + OpenAt MID = 7 + + // OpenCreateAt is analogous to openat(2) with O_CREAT|O_EXCL added to flags. + // It also returns the newly created file inode. + OpenCreateAt MID = 8 + + // Close is analogous to close(2) but can work on multiple FDs. + Close MID = 9 + + // FSync is analogous to fsync(2) but can work on multiple FDs. + FSync MID = 10 + + // PWrite is analogous to pwrite(2). + PWrite MID = 11 + + // PRead is analogous to pread(2). + PRead MID = 12 + + // MkdirAt is analogous to mkdirat(2). + MkdirAt MID = 13 + + // MknodAt is analogous to mknodat(2). + MknodAt MID = 14 + + // SymlinkAt is analogous to symlinkat(2). + SymlinkAt MID = 15 + + // LinkAt is analogous to linkat(2). + LinkAt MID = 16 + + // FStatFS is analogous to fstatfs(2). + FStatFS MID = 17 + + // FAllocate is analogous to fallocate(2). + FAllocate MID = 18 + + // ReadLinkAt is analogous to readlinkat(2). + ReadLinkAt MID = 19 + + // Flush cleans up the file state. Its behavior is implementation + // dependent and might not even be supported in server implementations. + Flush MID = 20 + + // Connect is loosely analogous to connect(2). + Connect MID = 21 + + // UnlinkAt is analogous to unlinkat(2). + UnlinkAt MID = 22 + + // RenameAt is loosely analogous to renameat(2). + RenameAt MID = 23 + + // Getdents64 is analogous to getdents64(2). + Getdents64 MID = 24 + + // FGetXattr is analogous to fgetxattr(2). + FGetXattr MID = 25 + + // FSetXattr is analogous to fsetxattr(2). + FSetXattr MID = 26 + + // FListXattr is analogous to flistxattr(2). + FListXattr MID = 27 + + // FRemoveXattr is analogous to fremovexattr(2). + FRemoveXattr MID = 28 ) const ( @@ -256,3 +344,908 @@ type ChannelResp struct { type ErrorResp struct { errno uint32 } + +// StatReq requests the stat results for the specified FD. +// +// +marshal +type StatReq struct { + FD FDID +} + +// SetStatReq is used to set attributeds on FDs. +// +// +marshal +type SetStatReq struct { + FD FDID + _ uint32 + Mask uint32 + Mode uint32 // Only permissions part is settable. + UID UID + GID GID + Size uint64 + Atime linux.Timespec + Mtime linux.Timespec +} + +// SetStatResp is used to communicate SetStat results. It contains a mask +// representing the failed changes. It also contains the errno of the failed +// set attribute operation. 
If multiple operations failed then any of those +// errnos can be returned. +// +// +marshal +type SetStatResp struct { + FailureMask uint32 + FailureErrNo uint32 +} + +// WalkReq is used to request to walk multiple path components at once. This +// is used for both Walk and WalkStat. +type WalkReq struct { + DirFD FDID + Path StringArray +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (w *WalkReq) SizeBytes() int { + return w.DirFD.SizeBytes() + w.Path.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (w *WalkReq) MarshalBytes(dst []byte) { + w.DirFD.MarshalUnsafe(dst) + dst = dst[w.DirFD.SizeBytes():] + w.Path.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (w *WalkReq) UnmarshalBytes(src []byte) { + w.DirFD.UnmarshalUnsafe(src) + src = src[w.DirFD.SizeBytes():] + w.Path.UnmarshalBytes(src) +} + +// WalkStatus is used to indicate the reason for partial/unsuccessful server +// side Walk operations. Please note that partial/unsuccessful walk operations +// do not necessarily fail the RPC. The RPC is successful with a failure hint +// which can be used by the client to infer server-side state. +type WalkStatus = primitive.Uint8 + +const ( + // WalkSuccess indicates that all path components were successfully walked. + WalkSuccess WalkStatus = iota + + // WalkComponentDoesNotExist indicates that the walk was prematurely + // terminated because an intermediate path component does not exist on + // server. The results of all previous existing path components is returned. + WalkComponentDoesNotExist + + // WalkComponentSymlink indicates that the walk was prematurely + // terminated because an intermediate path component was a symlink. It is not + // safe to resolve symlinks remotely (unaware of mount points). + WalkComponentSymlink +) + +// WalkResp is used to communicate the inodes walked by the server. +type WalkResp struct { + Status WalkStatus + Inodes []Inode +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (w *WalkResp) SizeBytes() int { + return w.Status.SizeBytes() + + (*primitive.Uint32)(nil).SizeBytes() + (len(w.Inodes) * (*Inode)(nil).SizeBytes()) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (w *WalkResp) MarshalBytes(dst []byte) { + w.Status.MarshalUnsafe(dst) + dst = dst[w.Status.SizeBytes():] + + numInodes := primitive.Uint32(len(w.Inodes)) + numInodes.MarshalUnsafe(dst) + dst = dst[numInodes.SizeBytes():] + + MarshalUnsafeInodeSlice(w.Inodes, dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (w *WalkResp) UnmarshalBytes(src []byte) { + w.Status.UnmarshalUnsafe(src) + src = src[w.Status.SizeBytes():] + + var numInodes primitive.Uint32 + numInodes.UnmarshalUnsafe(src) + src = src[numInodes.SizeBytes():] + + if cap(w.Inodes) < int(numInodes) { + w.Inodes = make([]Inode, numInodes) + } else { + w.Inodes = w.Inodes[:numInodes] + } + UnmarshalUnsafeInodeSlice(w.Inodes, src) +} + +// WalkStatResp is used to communicate stat results for WalkStat. +type WalkStatResp struct { + Stats []linux.Statx +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (w *WalkStatResp) SizeBytes() int { + return (*primitive.Uint32)(nil).SizeBytes() + (len(w.Stats) * linux.SizeOfStatx) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. 
+func (w *WalkStatResp) MarshalBytes(dst []byte) { + numStats := primitive.Uint32(len(w.Stats)) + numStats.MarshalUnsafe(dst) + dst = dst[numStats.SizeBytes():] + + linux.MarshalUnsafeStatxSlice(w.Stats, dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (w *WalkStatResp) UnmarshalBytes(src []byte) { + var numStats primitive.Uint32 + numStats.UnmarshalUnsafe(src) + src = src[numStats.SizeBytes():] + + if cap(w.Stats) < int(numStats) { + w.Stats = make([]linux.Statx, numStats) + } else { + w.Stats = w.Stats[:numStats] + } + linux.UnmarshalUnsafeStatxSlice(w.Stats, src) +} + +// OpenAtReq is used to open existing FDs with the specified flags. +// +// +marshal +type OpenAtReq struct { + FD FDID + Flags uint32 +} + +// OpenAtResp is used to communicate the newly created FD. +// +// +marshal +type OpenAtResp struct { + NewFD FDID +} + +// +marshal +type createCommon struct { + DirFD FDID + Mode linux.FileMode + _ uint16 // Need to make struct packed. + UID UID + GID GID +} + +// OpenCreateAtReq is used to make OpenCreateAt requests. +type OpenCreateAtReq struct { + createCommon + Name SizedString + Flags primitive.Uint32 +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (o *OpenCreateAtReq) SizeBytes() int { + return o.createCommon.SizeBytes() + o.Name.SizeBytes() + o.Flags.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (o *OpenCreateAtReq) MarshalBytes(dst []byte) { + o.createCommon.MarshalUnsafe(dst) + dst = dst[o.createCommon.SizeBytes():] + o.Name.MarshalBytes(dst) + dst = dst[o.Name.SizeBytes():] + o.Flags.MarshalUnsafe(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (o *OpenCreateAtReq) UnmarshalBytes(src []byte) { + o.createCommon.UnmarshalUnsafe(src) + src = src[o.createCommon.SizeBytes():] + o.Name.UnmarshalBytes(src) + src = src[o.Name.SizeBytes():] + o.Flags.UnmarshalUnsafe(src) +} + +// OpenCreateAtResp is used to communicate successful OpenCreateAt results. +// +// +marshal +type OpenCreateAtResp struct { + Child Inode + NewFD FDID + _ uint32 // Need to make struct packed. +} + +// FdArray is a utility struct which implements a marshallable type for +// communicating an array of FDIDs. In memory, the array data is preceded by a +// uint32 denoting the array length. +type FdArray []FDID + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FdArray) SizeBytes() int { + return (*primitive.Uint32)(nil).SizeBytes() + (len(*f) * (*FDID)(nil).SizeBytes()) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FdArray) MarshalBytes(dst []byte) { + arrLen := primitive.Uint32(len(*f)) + arrLen.MarshalUnsafe(dst) + dst = dst[arrLen.SizeBytes():] + MarshalUnsafeFDIDSlice(*f, dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FdArray) UnmarshalBytes(src []byte) { + var arrLen primitive.Uint32 + arrLen.UnmarshalUnsafe(src) + src = src[arrLen.SizeBytes():] + if cap(*f) < int(arrLen) { + *f = make(FdArray, arrLen) + } else { + *f = (*f)[:arrLen] + } + UnmarshalUnsafeFDIDSlice(*f, src) +} + +// CloseReq is used to close(2) FDs. +type CloseReq struct { + FDs FdArray +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (c *CloseReq) SizeBytes() int { + return c.FDs.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (c *CloseReq) MarshalBytes(dst []byte) { + c.FDs.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
+func (c *CloseReq) UnmarshalBytes(src []byte) { + c.FDs.UnmarshalBytes(src) +} + +// FsyncReq is used to fsync(2) FDs. +type FsyncReq struct { + FDs FdArray +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FsyncReq) SizeBytes() int { + return f.FDs.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FsyncReq) MarshalBytes(dst []byte) { + f.FDs.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FsyncReq) UnmarshalBytes(src []byte) { + f.FDs.UnmarshalBytes(src) +} + +// PReadReq is used to pread(2) on an FD. +// +// +marshal +type PReadReq struct { + Offset uint64 + FD FDID + Count uint32 +} + +// PReadResp is used to return the result of pread(2). +type PReadResp struct { + NumBytes primitive.Uint32 + Buf []byte +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (r *PReadResp) SizeBytes() int { + return r.NumBytes.SizeBytes() + int(r.NumBytes) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (r *PReadResp) MarshalBytes(dst []byte) { + r.NumBytes.MarshalUnsafe(dst) + dst = dst[r.NumBytes.SizeBytes():] + copy(dst[:r.NumBytes], r.Buf[:r.NumBytes]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (r *PReadResp) UnmarshalBytes(src []byte) { + r.NumBytes.UnmarshalUnsafe(src) + src = src[r.NumBytes.SizeBytes():] + + // We expect the client to have already allocated r.Buf. r.Buf probably + // (optimally) points to usermem. Directly copy into that. + copy(r.Buf[:r.NumBytes], src[:r.NumBytes]) +} + +// PWriteReq is used to pwrite(2) on an FD. +type PWriteReq struct { + Offset primitive.Uint64 + FD FDID + NumBytes primitive.Uint32 + Buf []byte +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (w *PWriteReq) SizeBytes() int { + return w.Offset.SizeBytes() + w.FD.SizeBytes() + w.NumBytes.SizeBytes() + int(w.NumBytes) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (w *PWriteReq) MarshalBytes(dst []byte) { + w.Offset.MarshalUnsafe(dst) + dst = dst[w.Offset.SizeBytes():] + w.FD.MarshalUnsafe(dst) + dst = dst[w.FD.SizeBytes():] + w.NumBytes.MarshalUnsafe(dst) + dst = dst[w.NumBytes.SizeBytes():] + copy(dst[:w.NumBytes], w.Buf[:w.NumBytes]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (w *PWriteReq) UnmarshalBytes(src []byte) { + w.Offset.UnmarshalUnsafe(src) + src = src[w.Offset.SizeBytes():] + w.FD.UnmarshalUnsafe(src) + src = src[w.FD.SizeBytes():] + w.NumBytes.UnmarshalUnsafe(src) + src = src[w.NumBytes.SizeBytes():] + + // This is an optimization. Assuming that the server is making this call, it + // is safe to just point to src rather than allocating and copying. + w.Buf = src[:w.NumBytes] +} + +// PWriteResp is used to return the result of pwrite(2). +// +// +marshal +type PWriteResp struct { + Count uint64 +} + +// MkdirAtReq is used to make MkdirAt requests. +type MkdirAtReq struct { + createCommon + Name SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (m *MkdirAtReq) SizeBytes() int { + return m.createCommon.SizeBytes() + m.Name.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (m *MkdirAtReq) MarshalBytes(dst []byte) { + m.createCommon.MarshalUnsafe(dst) + dst = dst[m.createCommon.SizeBytes():] + m.Name.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
+func (m *MkdirAtReq) UnmarshalBytes(src []byte) { + m.createCommon.UnmarshalUnsafe(src) + src = src[m.createCommon.SizeBytes():] + m.Name.UnmarshalBytes(src) +} + +// MkdirAtResp is the response to a successful MkdirAt request. +// +// +marshal +type MkdirAtResp struct { + ChildDir Inode +} + +// MknodAtReq is used to make MknodAt requests. +type MknodAtReq struct { + createCommon + Name SizedString + Minor primitive.Uint32 + Major primitive.Uint32 +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (m *MknodAtReq) SizeBytes() int { + return m.createCommon.SizeBytes() + m.Name.SizeBytes() + m.Minor.SizeBytes() + m.Major.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (m *MknodAtReq) MarshalBytes(dst []byte) { + m.createCommon.MarshalUnsafe(dst) + dst = dst[m.createCommon.SizeBytes():] + m.Name.MarshalBytes(dst) + dst = dst[m.Name.SizeBytes():] + m.Minor.MarshalUnsafe(dst) + dst = dst[m.Minor.SizeBytes():] + m.Major.MarshalUnsafe(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (m *MknodAtReq) UnmarshalBytes(src []byte) { + m.createCommon.UnmarshalUnsafe(src) + src = src[m.createCommon.SizeBytes():] + m.Name.UnmarshalBytes(src) + src = src[m.Name.SizeBytes():] + m.Minor.UnmarshalUnsafe(src) + src = src[m.Minor.SizeBytes():] + m.Major.UnmarshalUnsafe(src) +} + +// MknodAtResp is the response to a successful MknodAt request. +// +// +marshal +type MknodAtResp struct { + Child Inode +} + +// SymlinkAtReq is used to make SymlinkAt request. +type SymlinkAtReq struct { + DirFD FDID + Name SizedString + Target SizedString + UID UID + GID GID +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (s *SymlinkAtReq) SizeBytes() int { + return s.DirFD.SizeBytes() + s.Name.SizeBytes() + s.Target.SizeBytes() + s.UID.SizeBytes() + s.GID.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (s *SymlinkAtReq) MarshalBytes(dst []byte) { + s.DirFD.MarshalUnsafe(dst) + dst = dst[s.DirFD.SizeBytes():] + s.Name.MarshalBytes(dst) + dst = dst[s.Name.SizeBytes():] + s.Target.MarshalBytes(dst) + dst = dst[s.Target.SizeBytes():] + s.UID.MarshalUnsafe(dst) + dst = dst[s.UID.SizeBytes():] + s.GID.MarshalUnsafe(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (s *SymlinkAtReq) UnmarshalBytes(src []byte) { + s.DirFD.UnmarshalUnsafe(src) + src = src[s.DirFD.SizeBytes():] + s.Name.UnmarshalBytes(src) + src = src[s.Name.SizeBytes():] + s.Target.UnmarshalBytes(src) + src = src[s.Target.SizeBytes():] + s.UID.UnmarshalUnsafe(src) + src = src[s.UID.SizeBytes():] + s.GID.UnmarshalUnsafe(src) +} + +// SymlinkAtResp is the response to a successful SymlinkAt request. +// +// +marshal +type SymlinkAtResp struct { + Symlink Inode +} + +// LinkAtReq is used to make LinkAt requests. +type LinkAtReq struct { + DirFD FDID + Target FDID + Name SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (l *LinkAtReq) SizeBytes() int { + return l.DirFD.SizeBytes() + l.Target.SizeBytes() + l.Name.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (l *LinkAtReq) MarshalBytes(dst []byte) { + l.DirFD.MarshalUnsafe(dst) + dst = dst[l.DirFD.SizeBytes():] + l.Target.MarshalUnsafe(dst) + dst = dst[l.Target.SizeBytes():] + l.Name.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
+func (l *LinkAtReq) UnmarshalBytes(src []byte) { + l.DirFD.UnmarshalUnsafe(src) + src = src[l.DirFD.SizeBytes():] + l.Target.UnmarshalUnsafe(src) + src = src[l.Target.SizeBytes():] + l.Name.UnmarshalBytes(src) +} + +// LinkAtResp is used to respond to a successful LinkAt request. +// +// +marshal +type LinkAtResp struct { + Link Inode +} + +// FStatFSReq is used to request StatFS results for the specified FD. +// +// +marshal +type FStatFSReq struct { + FD FDID +} + +// StatFS is responded to a successful FStatFS request. +// +// +marshal +type StatFS struct { + Type uint64 + BlockSize int64 + Blocks uint64 + BlocksFree uint64 + BlocksAvailable uint64 + Files uint64 + FilesFree uint64 + NameLength uint64 +} + +// FAllocateReq is used to request to fallocate(2) an FD. This has no response. +// +// +marshal +type FAllocateReq struct { + FD FDID + _ uint32 + Mode uint64 + Offset uint64 + Length uint64 +} + +// ReadLinkAtReq is used to readlinkat(2) at the specified FD. +// +// +marshal +type ReadLinkAtReq struct { + FD FDID +} + +// ReadLinkAtResp is used to communicate ReadLinkAt results. +type ReadLinkAtResp struct { + Target SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (r *ReadLinkAtResp) SizeBytes() int { + return r.Target.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (r *ReadLinkAtResp) MarshalBytes(dst []byte) { + r.Target.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (r *ReadLinkAtResp) UnmarshalBytes(src []byte) { + r.Target.UnmarshalBytes(src) +} + +// FlushReq is used to make Flush requests. +// +// +marshal +type FlushReq struct { + FD FDID +} + +// ConnectReq is used to make a Connect request. +// +// +marshal +type ConnectReq struct { + FD FDID + // SockType is used to specify the socket type to connect to. As a special + // case, SockType = 0 means that the socket type does not matter and the + // requester will accept any socket type. + SockType uint32 +} + +// UnlinkAtReq is used to make UnlinkAt request. +type UnlinkAtReq struct { + DirFD FDID + Name SizedString + Flags primitive.Uint32 +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (u *UnlinkAtReq) SizeBytes() int { + return u.DirFD.SizeBytes() + u.Name.SizeBytes() + u.Flags.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (u *UnlinkAtReq) MarshalBytes(dst []byte) { + u.DirFD.MarshalUnsafe(dst) + dst = dst[u.DirFD.SizeBytes():] + u.Name.MarshalBytes(dst) + dst = dst[u.Name.SizeBytes():] + u.Flags.MarshalUnsafe(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (u *UnlinkAtReq) UnmarshalBytes(src []byte) { + u.DirFD.UnmarshalUnsafe(src) + src = src[u.DirFD.SizeBytes():] + u.Name.UnmarshalBytes(src) + src = src[u.Name.SizeBytes():] + u.Flags.UnmarshalUnsafe(src) +} + +// RenameAtReq is used to make Rename requests. Note that the request takes in +// the to-be-renamed file's FD instead of oldDir and oldName like renameat(2). +type RenameAtReq struct { + Renamed FDID + NewDir FDID + NewName SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (r *RenameAtReq) SizeBytes() int { + return r.Renamed.SizeBytes() + r.NewDir.SizeBytes() + r.NewName.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. 
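Editor's note: FAllocateReq mirrors the arguments of fallocate(2), with an explicit padding field that presumably keeps the three 64-bit fields 8-byte aligned for the generated marshalling code. A server-side handler can map it almost directly onto the host syscall. The snippet below is only a sketch of that mapping and assumes the FDID has already been resolved to an open host descriptor; it is not the fsgofer handler added by this change.

package fallocatesketch

import (
	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/lisafs"
)

// handleFAllocate is a hypothetical handler sketch, not the fsgofer code from
// this commit. It assumes req.FD has already been resolved to hostFD, an open
// descriptor on the backing file, and forwards the arguments to fallocate(2).
func handleFAllocate(req *lisafs.FAllocateReq, hostFD int) error {
	return unix.Fallocate(hostFD, uint32(req.Mode), int64(req.Offset), int64(req.Length))
}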
+func (r *RenameAtReq) MarshalBytes(dst []byte) { + r.Renamed.MarshalUnsafe(dst) + dst = dst[r.Renamed.SizeBytes():] + r.NewDir.MarshalUnsafe(dst) + dst = dst[r.NewDir.SizeBytes():] + r.NewName.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (r *RenameAtReq) UnmarshalBytes(src []byte) { + r.Renamed.UnmarshalUnsafe(src) + src = src[r.Renamed.SizeBytes():] + r.NewDir.UnmarshalUnsafe(src) + src = src[r.NewDir.SizeBytes():] + r.NewName.UnmarshalBytes(src) +} + +// Getdents64Req is used to make Getdents64 requests. +// +// +marshal +type Getdents64Req struct { + DirFD FDID + // Count is the number of bytes to read. A negative value of Count is used to + // indicate that the implementation must lseek(0, SEEK_SET) before calling + // getdents64(2). Implementations must use the absolute value of Count to + // determine the number of bytes to read. + Count int32 +} + +// Dirent64 is analogous to struct linux_dirent64. +type Dirent64 struct { + Ino primitive.Uint64 + DevMinor primitive.Uint32 + DevMajor primitive.Uint32 + Off primitive.Uint64 + Type primitive.Uint8 + Name SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (d *Dirent64) SizeBytes() int { + return d.Ino.SizeBytes() + d.DevMinor.SizeBytes() + d.DevMajor.SizeBytes() + d.Off.SizeBytes() + d.Type.SizeBytes() + d.Name.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (d *Dirent64) MarshalBytes(dst []byte) { + d.Ino.MarshalUnsafe(dst) + dst = dst[d.Ino.SizeBytes():] + d.DevMinor.MarshalUnsafe(dst) + dst = dst[d.DevMinor.SizeBytes():] + d.DevMajor.MarshalUnsafe(dst) + dst = dst[d.DevMajor.SizeBytes():] + d.Off.MarshalUnsafe(dst) + dst = dst[d.Off.SizeBytes():] + d.Type.MarshalUnsafe(dst) + dst = dst[d.Type.SizeBytes():] + d.Name.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (d *Dirent64) UnmarshalBytes(src []byte) { + d.Ino.UnmarshalUnsafe(src) + src = src[d.Ino.SizeBytes():] + d.DevMinor.UnmarshalUnsafe(src) + src = src[d.DevMinor.SizeBytes():] + d.DevMajor.UnmarshalUnsafe(src) + src = src[d.DevMajor.SizeBytes():] + d.Off.UnmarshalUnsafe(src) + src = src[d.Off.SizeBytes():] + d.Type.UnmarshalUnsafe(src) + src = src[d.Type.SizeBytes():] + d.Name.UnmarshalBytes(src) +} + +// Getdents64Resp is used to communicate getdents64 results. +type Getdents64Resp struct { + Dirents []Dirent64 +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (g *Getdents64Resp) SizeBytes() int { + ret := (*primitive.Uint32)(nil).SizeBytes() + for i := range g.Dirents { + ret += g.Dirents[i].SizeBytes() + } + return ret +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (g *Getdents64Resp) MarshalBytes(dst []byte) { + numDirents := primitive.Uint32(len(g.Dirents)) + numDirents.MarshalUnsafe(dst) + dst = dst[numDirents.SizeBytes():] + for i := range g.Dirents { + g.Dirents[i].MarshalBytes(dst) + dst = dst[g.Dirents[i].SizeBytes():] + } +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (g *Getdents64Resp) UnmarshalBytes(src []byte) { + var numDirents primitive.Uint32 + numDirents.UnmarshalUnsafe(src) + if cap(g.Dirents) < int(numDirents) { + g.Dirents = make([]Dirent64, numDirents) + } else { + g.Dirents = g.Dirents[:numDirents] + } + + src = src[numDirents.SizeBytes():] + for i := range g.Dirents { + g.Dirents[i].UnmarshalBytes(src) + src = src[g.Dirents[i].SizeBytes():] + } +} + +// FGetXattrReq is used to make FGetXattr requests. 
The response to this is +// just a SizedString containing the xattr value. +type FGetXattrReq struct { + FD FDID + BufSize primitive.Uint32 + Name SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (g *FGetXattrReq) SizeBytes() int { + return g.FD.SizeBytes() + g.BufSize.SizeBytes() + g.Name.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (g *FGetXattrReq) MarshalBytes(dst []byte) { + g.FD.MarshalUnsafe(dst) + dst = dst[g.FD.SizeBytes():] + g.BufSize.MarshalUnsafe(dst) + dst = dst[g.BufSize.SizeBytes():] + g.Name.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (g *FGetXattrReq) UnmarshalBytes(src []byte) { + g.FD.UnmarshalUnsafe(src) + src = src[g.FD.SizeBytes():] + g.BufSize.UnmarshalUnsafe(src) + src = src[g.BufSize.SizeBytes():] + g.Name.UnmarshalBytes(src) +} + +// FGetXattrResp is used to respond to FGetXattr request. +type FGetXattrResp struct { + Value SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (g *FGetXattrResp) SizeBytes() int { + return g.Value.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (g *FGetXattrResp) MarshalBytes(dst []byte) { + g.Value.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (g *FGetXattrResp) UnmarshalBytes(src []byte) { + g.Value.UnmarshalBytes(src) +} + +// FSetXattrReq is used to make FSetXattr requests. It has no response. +type FSetXattrReq struct { + FD FDID + Flags primitive.Uint32 + Name SizedString + Value SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (s *FSetXattrReq) SizeBytes() int { + return s.FD.SizeBytes() + s.Flags.SizeBytes() + s.Name.SizeBytes() + s.Value.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (s *FSetXattrReq) MarshalBytes(dst []byte) { + s.FD.MarshalUnsafe(dst) + dst = dst[s.FD.SizeBytes():] + s.Flags.MarshalUnsafe(dst) + dst = dst[s.Flags.SizeBytes():] + s.Name.MarshalBytes(dst) + dst = dst[s.Name.SizeBytes():] + s.Value.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (s *FSetXattrReq) UnmarshalBytes(src []byte) { + s.FD.UnmarshalUnsafe(src) + src = src[s.FD.SizeBytes():] + s.Flags.UnmarshalUnsafe(src) + src = src[s.Flags.SizeBytes():] + s.Name.UnmarshalBytes(src) + src = src[s.Name.SizeBytes():] + s.Value.UnmarshalBytes(src) +} + +// FRemoveXattrReq is used to make FRemoveXattr requests. It has no response. +type FRemoveXattrReq struct { + FD FDID + Name SizedString +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (r *FRemoveXattrReq) SizeBytes() int { + return r.FD.SizeBytes() + r.Name.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (r *FRemoveXattrReq) MarshalBytes(dst []byte) { + r.FD.MarshalUnsafe(dst) + dst = dst[r.FD.SizeBytes():] + r.Name.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (r *FRemoveXattrReq) UnmarshalBytes(src []byte) { + r.FD.UnmarshalUnsafe(src) + src = src[r.FD.SizeBytes():] + r.Name.UnmarshalBytes(src) +} + +// FListXattrReq is used to make FListXattr requests. +// +// +marshal +type FListXattrReq struct { + FD FDID + _ uint32 + Size uint64 +} + +// FListXattrResp is used to respond to FListXattr requests. +type FListXattrResp struct { + Xattrs StringArray +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. 
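Editor's note: in FGetXattrReq, BufSize tells the server the largest value buffer the client will accept, and the reply carries the value back as a SizedString. The helper below is a hedged sketch of the server-side work, assuming the request's FDID resolves to a host FD; it is not the fsgofer implementation from this commit.

package xattrsketch

import "golang.org/x/sys/unix"

// getXattrValue is a hypothetical helper sketching the server-side work for an
// FGetXattr request: read at most bufSize bytes of the named attribute from
// the host FD that the request's FDID resolves to. The caller would send the
// returned bytes back to the client as the SizedString value.
func getXattrValue(hostFD int, name string, bufSize uint32) ([]byte, error) {
	buf := make([]byte, bufSize)
	n, err := unix.Fgetxattr(hostFD, name, buf)
	if err != nil {
		return nil, err
	}
	return buf[:n], nil
}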
+func (l *FListXattrResp) SizeBytes() int { + return l.Xattrs.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (l *FListXattrResp) MarshalBytes(dst []byte) { + l.Xattrs.MarshalBytes(dst) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (l *FListXattrResp) UnmarshalBytes(src []byte) { + l.Xattrs.UnmarshalBytes(src) +} diff --git a/pkg/lisafs/testsuite/BUILD b/pkg/lisafs/testsuite/BUILD new file mode 100644 index 000000000..b4a542b3a --- /dev/null +++ b/pkg/lisafs/testsuite/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) + +go_library( + name = "testsuite", + testonly = True, + srcs = ["testsuite.go"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/lisafs", + "//pkg/unet", + "@com_github_syndtr_gocapability//capability:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/lisafs/testsuite/testsuite.go b/pkg/lisafs/testsuite/testsuite.go new file mode 100644 index 000000000..476ff76a5 --- /dev/null +++ b/pkg/lisafs/testsuite/testsuite.go @@ -0,0 +1,637 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testsuite provides a integration testing suite for lisafs. +// These tests are intended for servers serving the local filesystem. +package testsuite + +import ( + "bytes" + "fmt" + "io/ioutil" + "math/rand" + "os" + "testing" + "time" + + "github.com/syndtr/gocapability/capability" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/lisafs" + "gvisor.dev/gvisor/pkg/unet" +) + +// Tester is the client code using this test suite. This interface abstracts +// away all the caller specific details. +type Tester interface { + // NewServer returns a new instance of the tester server. + NewServer(t *testing.T) *lisafs.Server + + // LinkSupported returns true if the backing server supports LinkAt. + LinkSupported() bool + + // SetUserGroupIDSupported returns true if the backing server supports + // changing UID/GID for files. + SetUserGroupIDSupported() bool +} + +// RunAllLocalFSTests runs all local FS tests as subtests. 
+func RunAllLocalFSTests(t *testing.T, tester Tester) { + for name, testFn := range localFSTests { + t.Run(name, func(t *testing.T) { + runServerClient(t, tester, testFn) + }) + } +} + +type testFunc func(context.Context, *testing.T, Tester, lisafs.ClientFD) + +var localFSTests map[string]testFunc = map[string]testFunc{ + "Stat": testStat, + "RegularFileIO": testRegularFileIO, + "RegularFileOpen": testRegularFileOpen, + "SetStat": testSetStat, + "Allocate": testAllocate, + "StatFS": testStatFS, + "Unlink": testUnlink, + "Symlink": testSymlink, + "HardLink": testHardLink, + "Walk": testWalk, + "Rename": testRename, + "Mknod": testMknod, + "Getdents": testGetdents, +} + +func runServerClient(t *testing.T, tester Tester, testFn testFunc) { + mountPath, err := ioutil.TempDir(os.Getenv("TEST_TMPDIR"), "") + if err != nil { + t.Fatalf("creation of temporary mountpoint failed: %v", err) + } + defer os.RemoveAll(mountPath) + + // fsgofer should run with a umask of 0, because we want to preserve file + // modes exactly for testing purposes. + unix.Umask(0) + + serverSocket, clientSocket, err := unet.SocketPair(false) + if err != nil { + t.Fatalf("socketpair got err %v expected nil", err) + } + + server := tester.NewServer(t) + conn, err := server.CreateConnection(serverSocket, false /* readonly */) + if err != nil { + t.Fatalf("starting connection failed: %v", err) + return + } + server.StartConnection(conn) + + c, root, err := lisafs.NewClient(clientSocket, mountPath) + if err != nil { + t.Fatalf("client creation failed: %v", err) + } + + if !root.ControlFD.Ok() { + t.Fatalf("root control FD is not valid") + } + rootFile := c.NewFD(root.ControlFD) + + ctx := context.Background() + testFn(ctx, t, tester, rootFile) + closeFD(ctx, t, rootFile) + + c.Close() // This should trigger client and server shutdown. 
+ server.Wait() +} + +func closeFD(ctx context.Context, t testing.TB, fdLisa lisafs.ClientFD) { + if err := fdLisa.Close(ctx); err != nil { + t.Errorf("failed to close FD: %v", err) + } +} + +func statTo(ctx context.Context, t *testing.T, fdLisa lisafs.ClientFD, stat *linux.Statx) { + if err := fdLisa.StatTo(ctx, stat); err != nil { + t.Fatalf("stat failed: %v", err) + } +} + +func openCreateFile(ctx context.Context, t *testing.T, fdLisa lisafs.ClientFD, name string) (lisafs.ClientFD, linux.Statx, lisafs.ClientFD, int) { + child, childFD, childHostFD, err := fdLisa.OpenCreateAt(ctx, name, unix.O_RDWR, 0777, lisafs.UID(unix.Getuid()), lisafs.GID(unix.Getgid())) + if err != nil { + t.Fatalf("OpenCreateAt failed: %v", err) + } + if childHostFD == -1 { + t.Error("no host FD donated") + } + client := fdLisa.Client() + return client.NewFD(child.ControlFD), child.Stat, fdLisa.Client().NewFD(childFD), childHostFD +} + +func openFile(ctx context.Context, t *testing.T, fdLisa lisafs.ClientFD, flags uint32, isReg bool) (lisafs.ClientFD, int) { + newFD, hostFD, err := fdLisa.OpenAt(ctx, flags) + if err != nil { + t.Fatalf("OpenAt failed: %v", err) + } + if hostFD == -1 && isReg { + t.Error("no host FD donated") + } + return fdLisa.Client().NewFD(newFD), hostFD +} + +func unlinkFile(ctx context.Context, t *testing.T, dir lisafs.ClientFD, name string, isDir bool) { + var flags uint32 + if isDir { + flags = unix.AT_REMOVEDIR + } + if err := dir.UnlinkAt(ctx, name, flags); err != nil { + t.Errorf("unlinking file %s failed: %v", name, err) + } +} + +func symlink(ctx context.Context, t *testing.T, dir lisafs.ClientFD, name, target string) (lisafs.ClientFD, linux.Statx) { + linkIno, err := dir.SymlinkAt(ctx, name, target, lisafs.UID(unix.Getuid()), lisafs.GID(unix.Getgid())) + if err != nil { + t.Fatalf("symlink failed: %v", err) + } + return dir.Client().NewFD(linkIno.ControlFD), linkIno.Stat +} + +func link(ctx context.Context, t *testing.T, dir lisafs.ClientFD, name string, target lisafs.ClientFD) (lisafs.ClientFD, linux.Statx) { + linkIno, err := dir.LinkAt(ctx, target.ID(), name) + if err != nil { + t.Fatalf("link failed: %v", err) + } + return dir.Client().NewFD(linkIno.ControlFD), linkIno.Stat +} + +func mkdir(ctx context.Context, t *testing.T, dir lisafs.ClientFD, name string) (lisafs.ClientFD, linux.Statx) { + childIno, err := dir.MkdirAt(ctx, name, 0777, lisafs.UID(unix.Getuid()), lisafs.GID(unix.Getgid())) + if err != nil { + t.Fatalf("mkdir failed: %v", err) + } + return dir.Client().NewFD(childIno.ControlFD), childIno.Stat +} + +func mknod(ctx context.Context, t *testing.T, dir lisafs.ClientFD, name string) (lisafs.ClientFD, linux.Statx) { + nodeIno, err := dir.MknodAt(ctx, name, unix.S_IFREG|0777, lisafs.UID(unix.Getuid()), lisafs.GID(unix.Getgid()), 0, 0) + if err != nil { + t.Fatalf("mknod failed: %v", err) + } + return dir.Client().NewFD(nodeIno.ControlFD), nodeIno.Stat +} + +func walk(ctx context.Context, t *testing.T, dir lisafs.ClientFD, names []string) []lisafs.Inode { + _, inodes, err := dir.WalkMultiple(ctx, names) + if err != nil { + t.Fatalf("walk failed while trying to walk components %+v: %v", names, err) + } + return inodes +} + +func walkStat(ctx context.Context, t *testing.T, dir lisafs.ClientFD, names []string) []linux.Statx { + stats, err := dir.WalkStat(ctx, names) + if err != nil { + t.Fatalf("walk failed while trying to walk components %+v: %v", names, err) + } + return stats +} + +func writeFD(ctx context.Context, t *testing.T, fdLisa lisafs.ClientFD, off uint64, buf 
[]byte) error { + count, err := fdLisa.Write(ctx, buf, off) + if err != nil { + return err + } + if int(count) != len(buf) { + t.Errorf("partial write: buf size = %d, written = %d", len(buf), count) + } + return nil +} + +func readFDAndCmp(ctx context.Context, t *testing.T, fdLisa lisafs.ClientFD, off uint64, want []byte) { + buf := make([]byte, len(want)) + n, err := fdLisa.Read(ctx, buf, off) + if err != nil { + t.Errorf("read failed: %v", err) + return + } + if int(n) != len(want) { + t.Errorf("partial read: buf size = %d, read = %d", len(want), n) + return + } + if bytes.Compare(buf, want) != 0 { + t.Errorf("bytes read differ from what was expected: want = %v, got = %v", want, buf) + } +} + +func allocateAndVerify(ctx context.Context, t *testing.T, fdLisa lisafs.ClientFD, off uint64, length uint64) { + if err := fdLisa.Allocate(ctx, 0, off, length); err != nil { + t.Fatalf("fallocate failed: %v", err) + } + + var stat linux.Statx + statTo(ctx, t, fdLisa, &stat) + if want := off + length; stat.Size != want { + t.Errorf("incorrect file size after allocate: expected %d, got %d", off+length, stat.Size) + } +} + +func cmpStatx(t *testing.T, want, got linux.Statx) { + if got.Mask&unix.STATX_MODE != 0 && want.Mask&unix.STATX_MODE != 0 { + if got.Mode != want.Mode { + t.Errorf("mode differs: want %d, got %d", want.Mode, got.Mode) + } + } + if got.Mask&unix.STATX_INO != 0 && want.Mask&unix.STATX_INO != 0 { + if got.Ino != want.Ino { + t.Errorf("inode number differs: want %d, got %d", want.Ino, got.Ino) + } + } + if got.Mask&unix.STATX_NLINK != 0 && want.Mask&unix.STATX_NLINK != 0 { + if got.Nlink != want.Nlink { + t.Errorf("nlink differs: want %d, got %d", want.Nlink, got.Nlink) + } + } + if got.Mask&unix.STATX_UID != 0 && want.Mask&unix.STATX_UID != 0 { + if got.UID != want.UID { + t.Errorf("UID differs: want %d, got %d", want.UID, got.UID) + } + } + if got.Mask&unix.STATX_GID != 0 && want.Mask&unix.STATX_GID != 0 { + if got.GID != want.GID { + t.Errorf("GID differs: want %d, got %d", want.GID, got.GID) + } + } + if got.Mask&unix.STATX_SIZE != 0 && want.Mask&unix.STATX_SIZE != 0 { + if got.Size != want.Size { + t.Errorf("size differs: want %d, got %d", want.Size, got.Size) + } + } + if got.Mask&unix.STATX_BLOCKS != 0 && want.Mask&unix.STATX_BLOCKS != 0 { + if got.Blocks != want.Blocks { + t.Errorf("blocks differs: want %d, got %d", want.Blocks, got.Blocks) + } + } + if got.Mask&unix.STATX_ATIME != 0 && want.Mask&unix.STATX_ATIME != 0 { + if got.Atime != want.Atime { + t.Errorf("atime differs: want %d, got %d", want.Atime, got.Atime) + } + } + if got.Mask&unix.STATX_MTIME != 0 && want.Mask&unix.STATX_MTIME != 0 { + if got.Mtime != want.Mtime { + t.Errorf("mtime differs: want %d, got %d", want.Mtime, got.Mtime) + } + } + if got.Mask&unix.STATX_CTIME != 0 && want.Mask&unix.STATX_CTIME != 0 { + if got.Ctime != want.Ctime { + t.Errorf("ctime differs: want %d, got %d", want.Ctime, got.Ctime) + } + } +} + +func hasCapability(c capability.Cap) bool { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + return false + } + if err := caps.Load(); err != nil { + return false + } + return caps.Get(capability.EFFECTIVE, c) +} + +func testStat(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + var rootStat linux.Statx + if err := root.StatTo(ctx, &rootStat); err != nil { + t.Errorf("stat on the root dir failed: %v", err) + } + + if ftype := rootStat.Mode & unix.S_IFMT; ftype != unix.S_IFDIR { + t.Errorf("root inode is not a directory, file type = %d", ftype) + } +} + 
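Editor's note: because the suite is driven entirely through the Tester interface, a server package only needs to implement those three methods and call RunAllLocalFSTests from an ordinary Go test. The wiring below is a hedged sketch; fsgoferTester and newLocalServer are hypothetical placeholders for however the backing server package constructs its lisafs.Server.

package fsgofer_test

import (
	"testing"

	"gvisor.dev/gvisor/pkg/lisafs"
	"gvisor.dev/gvisor/pkg/lisafs/testsuite"
)

// fsgoferTester is a hypothetical Tester implementation.
type fsgoferTester struct{}

// NewServer returns the server under test. newLocalServer is a hypothetical
// placeholder for however the backing package builds its lisafs.Server.
func (fsgoferTester) NewServer(t *testing.T) *lisafs.Server { return newLocalServer(t) }

// LinkSupported reports that the backing server implements the LinkAt RPC.
func (fsgoferTester) LinkSupported() bool { return true }

// SetUserGroupIDSupported reports that the server can change file ownership.
func (fsgoferTester) SetUserGroupIDSupported() bool { return true }

func newLocalServer(t *testing.T) *lisafs.Server {
	t.Helper()
	t.Skip("placeholder: construct the package's lisafs.Server here")
	return nil
}

func TestLocalFS(t *testing.T) {
	testsuite.RunAllLocalFSTests(t, fsgoferTester{})
}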
+func testRegularFileIO(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "tempFile" + controlFile, _, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, controlFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + // Test Read/Write RPCs. + data := make([]byte, 100) + rand.Read(data) + if err := writeFD(ctx, t, fd, 0, data); err != nil { + t.Fatalf("write failed: %v", err) + } + readFDAndCmp(ctx, t, fd, 0, data) + readFDAndCmp(ctx, t, fd, 50, data[50:]) + + // Make sure the host FD is configured properly. + hostReadData := make([]byte, len(data)) + if n, err := unix.Pread(hostFD, hostReadData, 0); err != nil { + t.Errorf("host read failed: %v", err) + } else if n != len(hostReadData) { + t.Errorf("partial read: buf size = %d, read = %d", len(hostReadData), n) + } else if bytes.Compare(hostReadData, data) != 0 { + t.Errorf("bytes read differ from what was expected: want = %v, got = %v", data, hostReadData) + } + + // Test syncing the writable FD. + if err := fd.Sync(ctx); err != nil { + t.Errorf("syncing the FD failed: %v", err) + } +} + +func testRegularFileOpen(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "tempFile" + controlFile, _, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, controlFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + // Open a readonly FD and try writing to it to get an EBADF. + roFile, roHostFD := openFile(ctx, t, controlFile, unix.O_RDONLY, true /* isReg */) + defer closeFD(ctx, t, roFile) + defer unix.Close(roHostFD) + if err := writeFD(ctx, t, roFile, 0, []byte{1, 2, 3}); err != unix.EBADF { + t.Errorf("writing to read only FD should generate EBADF, but got %v", err) + } +} + +func testSetStat(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "tempFile" + controlFile, _, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, controlFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + now := time.Now() + wantStat := linux.Statx{ + Mask: unix.STATX_MODE | unix.STATX_ATIME | unix.STATX_MTIME | unix.STATX_SIZE, + Mode: 0760, + UID: uint32(unix.Getuid()), + GID: uint32(unix.Getgid()), + Size: 50, + Atime: linux.NsecToStatxTimestamp(now.UnixNano()), + Mtime: linux.NsecToStatxTimestamp(now.UnixNano()), + } + if tester.SetUserGroupIDSupported() { + wantStat.Mask |= unix.STATX_UID | unix.STATX_GID + } + failureMask, failureErr, err := controlFile.SetStat(ctx, &wantStat) + if err != nil { + t.Fatalf("setstat failed: %v", err) + } + if failureMask != 0 { + t.Fatalf("some setstat operations failed: failureMask = %#b, failureErr = %v", failureMask, failureErr) + } + + // Verify that attributes were updated. 
+ var gotStat linux.Statx + statTo(ctx, t, controlFile, &gotStat) + if gotStat.Mode&07777 != wantStat.Mode || + gotStat.Size != wantStat.Size || + gotStat.Atime.ToNsec() != wantStat.Atime.ToNsec() || + gotStat.Mtime.ToNsec() != wantStat.Mtime.ToNsec() || + (tester.SetUserGroupIDSupported() && (uint32(gotStat.UID) != wantStat.UID || uint32(gotStat.GID) != wantStat.GID)) { + t.Errorf("setStat did not update file correctly: setStat = %+v, stat = %+v", wantStat, gotStat) + } +} + +func testAllocate(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "tempFile" + controlFile, _, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, controlFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + allocateAndVerify(ctx, t, fd, 0, 40) + allocateAndVerify(ctx, t, fd, 20, 100) +} + +func testStatFS(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + var statFS lisafs.StatFS + if err := root.StatFSTo(ctx, &statFS); err != nil { + t.Errorf("statfs failed: %v", err) + } +} + +func testUnlink(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "tempFile" + controlFile, _, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, controlFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + unlinkFile(ctx, t, root, name, false /* isDir */) + if inodes := walk(ctx, t, root, []string{name}); len(inodes) > 0 { + t.Errorf("deleted file should not be generating inodes on walk: %+v", inodes) + } +} + +func testSymlink(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + target := "/tmp/some/path" + name := "symlinkFile" + link, linkStat := symlink(ctx, t, root, name, target) + defer closeFD(ctx, t, link) + + if linkStat.Mode&unix.S_IFMT != unix.S_IFLNK { + t.Errorf("stat return from symlink RPC indicates that the inode is not a symlink: mode = %d", linkStat.Mode) + } + + if gotTarget, err := link.ReadLinkAt(ctx); err != nil { + t.Fatalf("readlink failed: %v", err) + } else if gotTarget != target { + t.Errorf("readlink return incorrect target: expected %q, got %q", target, gotTarget) + } +} + +func testHardLink(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + if !tester.LinkSupported() { + t.Skipf("server does not support LinkAt RPC") + } + if !hasCapability(capability.CAP_DAC_READ_SEARCH) { + t.Skipf("TestHardLink requires CAP_DAC_READ_SEARCH, running as %d", unix.Getuid()) + } + name := "tempFile" + controlFile, fileIno, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, controlFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + link, linkStat := link(ctx, t, root, name, controlFile) + defer closeFD(ctx, t, link) + + if linkStat.Ino != fileIno.Ino { + t.Errorf("hard linked files have different inode numbers: %d %d", linkStat.Ino, fileIno.Ino) + } + if linkStat.DevMinor != fileIno.DevMinor { + t.Errorf("hard linked files have different minor device numbers: %d %d", linkStat.DevMinor, fileIno.DevMinor) + } + if linkStat.DevMajor != fileIno.DevMajor { + t.Errorf("hard linked files have different major device numbers: %d %d", linkStat.DevMajor, fileIno.DevMajor) + } +} + +func testWalk(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + // Create 10 nested directories. 
+ n := 10 + curDir := root + + dirNames := make([]string, 0, n) + for i := 0; i < n; i++ { + name := fmt.Sprintf("tmpdir-%d", i) + childDir, _ := mkdir(ctx, t, curDir, name) + defer closeFD(ctx, t, childDir) + defer unlinkFile(ctx, t, curDir, name, true /* isDir */) + + curDir = childDir + dirNames = append(dirNames, name) + } + + // Walk all these directories. Add some junk at the end which should not be + // walked on. + dirNames = append(dirNames, []string{"a", "b", "c"}...) + inodes := walk(ctx, t, root, dirNames) + if len(inodes) != n { + t.Errorf("walk returned the incorrect number of inodes: wanted %d, got %d", n, len(inodes)) + } + + // Close all control FDs and collect stat results for all dirs including + // the root directory. + dirStats := make([]linux.Statx, 0, n+1) + var stat linux.Statx + statTo(ctx, t, root, &stat) + dirStats = append(dirStats, stat) + for _, inode := range inodes { + dirStats = append(dirStats, inode.Stat) + closeFD(ctx, t, root.Client().NewFD(inode.ControlFD)) + } + + // Test WalkStat which additonally returns Statx for root because the first + // path component is "". + dirNames = append([]string{""}, dirNames...) + gotStats := walkStat(ctx, t, root, dirNames) + if len(gotStats) != len(dirStats) { + t.Errorf("walkStat returned the incorrect number of statx: wanted %d, got %d", len(dirStats), len(gotStats)) + } else { + for i := range gotStats { + cmpStatx(t, dirStats[i], gotStats[i]) + } + } +} + +func testRename(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "tempFile" + tempFile, _, fd, hostFD := openCreateFile(ctx, t, root, name) + defer closeFD(ctx, t, tempFile) + defer closeFD(ctx, t, fd) + defer unix.Close(hostFD) + + tempDir, _ := mkdir(ctx, t, root, "tempDir") + defer closeFD(ctx, t, tempDir) + + // Move tempFile into tempDir. + if err := tempFile.RenameTo(ctx, tempDir.ID(), "movedFile"); err != nil { + t.Fatalf("rename failed: %v", err) + } + + inodes := walkStat(ctx, t, root, []string{"tempDir", "movedFile"}) + if len(inodes) != 2 { + t.Errorf("expected 2 files on walk but only found %d", len(inodes)) + } +} + +func testMknod(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + name := "namedPipe" + pipeFile, pipeStat := mknod(ctx, t, root, name) + defer closeFD(ctx, t, pipeFile) + + var stat linux.Statx + statTo(ctx, t, pipeFile, &stat) + + if stat.Mode != pipeStat.Mode { + t.Errorf("mknod mode is incorrect: want %d, got %d", pipeStat.Mode, stat.Mode) + } + if stat.UID != pipeStat.UID { + t.Errorf("mknod UID is incorrect: want %d, got %d", pipeStat.UID, stat.UID) + } + if stat.GID != pipeStat.GID { + t.Errorf("mknod GID is incorrect: want %d, got %d", pipeStat.GID, stat.GID) + } +} + +func testGetdents(ctx context.Context, t *testing.T, tester Tester, root lisafs.ClientFD) { + tempDir, _ := mkdir(ctx, t, root, "tempDir") + defer closeFD(ctx, t, tempDir) + defer unlinkFile(ctx, t, root, "tempDir", true /* isDir */) + + // Create 10 files in tempDir. + n := 10 + fileStats := make(map[string]linux.Statx) + for i := 0; i < n; i++ { + name := fmt.Sprintf("file-%d", i) + newFile, fileStat := mknod(ctx, t, tempDir, name) + defer closeFD(ctx, t, newFile) + defer unlinkFile(ctx, t, tempDir, name, false /* isDir */) + + fileStats[name] = fileStat + } + + // Use opened directory FD for getdents. 
+ openDirFile, _ := openFile(ctx, t, tempDir, unix.O_RDONLY, false /* isReg */) + defer closeFD(ctx, t, openDirFile) + + dirents := make([]lisafs.Dirent64, 0, n) + for i := 0; i < n+2; i++ { + gotDirents, err := openDirFile.Getdents64(ctx, 40) + if err != nil { + t.Fatalf("getdents failed: %v", err) + } + if len(gotDirents) == 0 { + break + } + for _, dirent := range gotDirents { + if dirent.Name != "." && dirent.Name != ".." { + dirents = append(dirents, dirent) + } + } + } + + if len(dirents) != n { + t.Errorf("got incorrect number of dirents: wanted %d, got %d", n, len(dirents)) + } + for _, dirent := range dirents { + stat, ok := fileStats[string(dirent.Name)] + if !ok { + t.Errorf("received a dirent that was not created: %+v", dirent) + continue + } + + if dirent.Type != unix.DT_REG { + t.Errorf("dirent type of %s is incorrect: %d", dirent.Name, dirent.Type) + } + if uint64(dirent.Ino) != stat.Ino { + t.Errorf("dirent ino of %s is incorrect: want %d, got %d", dirent.Name, stat.Ino, dirent.Ino) + } + if uint32(dirent.DevMinor) != stat.DevMinor { + t.Errorf("dirent dev minor of %s is incorrect: want %d, got %d", dirent.Name, stat.DevMinor, dirent.DevMinor) + } + if uint32(dirent.DevMajor) != stat.DevMajor { + t.Errorf("dirent dev major of %s is incorrect: want %d, got %d", dirent.Name, stat.DevMajor, dirent.DevMajor) + } + } +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4244f2cf5..509dd0e1a 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -54,7 +54,10 @@ go_library( "//pkg/fdnotifier", "//pkg/fspath", "//pkg/hostarch", + "//pkg/lisafs", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/p9", "//pkg/refs", diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 5c48a9fee..d99a6112c 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -222,47 +222,88 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() - if d.readFile.isNil() { + if !d.isReadFileOk() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. d.handleMu.RUnlock() panic("gofer.dentry.getDirents called without a readable handle") } + // shouldSeek0 indicates whether the server should SEEK to 0 before reading + // directory entries. + shouldSeek0 := true for { - p9ds, err := d.readFile.readdir(ctx, off, count) - if err != nil { - d.handleMu.RUnlock() - return nil, err - } - if len(p9ds) == 0 { - d.handleMu.RUnlock() - break - } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { - continue + if d.fs.opts.lisaEnabled { + countLisa := int32(count) + if shouldSeek0 { + // See lisafs.Getdents64Req.Count. + countLisa = -countLisa + shouldSeek0 = false + } + lisafsDs, err := d.readFDLisa.Getdents64(ctx, countLisa) + if err != nil { + d.handleMu.RUnlock() + return nil, err + } + if len(lisafsDs) == 0 { + d.handleMu.RUnlock() + break + } + for i := range lisafsDs { + name := string(lisafsDs[i].Name) + if name == "." || name == ".." 
{ + continue + } + dirent := vfs.Dirent{ + Name: name, + Ino: d.fs.inoFromKey(inoKey{ + ino: uint64(lisafsDs[i].Ino), + devMinor: uint32(lisafsDs[i].DevMinor), + devMajor: uint32(lisafsDs[i].DevMajor), + }), + NextOff: int64(len(dirents) + 1), + Type: uint8(lisafsDs[i].Type), + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[name] = struct{}{} + } } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: d.fs.inoFromQIDPath(p9d.QID.Path), - NextOff: int64(len(dirents) + 1), + } else { + p9ds, err := d.readFile.readdir(ctx, off, count) + if err != nil { + d.handleMu.RUnlock() + return nil, err } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG + if len(p9ds) == 0 { + d.handleMu.RUnlock() + break } - dirents = append(dirents, dirent) - if realChildren != nil { - realChildren[p9d.Name] = struct{}{} + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: d.fs.inoFromQIDPath(p9d.QID.Path), + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[p9d.Name] = struct{}{} + } } + off = p9ds[len(p9ds)-1].Offset } - off = p9ds[len(p9ds)-1].Offset } } // Emit entries for synthetic children. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 9d943bd4a..f7b3446d3 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,10 +21,12 @@ import ( "sync" "sync/atomic" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/fsmetric" @@ -53,9 +55,47 @@ func (fs *filesystem) Sync(ctx context.Context) error { // regardless. var retErr error + if fs.opts.lisaEnabled { + // Try accumulating all FDIDs to fsync and fsync then via one RPC as + // opposed to making an RPC per FDID. Passing a non-nil accFsyncFDIDs to + // dentry.syncCachedFile() and specialFileFD.sync() will cause them to not + // make an RPC, instead accumulate syncable FDIDs in the passed slice. + accFsyncFDIDs := make([]lisafs.FDID, 0, len(ds)+len(sffds)) + + // Sync syncable dentries. + for _, d := range ds { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */, &accFsyncFDIDs); err != nil { + ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) + if retErr == nil { + retErr = err + } + } + } + + // Sync special files, which may be writable but do not use dentry shared + // handles (so they won't be synced by the above). 
+ for _, sffd := range sffds { + if err := sffd.sync(ctx, true /* forFilesystemSync */, &accFsyncFDIDs); err != nil { + ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) + if retErr == nil { + retErr = err + } + } + } + + if err := fs.clientLisa.SyncFDs(ctx, accFsyncFDIDs); err != nil { + ctx.Infof("gofer.filesystem.Sync: fs.fsyncMultipleFDLisa failed: %v", err) + if retErr == nil { + retErr = err + } + } + + return retErr + } + // Sync syncable dentries. for _, d := range ds { - if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */); err != nil { ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) if retErr == nil { retErr = err @@ -66,7 +106,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { + if err := sffd.sync(ctx, true /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */); err != nil { ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) if retErr == nil { retErr = err @@ -197,7 +237,13 @@ afterSymlink: rp.Advance() return d.parent, followedSymlink, nil } - child, err := fs.getChildLocked(ctx, d, name, ds) + var child *dentry + var err error + if fs.opts.lisaEnabled { + child, err = fs.getChildAndWalkPathLocked(ctx, d, rp, ds) + } else { + child, err = fs.getChildLocked(ctx, d, name, ds) + } if err != nil { return nil, false, err } @@ -219,6 +265,99 @@ afterSymlink: return child, followedSymlink, nil } +// Preconditions: +// * fs.opts.lisaEnabled. +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * parent and the dentry at name have been revalidated. +func (fs *filesystem) getChildAndWalkPathLocked(ctx context.Context, parent *dentry, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + // Note that pit is a copy of the iterator that does not affect rp. + pit := rp.Pit() + first := pit.String() + if len(first) > maxFilenameLen { + return nil, linuxerr.ENAMETOOLONG + } + if child, ok := parent.children[first]; ok || parent.isSynthetic() { + if child == nil { + return nil, linuxerr.ENOENT + } + return child, nil + } + + // Walk as much of the path as possible in 1 RPC. + names := []string{first} + for pit = pit.Next(); pit.Ok(); pit = pit.Next() { + name := pit.String() + if name == "." { + continue + } + if name == ".." { + break + } + names = append(names, name) + } + status, inodes, err := parent.controlFDLisa.WalkMultiple(ctx, names) + if err != nil { + return nil, err + } + if len(inodes) == 0 { + parent.cacheNegativeLookupLocked(first) + return nil, linuxerr.ENOENT + } + + // Add the walked inodes into the dentry tree. + curParent := parent + curParentDirMuLock := func() { + if curParent != parent { + curParent.dirMu.Lock() + } + } + curParentDirMuUnlock := func() { + if curParent != parent { + curParent.dirMu.Unlock() // +checklocksforce: locked via curParentDirMuLock(). 
+ } + } + var ret *dentry + var dentryCreationErr error + for i := range inodes { + if dentryCreationErr != nil { + fs.clientLisa.CloseFDBatched(ctx, inodes[i].ControlFD) + continue + } + + child, err := fs.newDentryLisa(ctx, &inodes[i]) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, inodes[i].ControlFD) + dentryCreationErr = err + continue + } + curParentDirMuLock() + curParent.cacheNewChildLocked(child, names[i]) + curParentDirMuUnlock() + // For now, child has 0 references, so our caller should call + // child.checkCachingLocked(). curParent gained a ref so we should also + // call curParent.checkCachingLocked() so it can be removed from the cache + // if needed. We only do that for the first iteration because all + // subsequent parents would have already been added to ds. + if i == 0 { + *ds = appendDentry(*ds, curParent) + } + *ds = appendDentry(*ds, child) + curParent = child + if i == 0 { + ret = child + } + } + + if status == lisafs.WalkComponentDoesNotExist && curParent.isDir() { + curParentDirMuLock() + curParent.cacheNegativeLookupLocked(names[len(inodes)]) + curParentDirMuUnlock() + } + return ret, dentryCreationErr +} + // getChildLocked returns a dentry representing the child of parent with the // given name. Returns ENOENT if the child doesn't exist. // @@ -227,7 +366,7 @@ afterSymlink: // * parent.dirMu must be locked. // * parent.isDir(). // * name is not "." or "..". -// * dentry at name has been revalidated +// * parent and the dentry at name have been revalidated. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, linuxerr.ENAMETOOLONG @@ -239,20 +378,35 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s return child, nil } - qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) - if err != nil { - if linuxerr.Equals(linuxerr.ENOENT, err) { - parent.cacheNegativeLookupLocked(name) + var child *dentry + if fs.opts.lisaEnabled { + childInode, err := parent.controlFDLisa.Walk(ctx, name) + if err != nil { + if linuxerr.Equals(linuxerr.ENOENT, err) { + parent.cacheNegativeLookupLocked(name) + } + return nil, err + } + // Create a new dentry representing the file. + child, err = fs.newDentryLisa(ctx, childInode) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, childInode.ControlFD) + return nil, err + } + } else { + qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) + if err != nil { + if linuxerr.Equals(linuxerr.ENOENT, err) { + parent.cacheNegativeLookupLocked(name) + } + return nil, err + } + // Create a new dentry representing the file. + child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) + if err != nil { + file.close(ctx) + return nil, err } - return nil, err - } - - // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) - if err != nil { - file.close(ctx) - delete(parent.children, name) - return nil, err } parent.cacheNewChildLocked(child, name) appendNewChildDentry(ds, parent, child) @@ -328,7 +482,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). 
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error), createInSyntheticDir func(parent *dentry, name string) error, updateChild func(child *dentry)) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -415,9 +569,26 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // No cached dentry exists; however, in InteropModeShared there might still be // an existing file at name. Just attempt the file creation RPC anyways. If a // file does exist, the RPC will fail with EEXIST like we would have. - if err := createInRemoteDir(parent, name, &ds); err != nil { + lisaInode, err := createInRemoteDir(parent, name, &ds) + if err != nil { return err } + // lisafs may aggresively cache newly created inodes. This has helped reduce + // Walk RPCs in practice. + if lisaInode != nil { + child, err := fs.newDentryLisa(ctx, lisaInode) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, lisaInode.ControlFD) + return err + } + parent.cacheNewChildLocked(child, name) + appendNewChildDentry(&ds, parent, child) + + // lisafs may update dentry properties upon successful creation. + if updateChild != nil { + updateChild(child) + } + } if fs.opts.interop != InteropModeShared { if child, ok := parent.children[name]; ok && child == nil { // Delete the now-stale negative dentry. @@ -565,7 +736,11 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return linuxerr.ENOENT } } else if child == nil || !child.isSynthetic() { - err = parent.file.unlinkAt(ctx, name, flags) + if fs.opts.lisaEnabled { + err = parent.controlFDLisa.UnlinkAt(ctx, name, flags) + } else { + err = parent.file.unlinkAt(ctx, name, flags) + } if err != nil { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. @@ -658,40 +833,43 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error { + err := fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, ds **[]*dentry) (*lisafs.Inode, error) { if rp.Mount() != vd.Mount() { - return linuxerr.EXDEV + return nil, linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) if d.isDir() { - return linuxerr.EPERM + return nil, linuxerr.EPERM } gid := auth.KGID(atomic.LoadUint32(&d.gid)) uid := auth.KUID(atomic.LoadUint32(&d.uid)) mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { - return err + return nil, err } if d.nlink == 0 { - return linuxerr.ENOENT + return nil, linuxerr.ENOENT } if d.nlink == math.MaxUint32 { - return linuxerr.EMLINK + return nil, linuxerr.EMLINK } - if err := parent.file.link(ctx, d.file, childName); err != nil { - return err + if fs.opts.lisaEnabled { + return parent.controlFDLisa.LinkAt(ctx, d.controlFDLisa.ID(), childName) } + return nil, parent.file.link(ctx, d.file, childName) + }, nil, nil) + if err == nil { // Success! 
- atomic.AddUint32(&d.nlink, 1) - return nil - }, nil) + vd.Dentry().Impl().(*dentry).incLinks() + } + return err } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { creds := rp.Credentials() - return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) { // If the parent is a setgid directory, use the parent's GID // rather than the caller's and enable setgid. kgid := creds.EffectiveKGID @@ -700,9 +878,18 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v kgid = auth.KGID(atomic.LoadUint32(&parent.gid)) mode |= linux.S_ISGID } - if _, err := parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)); err != nil { + var ( + childDirInode *lisafs.Inode + err error + ) + if fs.opts.lisaEnabled { + childDirInode, err = parent.controlFDLisa.MkdirAt(ctx, name, mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(kgid)) + } else { + _, err = parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) + } + if err != nil { if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { - return err + return nil, err } ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) parent.createSyntheticChildLocked(&createSyntheticOpts{ @@ -716,7 +903,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if fs.opts.interop != InteropModeShared { parent.incLinks() } - return nil + return childDirInode, nil }, func(parent *dentry, name string) error { if !opts.ForSyntheticMountpoint { // Can't create non-synthetic files in synthetic directories. @@ -730,16 +917,26 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }) parent.incLinks() return nil - }) + }, nil) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) { creds := rp.Credentials() - _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - if !linuxerr.Equals(linuxerr.EPERM, err) { - return err + var ( + childInode *lisafs.Inode + err error + ) + if fs.opts.lisaEnabled { + childInode, err = parent.controlFDLisa.MknodAt(ctx, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID), opts.DevMinor, opts.DevMajor) + } else { + _, err = parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + } + if err == nil { + return childInode, nil + } else if !linuxerr.Equals(linuxerr.EPERM, err) { + return nil, err } // EPERM means that gofer does not allow creating a socket or pipe. Fallback @@ -750,10 +947,10 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v switch { case err == nil: // Step succeeded, another file exists. 
- return linuxerr.EEXIST + return nil, linuxerr.EEXIST case !linuxerr.Equals(linuxerr.ENOENT, err): // Unexpected error. - return err + return nil, err } switch opts.Mode.FileType() { @@ -766,7 +963,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v endpoint: opts.Endpoint, }) *ds = appendDentry(*ds, parent) - return nil + return nil, nil case linux.S_IFIFO: parent.createSyntheticChildLocked(&createSyntheticOpts{ name: name, @@ -776,11 +973,11 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize), }) *ds = appendDentry(*ds, parent) - return nil + return nil, nil } // Retain error from gofer if synthetic file cannot be created internally. - return linuxerr.EPERM - }, nil) + return nil, linuxerr.EPERM + }, nil, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -986,6 +1183,23 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } + if d.fs.opts.lisaEnabled { + // Note that special value of linux.SockType = 0 is interpreted by lisafs + // as "do not care about the socket type". Analogous to p9.AnonymousSocket. + sockFD, err := d.controlFDLisa.Connect(ctx, 0 /* sockType */) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), sockFD, &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + unix.Close(sockFD) + return nil, err + } + return fd, nil + } fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) if err != nil { return nil, err @@ -998,6 +1212,7 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio fdObj.Close() return nil, err } + // Ownership has been transferred to fd. fdObj.Release() return fd, nil } @@ -1017,7 +1232,13 @@ func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs. // since closed its end. isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 retry: - h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + var h handle + var err error + if d.fs.opts.lisaEnabled { + h, err = openHandleLisa(ctx, d.controlFDLisa, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + } else { + h, err = openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + } if err != nil { if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) { // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails @@ -1061,18 +1282,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } defer mnt.EndWrite() - // 9P2000.L's lcreate takes a fid representing the parent directory, and - // converts it into an open fid representing the created file, so we need - // to duplicate the directory fid first. - _, dirfile, err := d.file.walk(ctx, nil) - if err != nil { - return nil, err - } creds := rp.Credentials() name := rp.Component() - // We only want the access mode for creating the file. - createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask - // If the parent is a setgid directory, use the parent's GID rather // than the caller's. 
kgid := creds.EffectiveKGID @@ -1080,51 +1291,87 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving kgid = auth.KGID(atomic.LoadUint32(&d.gid)) } - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) - if err != nil { - dirfile.close(ctx) - return nil, err - } - // Then we need to walk to the file we just created to get a non-open fid - // representing it, and to get its metadata. This must use d.file since, as - // explained above, dirfile was invalidated by dirfile.Create(). - _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) - if err != nil { - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() + var child *dentry + var openP9File p9file + openLisaFD := lisafs.InvalidFDID + openHostFD := int32(-1) + if d.fs.opts.lisaEnabled { + ino, openFD, hostFD, err := d.controlFDLisa.OpenCreateAt(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(kgid)) + if err != nil { + return nil, err + } + openHostFD = int32(hostFD) + openLisaFD = openFD + + child, err = d.fs.newDentryLisa(ctx, &ino) + if err != nil { + d.fs.clientLisa.CloseFDBatched(ctx, ino.ControlFD) + d.fs.clientLisa.CloseFDBatched(ctx, openFD) + if hostFD >= 0 { + unix.Close(hostFD) + } + return nil, err + } + } else { + // 9P2000.L's lcreate takes a fid representing the parent directory, and + // converts it into an open fid representing the created file, so we need + // to duplicate the directory fid first. + _, dirfile, err := d.file.walk(ctx, nil) + if err != nil { + return nil, err + } + // We only want the access mode for creating the file. + createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask + + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) + if err != nil { + dirfile.close(ctx) + return nil, err + } + // Then we need to walk to the file we just created to get a non-open fid + // representing it, and to get its metadata. This must use d.file since, as + // explained above, dirfile was invalidated by dirfile.Create(). + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + if err != nil { + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + + // Construct the new dentry. + child, err = d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) + if err != nil { + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err } - return nil, err - } - // Construct the new dentry. - child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) - if err != nil { - nonOpenFile.close(ctx) - openFile.close(ctx) if fdobj != nil { - fdobj.Close() + openHostFD = int32(fdobj.Release()) } - return nil, err + openP9File = openFile } // Incorporate the fid that was opened by lcreate. 
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { - openFD := int32(-1) - if fdobj != nil { - openFD = int32(fdobj.Release()) - } child.handleMu.Lock() if vfs.MayReadFileWithOpenFlags(opts.Flags) { - child.readFile = openFile - if fdobj != nil { - child.readFD = openFD - child.mmapFD = openFD + child.readFile = openP9File + child.readFDLisa = d.fs.clientLisa.NewFD(openLisaFD) + if openHostFD != -1 { + child.readFD = openHostFD + child.mmapFD = openHostFD } } if vfs.MayWriteFileWithOpenFlags(opts.Flags) { - child.writeFile = openFile - child.writeFD = openFD + child.writeFile = openP9File + child.writeFDLisa = d.fs.clientLisa.NewFD(openLisaFD) + child.writeFD = openHostFD } child.handleMu.Unlock() } @@ -1146,11 +1393,9 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving childVFSFD = &fd.vfsfd } else { h := handle{ - file: openFile, - fd: -1, - } - if fdobj != nil { - h.fd = int32(fdobj.Release()) + file: openP9File, + fdLisa: d.fs.clientLisa.NewFD(openLisaFD), + fd: openHostFD, } fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) if err != nil { @@ -1304,7 +1549,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Update the remote filesystem. if !renamed.isSynthetic() { - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + if fs.opts.lisaEnabled { + err = renamed.controlFDLisa.RenameTo(ctx, newParent.controlFDLisa.ID(), newName) + } else { + err = renamed.file.rename(ctx, newParent.file, newName) + } + if err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } @@ -1315,7 +1565,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced.isDir() { flags = linux.AT_REMOVEDIR } - if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { + if fs.opts.lisaEnabled { + err = newParent.controlFDLisa.UnlinkAt(ctx, newName, flags) + } else { + err = newParent.file.unlinkAt(ctx, newName, flags) + } + if err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } @@ -1431,6 +1686,28 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu for d.isSynthetic() { d = d.parent } + if fs.opts.lisaEnabled { + var statFS lisafs.StatFS + if err := d.controlFDLisa.StatFSTo(ctx, &statFS); err != nil { + return linux.Statfs{}, err + } + if statFS.NameLength > maxFilenameLen { + statFS.NameLength = maxFilenameLen + } + return linux.Statfs{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + BlockSize: statFS.BlockSize, + Blocks: statFS.Blocks, + BlocksFree: statFS.BlocksFree, + BlocksAvailable: statFS.BlocksAvailable, + Files: statFS.Files, + FilesFree: statFS.FilesFree, + NameLength: statFS.NameLength, + }, nil + } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err @@ -1456,11 +1733,21 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) { creds := rp.Credentials() + if fs.opts.lisaEnabled { + return parent.controlFDLisa.SymlinkAt(ctx, name, target, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) + } _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err - }, nil) + return nil, err + }, nil, func(child *dentry) { + if fs.opts.interop != InteropModeShared { + // lisafs caches the symlink target on creation. In practice, this + // helps avoid a lot of ReadLink RPCs. + child.haveTarget = true + child.target = target + } + }) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. @@ -1505,7 +1792,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si if err != nil { return nil, err } - return d.listXattr(ctx, rp.Credentials(), size) + return d.listXattr(ctx, size) } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 13971d086..7bef8242f 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -48,6 +48,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" @@ -119,6 +120,10 @@ type filesystem struct { // client is the client used by this filesystem. client is immutable. client *p9.Client `state:"nosave"` + // clientLisa is the client used for communicating with the server when + // lisafs is enabled. clientLisa is immutable. + clientLisa *lisafs.Client `state:"nosave"` + // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock @@ -162,6 +167,12 @@ type filesystem struct { inoMu sync.Mutex `state:"nosave"` inoByQIDPath map[uint64]uint64 `state:"nosave"` + // inoByKey is the same as inoByQIDPath but only used by lisafs. It helps + // identify inodes based on the device ID and host inode number provided + // by the gofer process. It is not preserved across checkpoint/restore for + // the same reason as above. inoByKey is protected by inoMu. + inoByKey map[inoKey]uint64 `state:"nosave"` + // lastIno is the last inode number assigned to a file. lastIno is accessed // using atomic memory operations.
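An aside on the inoByKey table described a few lines up, before the struct continues with lastIno below: when lisafs is enabled, inode identity comes from the (device major, device minor, host inode) triple reported by the gofer, and each distinct triple is mapped to a stable sentry-local inode number on first sight. A self-contained sketch of that allocation pattern (names are illustrative, not the gofer's actual helpers):

package main

import (
	"fmt"
	"sync"
)

type inoKey struct {
	ino                uint64
	devMinor, devMajor uint32
}

type inoTable struct {
	mu      sync.Mutex
	byKey   map[inoKey]uint64
	lastIno uint64
}

// get returns the inode number previously assigned to key, allocating the
// next one on first use.
func (t *inoTable) get(key inoKey) uint64 {
	t.mu.Lock()
	defer t.mu.Unlock()
	if ino, ok := t.byKey[key]; ok {
		return ino
	}
	t.lastIno++
	t.byKey[key] = t.lastIno
	return t.lastIno
}

func main() {
	t := &inoTable{byKey: make(map[inoKey]uint64)}
	a := inoKey{ino: 42, devMajor: 8, devMinor: 1}
	fmt.Println(t.get(a), t.get(a))                               // Same key, same local number.
	fmt.Println(t.get(inoKey{ino: 43, devMajor: 8, devMinor: 1})) // New key, next number.
}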
lastIno uint64 @@ -471,44 +482,83 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), inoByQIDPath: make(map[uint64]uint64), + inoByKey: make(map[inoKey]uint64), } fs.vfsfs.Init(vfsObj, &fstype, fs) + if err := fs.initClientAndRoot(ctx); err != nil { + fs.vfsfs.DecRef(ctx) + return nil, nil, err + } + + return &fs.vfsfs, &fs.root.vfsd, nil +} + +func (fs *filesystem) initClientAndRoot(ctx context.Context) error { + var err error + if fs.opts.lisaEnabled { + var rootInode *lisafs.Inode + rootInode, err = fs.initClientLisa(ctx) + if err != nil { + return err + } + fs.root, err = fs.newDentryLisa(ctx, rootInode) + if err != nil { + fs.clientLisa.CloseFDBatched(ctx, rootInode.ControlFD) + } + } else { + fs.root, err = fs.initClient(ctx) + } + + // Set the root's reference count to 2. One reference is returned to the + // caller, and the other is held by fs to prevent the root from being "cached" + // and subsequently evicted. + if err == nil { + fs.root.refs = 2 + } + return err +} + +func (fs *filesystem) initClientLisa(ctx context.Context) (*lisafs.Inode, error) { + sock, err := unet.NewSocket(fs.opts.fd) + if err != nil { + return nil, err + } + + var rootInode *lisafs.Inode + ctx.UninterruptibleSleepStart(false) + fs.clientLisa, rootInode, err = lisafs.NewClient(sock, fs.opts.aname) + ctx.UninterruptibleSleepFinish(false) + return rootInode, err +} + +func (fs *filesystem) initClient(ctx context.Context) (*dentry, error) { // Connect to the server. if err := fs.dial(ctx); err != nil { - return nil, nil, err + return nil, err } // Perform attach to obtain the filesystem root. ctx.UninterruptibleSleepStart(false) - attached, err := fs.client.Attach(fsopts.aname) + attached, err := fs.client.Attach(fs.opts.aname) ctx.UninterruptibleSleepFinish(false) if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err + return nil, err } attachFile := p9file{attached} qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef(ctx) - return nil, nil, err + return nil, err } // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef(ctx) - return nil, nil, err + return nil, err } - // Set the root's reference count to 2. One reference is returned to the - // caller, and the other is held by fs to prevent the root from being "cached" - // and subsequently evicted. - root.refs = 2 - fs.root = root - - return &fs.vfsfs, &root.vfsd, nil + return root, nil } func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { @@ -626,7 +676,11 @@ func (fs *filesystem) Release(ctx context.Context) { if !fs.iopts.LeakConnection { // Close the connection to the server. This implicitly clunks all fids. - fs.client.Close() + if fs.opts.lisaEnabled { + fs.clientLisa.Close() + } else { + fs.client.Close() + } } fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -657,6 +711,23 @@ func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { } } +// inoKey is the key used to identify the inode backed by this dentry. 
+// +// +stateify savable +type inoKey struct { + ino uint64 + devMinor uint32 + devMajor uint32 +} + +func inoKeyFromStat(stat *linux.Statx) inoKey { + return inoKey{ + ino: stat.Ino, + devMinor: stat.DevMinor, + devMajor: stat.DevMajor, + } +} + // dentry implements vfs.DentryImpl. // // +stateify savable @@ -687,6 +758,9 @@ type dentry struct { // qidPath is the p9.QID.Path for this file. qidPath is immutable. qidPath uint64 + // inoKey is used to identify this dentry's inode. + inoKey inoKey + // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file @@ -694,6 +768,14 @@ type dentry struct { // only files that can be synthetic are sockets, pipes, and directories. file p9file `state:"nosave"` + // controlFDLisa is used by lisafs to perform path based operations on this + // dentry. + // + // if !controlFDLisa.Ok(), this dentry represents a synthetic file, i.e. a + // file that does not exist on the remote filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. + controlFDLisa lisafs.ClientFD `state:"nosave"` + // If deleted is non-zero, the file represented by this dentry has been // deleted. deleted is accessed using atomic memory operations. deleted uint32 @@ -804,12 +886,14 @@ type dentry struct { // always either -1 or equal to readFD; if !writeFile.isNil() (the file has // been opened for writing), it is additionally either -1 or equal to // writeFD. - handleMu sync.RWMutex `state:"nosave"` - readFile p9file `state:"nosave"` - writeFile p9file `state:"nosave"` - readFD int32 `state:"nosave"` - writeFD int32 `state:"nosave"` - mmapFD int32 `state:"nosave"` + handleMu sync.RWMutex `state:"nosave"` + readFile p9file `state:"nosave"` + writeFile p9file `state:"nosave"` + readFDLisa lisafs.ClientFD `state:"nosave"` + writeFDLisa lisafs.ClientFD `state:"nosave"` + readFD int32 `state:"nosave"` + writeFD int32 `state:"nosave"` + mmapFD int32 `state:"nosave"` dataMu sync.RWMutex `state:"nosave"` @@ -933,6 +1017,79 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma return d, nil } +func (fs *filesystem) newDentryLisa(ctx context.Context, ino *lisafs.Inode) (*dentry, error) { + if ino.Stat.Mask&linux.STATX_TYPE == 0 { + ctx.Warningf("can't create gofer.dentry without file type") + return nil, linuxerr.EIO + } + if ino.Stat.Mode&linux.FileTypeMask == linux.ModeRegular && ino.Stat.Mask&linux.STATX_SIZE == 0 { + ctx.Warningf("can't create regular file gofer.dentry without file size") + return nil, linuxerr.EIO + } + + inoKey := inoKeyFromStat(&ino.Stat) + d := &dentry{ + fs: fs, + inoKey: inoKey, + ino: fs.inoFromKey(inoKey), + mode: uint32(ino.Stat.Mode), + uid: uint32(fs.opts.dfltuid), + gid: uint32(fs.opts.dfltgid), + blockSize: hostarch.PageSize, + readFD: -1, + writeFD: -1, + mmapFD: -1, + controlFDLisa: fs.clientLisa.NewFD(ino.ControlFD), + } + + d.pf.dentry = d + if ino.Stat.Mask&linux.STATX_UID != 0 { + d.uid = dentryUIDFromLisaUID(lisafs.UID(ino.Stat.UID)) + } + if ino.Stat.Mask&linux.STATX_GID != 0 { + d.gid = dentryGIDFromLisaGID(lisafs.GID(ino.Stat.GID)) + } + if ino.Stat.Mask&linux.STATX_SIZE != 0 { + d.size = ino.Stat.Size + } + if ino.Stat.Blksize != 0 { + d.blockSize = ino.Stat.Blksize + } + if ino.Stat.Mask&linux.STATX_ATIME != 0 { + d.atime = dentryTimestampFromLisa(ino.Stat.Atime) + } + if ino.Stat.Mask&linux.STATX_MTIME != 0 { + d.mtime = dentryTimestampFromLisa(ino.Stat.Mtime) + } + if 
ino.Stat.Mask&linux.STATX_CTIME != 0 { + d.ctime = dentryTimestampFromLisa(ino.Stat.Ctime) + } + if ino.Stat.Mask&linux.STATX_BTIME != 0 { + d.btime = dentryTimestampFromLisa(ino.Stat.Btime) + } + if ino.Stat.Mask&linux.STATX_NLINK != 0 { + d.nlink = ino.Stat.Nlink + } + d.vfsd.Init(d) + refsvfs2.Register(d) + fs.syncMu.Lock() + fs.syncableDentries[d] = struct{}{} + fs.syncMu.Unlock() + return d, nil +} + +func (fs *filesystem) inoFromKey(key inoKey) uint64 { + fs.inoMu.Lock() + defer fs.inoMu.Unlock() + + if ino, ok := fs.inoByKey[key]; ok { + return ino + } + ino := fs.nextIno() + fs.inoByKey[key] = ino + return ino +} + func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 { fs.inoMu.Lock() defer fs.inoMu.Unlock() @@ -949,7 +1106,7 @@ func (fs *filesystem) nextIno() uint64 { } func (d *dentry) isSynthetic() bool { - return d.file.isNil() + return !d.isControlFileOk() } func (d *dentry) cachedMetadataAuthoritative() bool { @@ -999,6 +1156,50 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { } } +// updateFromLisaStatLocked is called to update d's metadata after an update +// from the remote filesystem. +// Precondition: d.metadataMu must be locked. +// +checklocks:d.metadataMu +func (d *dentry) updateFromLisaStatLocked(stat *linux.Statx) { + if stat.Mask&linux.STATX_TYPE != 0 { + if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { + panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) + } + } + if stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, uint32(stat.Mode)) + } + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, dentryUIDFromLisaUID(lisafs.UID(stat.UID))) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, dentryGIDFromLisaGID(lisafs.GID(stat.GID))) + } + if stat.Blksize != 0 { + atomic.StoreUint32(&d.blockSize, stat.Blksize) + } + // Don't override newer client-defined timestamps with old server-defined + // ones. + if stat.Mask&linux.STATX_ATIME != 0 && atomic.LoadUint32(&d.atimeDirty) == 0 { + atomic.StoreInt64(&d.atime, dentryTimestampFromLisa(stat.Atime)) + } + if stat.Mask&linux.STATX_MTIME != 0 && atomic.LoadUint32(&d.mtimeDirty) == 0 { + atomic.StoreInt64(&d.mtime, dentryTimestampFromLisa(stat.Mtime)) + } + if stat.Mask&linux.STATX_CTIME != 0 { + atomic.StoreInt64(&d.ctime, dentryTimestampFromLisa(stat.Ctime)) + } + if stat.Mask&linux.STATX_BTIME != 0 { + atomic.StoreInt64(&d.btime, dentryTimestampFromLisa(stat.Btime)) + } + if stat.Mask&linux.STATX_NLINK != 0 { + atomic.StoreUint32(&d.nlink, stat.Nlink) + } + if stat.Mask&linux.STATX_SIZE != 0 { + d.updateSizeLocked(stat.Size) + } +} + // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. // +checklocks:d.metadataMu @@ -1008,6 +1209,9 @@ func (d *dentry) refreshSizeLocked(ctx context.Context) error { if d.writeFD < 0 { d.handleMu.RUnlock() // Ask the gofer if we don't have a host FD. + if d.fs.opts.lisaEnabled { + return d.updateFromStatLisaLocked(ctx, nil) + } return d.updateFromGetattrLocked(ctx, p9file{}) } @@ -1027,6 +1231,9 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // updating stale attributes in d.updateFromP9AttrsLocked(). d.metadataMu.Lock() defer d.metadataMu.Unlock() + if d.fs.opts.lisaEnabled { + return d.updateFromStatLisaLocked(ctx, nil) + } return d.updateFromGetattrLocked(ctx, p9file{}) } @@ -1034,6 +1241,45 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // * !d.isSynthetic().
// * d.metadataMu is locked. // +checklocks:d.metadataMu +func (d *dentry) updateFromStatLisaLocked(ctx context.Context, fdLisa *lisafs.ClientFD) error { + handleMuRLocked := false + if fdLisa == nil { + // Use open FDs in preference to the control FD. This may be significantly + // more efficient in some implementations. Prefer a writable FD over a + // readable one since some filesystem implementations may update a writable + // FD's metadata after writes, without making metadata updates immediately + // visible to read-only FDs representing the same file. + d.handleMu.RLock() + switch { + case d.writeFDLisa.Ok(): + fdLisa = &d.writeFDLisa + handleMuRLocked = true + case d.readFDLisa.Ok(): + fdLisa = &d.readFDLisa + handleMuRLocked = true + default: + fdLisa = &d.controlFDLisa + d.handleMu.RUnlock() + } + } + + var stat linux.Statx + err := fdLisa.StatTo(ctx, &stat) + if handleMuRLocked { + // handleMu must be released before updateFromLisaStatLocked(). + d.handleMu.RUnlock() // +checklocksforce: complex case. + } + if err != nil { + return err + } + d.updateFromLisaStatLocked(&stat) + return nil +} + +// Preconditions: +// * !d.isSynthetic(). +// * d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromGetattrLocked(ctx context.Context, file p9file) error { handleMuRLocked := false if file.isNil() { @@ -1173,6 +1419,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } } + // failureMask indicates which attributes could not be set on the remote + // filesystem. p9 returns an error if any of the attributes could not be set + // but that leads to inconsistency as the server could have set a few + // attributes successfully but a later failure will cause the successful ones + // to not be updated in the dentry cache. + var failureMask uint32 + var failureErr error if !d.isSynthetic() { if stat.Mask != 0 { if stat.Mask&linux.STATX_SIZE != 0 { @@ -1182,35 +1435,50 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // the remote file has been truncated).
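The failureMask/failureErr pair introduced above changes the error model for SetStat: lisafs reports which attributes failed rather than failing the whole call, so the client updates its cached metadata only for attributes whose bits are clear in the mask and surfaces the first failure at the end (the size case is handled under dataMu just below). A minimal sketch of that pattern with stand-in state; the mask values match Linux's STATX_* bits:

package main

import (
	"errors"
	"fmt"
)

const (
	statxMode uint32 = 0x2
	statxUID  uint32 = 0x8
	statxSize uint32 = 0x200
)

type cachedAttrs struct {
	mode, uid uint32
	size      uint64
}

// apply copies only the attributes that were requested and not reported as
// failed, then returns the first failure (if any) to the caller.
func (c *cachedAttrs) apply(reqMask, mode, uid uint32, size uint64, failureMask uint32, failureErr error) error {
	if reqMask&statxMode != 0 && failureMask&statxMode == 0 {
		c.mode = mode
	}
	if reqMask&statxUID != 0 && failureMask&statxUID == 0 {
		c.uid = uid
	}
	if reqMask&statxSize != 0 && failureMask&statxSize == 0 {
		c.size = size
	}
	if failureMask != 0 {
		return failureErr
	}
	return nil
}

func main() {
	c := &cachedAttrs{}
	err := c.apply(statxMode|statxSize, 0o644, 0, 4096, statxSize, errors.New("truncate rejected"))
	fmt.Printf("%+v err=%v\n", c, err) // mode cached, size left untouched.
}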
d.dataMu.Lock() } - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - if stat.Mask&linux.STATX_SIZE != 0 { - d.dataMu.Unlock() // +checklocksforce: locked conditionally above + if d.fs.opts.lisaEnabled { + var err error + failureMask, failureErr, err = d.controlFDLisa.SetStat(ctx, stat) + if err != nil { + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } + return err + } + } else { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } + return err } - return err } if stat.Mask&linux.STATX_SIZE != 0 { - // d.size should be kept up to date, and privatized - // copy-on-write mappings of truncated pages need to be - // invalidated, even if InteropModeShared is in effect. - d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above + if failureMask&linux.STATX_SIZE == 0 { + // d.size should be kept up to date, and privatized + // copy-on-write mappings of truncated pages need to be + // invalidated, even if InteropModeShared is in effect. 
+ d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above + } else { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } } } if d.fs.opts.interop == InteropModeShared { @@ -1221,13 +1489,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } } - if stat.Mask&linux.STATX_MODE != 0 { + if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } - if stat.Mask&linux.STATX_UID != 0 { + if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { atomic.StoreUint32(&d.uid, stat.UID) } - if stat.Mask&linux.STATX_GID != 0 { + if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { atomic.StoreUint32(&d.gid, stat.GID) } // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because @@ -1235,15 +1503,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // stat.Mtime to client-local timestamps above, and if // !d.cachedMetadataAuthoritative() then we returned after calling // d.file.setAttr(). For the same reason, now must have been initialized. - if stat.Mask&linux.STATX_ATIME != 0 { + if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { atomic.StoreInt64(&d.atime, stat.Atime.ToNsec()) atomic.StoreUint32(&d.atimeDirty, 0) } - if stat.Mask&linux.STATX_MTIME != 0 { + if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec()) atomic.StoreUint32(&d.mtimeDirty, 0) } atomic.StoreInt64(&d.ctime, now) + if failureMask != 0 { + // Setting some attribute failed on the remote filesystem. + return failureErr + } return nil } @@ -1359,6 +1631,20 @@ func dentryGIDFromP9GID(gid p9.GID) uint32 { return uint32(gid) } +func dentryUIDFromLisaUID(uid lisafs.UID) uint32 { + if !uid.Ok() { + return uint32(auth.OverflowUID) + } + return uint32(uid) +} + +func dentryGIDFromLisaGID(gid lisafs.GID) uint32 { + if !gid.Ok() { + return uint32(auth.OverflowGID) + } + return uint32(gid) +} + // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against @@ -1667,15 +1953,24 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.dirty.RemoveAll() } d.dataMu.Unlock() - // Clunk open fids and close open host FDs. - if !d.readFile.isNil() { - _ = d.readFile.close(ctx) - } - if !d.writeFile.isNil() && d.readFile != d.writeFile { - _ = d.writeFile.close(ctx) + if d.fs.opts.lisaEnabled { + if d.readFDLisa.Ok() && d.readFDLisa.ID() != d.writeFDLisa.ID() { + d.readFDLisa.CloseBatched(ctx) + } + if d.writeFDLisa.Ok() { + d.writeFDLisa.CloseBatched(ctx) + } + } else { + // Clunk open fids and close open host FDs. + if !d.readFile.isNil() { + _ = d.readFile.close(ctx) + } + if !d.writeFile.isNil() && d.readFile != d.writeFile { + _ = d.writeFile.close(ctx) + } + d.readFile = p9file{} + d.writeFile = p9file{} } - d.readFile = p9file{} - d.writeFile = p9file{} if d.readFD >= 0 { _ = unix.Close(int(d.readFD)) } @@ -1687,7 +1982,7 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.mmapFD = -1 d.handleMu.Unlock() - if !d.file.isNil() { + if d.isControlFileOk() { // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, // i.e. client and server timestamps may differ (because e.g. 
a client // write was serviced by the page cache, and only written back to the @@ -1696,10 +1991,16 @@ func (d *dentry) destroyLocked(ctx context.Context) { // instantiated for the same file would remain coherent. Unfortunately, // this turns out to be too expensive in many cases, so for now we // don't do this. - if err := d.file.close(ctx); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) + + // Close the control FD. + if d.fs.opts.lisaEnabled { + d.controlFDLisa.CloseBatched(ctx) + } else { + if err := d.file.close(ctx); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) + } + d.file = p9file{} } - d.file = p9file{} // Remove d from the set of syncable dentries. d.fs.syncMu.Lock() @@ -1725,10 +2026,38 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { - if d.file.isNil() { +func (d *dentry) isControlFileOk() bool { + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.Ok() + } + return !d.file.isNil() +} + +func (d *dentry) isReadFileOk() bool { + if d.fs.opts.lisaEnabled { + return d.readFDLisa.Ok() + } + return !d.readFile.isNil() +} + +func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { + if !d.isControlFileOk() { return nil, nil } + + if d.fs.opts.lisaEnabled { + xattrs, err := d.controlFDLisa.ListXattr(ctx, size) + if err != nil { + return nil, err + } + + res := make([]string, 0, len(xattrs)) + for _, xattr := range xattrs { + res = append(res, xattr) + } + return res, nil + } + xattrMap, err := d.file.listXattr(ctx, size) if err != nil { return nil, err @@ -1741,32 +2070,41 @@ func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size ui } func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { - if d.file.isNil() { + if !d.isControlFileOk() { return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.GetXattr(ctx, opts.Name, opts.Size) + } return d.file.getXattr(ctx, opts.Name, opts.Size) } func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { - if d.file.isNil() { + if !d.isControlFileOk() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.SetXattr(ctx, opts.Name, opts.Value, opts.Flags) + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { - if d.file.isNil() { + if !d.isControlFileOk() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } + if d.fs.opts.lisaEnabled { + return d.controlFDLisa.RemoveXattr(ctx, name) + } return d.file.removeXattr(ctx, name) } @@ -1778,19 +2116,30 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // O_TRUNC). 
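The reuse check that follows (canReuseCurHandle in the lisafs branch, the readFile/writeFile nil checks in the p9 branch) reduces to a small piece of boolean logic: existing handles are enough only if every requested capability is already open and no truncation was requested. A tiny restatement of that condition as standalone Go:

package main

import "fmt"

// canReuse reports whether already-open read/write handles satisfy a new
// request; a truncating open always forces fresh handles.
func canReuse(haveRead, haveWrite, wantRead, wantWrite, trunc bool) bool {
	if trunc {
		return false
	}
	return (!wantRead || haveRead) && (!wantWrite || haveWrite)
}

func main() {
	fmt.Println(canReuse(true, false, true, false, false)) // true: read-only reuse.
	fmt.Println(canReuse(true, false, true, true, false))  // false: a write handle is missing.
	fmt.Println(canReuse(true, true, true, true, true))    // false: O_TRUNC requested.
}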
if !trunc { d.handleMu.RLock() - if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) { + var canReuseCurHandle bool + if d.fs.opts.lisaEnabled { + canReuseCurHandle = (!read || d.readFDLisa.Ok()) && (!write || d.writeFDLisa.Ok()) + } else { + canReuseCurHandle = (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) + } + d.handleMu.RUnlock() + if canReuseCurHandle { // Current handles are sufficient. - d.handleMu.RUnlock() return nil } - d.handleMu.RUnlock() } var fdsToCloseArr [2]int32 fdsToClose := fdsToCloseArr[:0] invalidateTranslations := false d.handleMu.Lock() - if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc { + var needNewHandle bool + if d.fs.opts.lisaEnabled { + needNewHandle = (read && !d.readFDLisa.Ok()) || (write && !d.writeFDLisa.Ok()) || trunc + } else { + needNewHandle = (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc + } + if needNewHandle { // Get a new handle. If this file has been opened for both reading and // writing, try to get a single handle that is usable for both: // @@ -1799,9 +2148,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // // - NOTE(b/141991141): Some filesystems may not ensure coherence // between multiple handles for the same file. - openReadable := !d.readFile.isNil() || read - openWritable := !d.writeFile.isNil() || write - h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) + var ( + openReadable bool + openWritable bool + h handle + err error + ) + if d.fs.opts.lisaEnabled { + openReadable = d.readFDLisa.Ok() || read + openWritable = d.writeFDLisa.Ok() || write + h, err = openHandleLisa(ctx, d.controlFDLisa, openReadable, openWritable, trunc) + } else { + openReadable = !d.readFile.isNil() || read + openWritable = !d.writeFile.isNil() || write + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { // It may not be possible to use a single handle for both // reading and writing, since permissions on the file may have @@ -1811,7 +2172,11 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) openReadable = read openWritable = write - h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + if d.fs.opts.lisaEnabled { + h, err = openHandleLisa(ctx, d.controlFDLisa, openReadable, openWritable, trunc) + } else { + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } } if err != nil { d.handleMu.Unlock() @@ -1873,9 +2238,16 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // previously opened for reading (without an FD), then existing // translations of the file may use the internal page cache; // invalidate those mappings. 
- if d.writeFile.isNil() { - invalidateTranslations = !d.readFile.isNil() - atomic.StoreInt32(&d.mmapFD, h.fd) + if d.fs.opts.lisaEnabled { + if !d.writeFDLisa.Ok() { + invalidateTranslations = d.readFDLisa.Ok() + atomic.StoreInt32(&d.mmapFD, h.fd) + } + } else { + if d.writeFile.isNil() { + invalidateTranslations = !d.readFile.isNil() + atomic.StoreInt32(&d.mmapFD, h.fd) + } } } else if openWritable && d.writeFD < 0 { atomic.StoreInt32(&d.writeFD, h.fd) @@ -1902,24 +2274,45 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool atomic.StoreInt32(&d.mmapFD, -1) } - // Switch to new fids. - var oldReadFile p9file - if openReadable { - oldReadFile = d.readFile - d.readFile = h.file - } - var oldWriteFile p9file - if openWritable { - oldWriteFile = d.writeFile - d.writeFile = h.file - } - // NOTE(b/141991141): Clunk old fids before making new fids visible (by - // unlocking d.handleMu). - if !oldReadFile.isNil() { - oldReadFile.close(ctx) - } - if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { - oldWriteFile.close(ctx) + // Switch to new fids/FDs. + if d.fs.opts.lisaEnabled { + oldReadFD := lisafs.InvalidFDID + if openReadable { + oldReadFD = d.readFDLisa.ID() + d.readFDLisa = h.fdLisa + } + oldWriteFD := lisafs.InvalidFDID + if openWritable { + oldWriteFD = d.writeFDLisa.ID() + d.writeFDLisa = h.fdLisa + } + // NOTE(b/141991141): Close old FDs before making new fids visible (by + // unlocking d.handleMu). + if oldReadFD.Ok() { + d.fs.clientLisa.CloseFDBatched(ctx, oldReadFD) + } + if oldWriteFD.Ok() && oldReadFD != oldWriteFD { + d.fs.clientLisa.CloseFDBatched(ctx, oldWriteFD) + } + } else { + var oldReadFile p9file + if openReadable { + oldReadFile = d.readFile + d.readFile = h.file + } + var oldWriteFile p9file + if openWritable { + oldWriteFile = d.writeFile + d.writeFile = h.file + } + // NOTE(b/141991141): Clunk old fids before making new fids visible (by + // unlocking d.handleMu). + if !oldReadFile.isNil() { + oldReadFile.close(ctx) + } + if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { + oldWriteFile.close(ctx) + } } } d.handleMu.Unlock() @@ -1943,27 +2336,29 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // Preconditions: d.handleMu must be locked. func (d *dentry) readHandleLocked() handle { return handle{ - file: d.readFile, - fd: d.readFD, + fdLisa: d.readFDLisa, + file: d.readFile, + fd: d.readFD, } } // Preconditions: d.handleMu must be locked. func (d *dentry) writeHandleLocked() handle { return handle{ - file: d.writeFile, - fd: d.writeFD, + fdLisa: d.writeFDLisa, + file: d.writeFile, + fd: d.writeFD, } } func (d *dentry) syncRemoteFile(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() - return d.syncRemoteFileLocked(ctx) + return d.syncRemoteFileLocked(ctx, nil /* accFsyncFDIDsLisa */) } // Preconditions: d.handleMu must be locked. -func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { +func (d *dentry) syncRemoteFileLocked(ctx context.Context, accFsyncFDIDsLisa *[]lisafs.FDID) error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. 
Prefer syncing write handles over read handles, since some remote // filesystem implementations may not sync changes made through write @@ -1974,7 +2369,13 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { ctx.UninterruptibleSleepFinish(false) return err } - if !d.writeFile.isNil() { + if d.fs.opts.lisaEnabled && d.writeFDLisa.Ok() { + if accFsyncFDIDsLisa != nil { + *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, d.writeFDLisa.ID()) + return nil + } + return d.writeFDLisa.Sync(ctx) + } else if !d.fs.opts.lisaEnabled && !d.writeFile.isNil() { return d.writeFile.fsync(ctx) } if d.readFD >= 0 { @@ -1983,13 +2384,19 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { ctx.UninterruptibleSleepFinish(false) return err } - if !d.readFile.isNil() { + if d.fs.opts.lisaEnabled && d.readFDLisa.Ok() { + if accFsyncFDIDsLisa != nil { + *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, d.readFDLisa.ID()) + return nil + } + return d.readFDLisa.Sync(ctx) + } else if !d.fs.opts.lisaEnabled && !d.readFile.isNil() { return d.readFile.fsync(ctx) } return nil } -func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { +func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error { d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() @@ -2002,7 +2409,7 @@ func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) err return err } } - if err := d.syncRemoteFileLocked(ctx); err != nil { + if err := d.syncRemoteFileLocked(ctx, accFsyncFDIDsLisa); err != nil { if !forFilesystemSync { return err } @@ -2059,18 +2466,33 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { - // Use specialFileFD.handle.file for the getattr if available, for the - // same reason that we try to use open file handles in - // dentry.updateFromGetattrLocked(). - var file p9file - if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { - file = sffd.handle.file - } - d.metadataMu.Lock() - err := d.updateFromGetattrLocked(ctx, file) - d.metadataMu.Unlock() - if err != nil { - return linux.Statx{}, err + if d.fs.opts.lisaEnabled { + // Use specialFileFD.handle.fileLisa for the Stat if available, for the + // same reason that we try to use open FD in updateFromStatLisaLocked(). + var fdLisa *lisafs.ClientFD + if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { + fdLisa = &sffd.handle.fdLisa + } + d.metadataMu.Lock() + err := d.updateFromStatLisaLocked(ctx, fdLisa) + d.metadataMu.Unlock() + if err != nil { + return linux.Statx{}, err + } + } else { + // Use specialFileFD.handle.file for the getattr if available, for the + // same reason that we try to use open file handles in + // dentry.updateFromGetattrLocked(). + var file p9file + if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { + file = sffd.handle.file + } + d.metadataMu.Lock() + err := d.updateFromGetattrLocked(ctx, file) + d.metadataMu.Unlock() + if err != nil { + return linux.Statx{}, err + } } } var stat linux.Statx @@ -2091,7 +2513,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 
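Stepping back to syncRemoteFileLocked and syncCachedFile above, before the ListXattr wrapper below: the new accFsyncFDIDsLisa parameter lets a filesystem-wide sync collect FD IDs into one slice so the gofer can be asked to fsync them in a single RPC, while a per-file sync still issues the RPC immediately. A sketch of that accumulate-or-act pattern (the fdID type and the bulk call are placeholders, not the lisafs wire API):

package main

import "fmt"

type fdID uint64

// syncOne either records fd for a later bulk fsync (acc != nil) or syncs it
// right away.
func syncOne(fd fdID, acc *[]fdID, syncNow func(fdID) error) error {
	if acc != nil {
		*acc = append(*acc, fd)
		return nil
	}
	return syncNow(fd)
}

func main() {
	syncNow := func(fd fdID) error { fmt.Println("fsync RPC for", fd); return nil }

	// Per-file path: sync immediately.
	_ = syncOne(3, nil, syncNow)

	// Filesystem-wide path: accumulate, then issue one bulk fsync RPC.
	var batch []fdID
	for _, fd := range []fdID{4, 5, 6} {
		_ = syncOne(fd, &batch, syncNow)
	}
	fmt.Println("one bulk fsync RPC covering", batch)
}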
func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { - return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) + return fd.dentry().listXattr(ctx, size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 806392d50..d5cc73f33 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -33,6 +33,7 @@ func TestDestroyIdempotent(t *testing.T) { }, syncableDentries: make(map[*dentry]struct{}), inoByQIDPath: make(map[uint64]uint64), + inoByKey: make(map[inoKey]uint64), } attr := &p9.Attr{ diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 02540a754..394aecd62 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -17,6 +17,7 @@ package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" @@ -26,10 +27,13 @@ import ( // handle represents a remote "open file descriptor", consisting of an opened // fid (p9.File) and optionally a host file descriptor. // +// If lisafs is being used, fdLisa points to an open file on the server. +// // These are explicitly not savable. type handle struct { - file p9file - fd int32 // -1 if unavailable + fdLisa lisafs.ClientFD + file p9file + fd int32 // -1 if unavailable } // Preconditions: read || write. @@ -65,13 +69,47 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand }, nil } +// Preconditions: read || write. +func openHandleLisa(ctx context.Context, fdLisa lisafs.ClientFD, read, write, trunc bool) (handle, error) { + var flags uint32 + switch { + case read && write: + flags = unix.O_RDWR + case read: + flags = unix.O_RDONLY + case write: + flags = unix.O_WRONLY + default: + panic("tried to open unreadable and unwritable handle") + } + if trunc { + flags |= unix.O_TRUNC + } + openFD, hostFD, err := fdLisa.OpenAt(ctx, flags) + if err != nil { + return handle{fd: -1}, err + } + h := handle{ + fdLisa: fdLisa.Client().NewFD(openFD), + fd: int32(hostFD), + } + return h, nil +} + func (h *handle) isOpen() bool { + if h.fdLisa.Client() != nil { + return h.fdLisa.Ok() + } return !h.file.isNil() } func (h *handle) close(ctx context.Context) { - h.file.close(ctx) - h.file = p9file{} + if h.fdLisa.Client() != nil { + h.fdLisa.CloseBatched(ctx) + } else { + h.file.close(ctx) + h.file = p9file{} + } if h.fd >= 0 { unix.Close(int(h.fd)) h.fd = -1 @@ -89,19 +127,27 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs return n, err } if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { - n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) - return uint64(n), err + if h.fdLisa.Client() != nil { + return h.fdLisa.Read(ctx, dsts.Head().ToSlice(), offset) + } + return h.file.readAt(ctx, dsts.Head().ToSlice(), offset) } // Buffer the read since p9.File.ReadAt() takes []byte. 
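Before the buffered-read path picks up below, a note on openHandleLisa above: it maps the (read, write, trunc) request onto host-style open flags before asking the server for an FD. The mapping is small enough to restate as standalone code; this mirrors the switch shown in the patch rather than calling into lisafs:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// accessFlags picks exactly one of O_RDONLY, O_WRONLY or O_RDWR and ORs in
// O_TRUNC when truncation was requested.
func accessFlags(read, write, trunc bool) (uint32, error) {
	var flags uint32
	switch {
	case read && write:
		flags = unix.O_RDWR
	case read:
		flags = unix.O_RDONLY
	case write:
		flags = unix.O_WRONLY
	default:
		return 0, fmt.Errorf("handle must be readable or writable")
	}
	if trunc {
		flags |= unix.O_TRUNC
	}
	return flags, nil
}

func main() {
	f, _ := accessFlags(true, true, true)
	fmt.Printf("%#o\n", f) // O_RDWR|O_TRUNC
}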
buf := make([]byte, dsts.NumBytes()) - n, err := h.file.readAt(ctx, buf, offset) + var n uint64 + var err error + if h.fdLisa.Client() != nil { + n, err = h.fdLisa.Read(ctx, buf, offset) + } else { + n, err = h.file.readAt(ctx, buf, offset) + } if n == 0 { return 0, err } if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { return cp, cperr } - return uint64(n), err + return n, err } func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { @@ -115,8 +161,10 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o return n, err } if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { - n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) - return uint64(n), err + if h.fdLisa.Client() != nil { + return h.fdLisa.Write(ctx, srcs.Head().ToSlice(), offset) + } + return h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) } // Buffer the write since p9.File.WriteAt() takes []byte. buf := make([]byte, srcs.NumBytes()) @@ -124,12 +172,18 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o if cp == 0 { return 0, cperr } - n, err := h.file.writeAt(ctx, buf[:cp], offset) + var n uint64 + var err error + if h.fdLisa.Client() != nil { + n, err = h.fdLisa.Write(ctx, buf[:cp], offset) + } else { + n, err = h.file.writeAt(ctx, buf[:cp], offset) + } // err takes precedence over cperr. if err != nil { - return uint64(n), err + return n, err } - return uint64(n), cperr + return n, cperr } type handleReadWriter struct { diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 5a3ddfc9d..0d97b60fd 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -141,18 +141,18 @@ func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, u return fdobj, qid, iounit, err } -func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { +func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (uint64, error) { ctx.UninterruptibleSleepStart(false) n, err := f.file.ReadAt(p, offset) ctx.UninterruptibleSleepFinish(false) - return n, err + return uint64(n), err } -func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { +func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (uint64, error) { ctx.UninterruptibleSleepStart(false) n, err := f.file.WriteAt(p, offset) ctx.UninterruptibleSleepFinish(false) - return n, err + return uint64(n), err } func (f p9file) fsync(ctx context.Context) error { diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 947dbe05f..874f9873d 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -98,6 +98,12 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() + if d.fs.opts.lisaEnabled { + if !d.writeFDLisa.Ok() { + return nil + } + return d.writeFDLisa.Flush(ctx) + } if d.writeFile.isNil() { return nil } @@ -110,6 +116,9 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint return d.doAllocate(ctx, offset, length, func() error { d.handleMu.RLock() defer d.handleMu.RUnlock() + if d.fs.opts.lisaEnabled { + return d.writeFDLisa.Allocate(ctx, mode, offset, length) + } return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) }) } @@ -282,8 +291,19 @@ func (fd 
*regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // changes to the host. if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { atomic.StoreUint32(&d.mode, newMode) - if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { - return 0, offset, err + if d.fs.opts.lisaEnabled { + stat := linux.Statx{Mask: linux.STATX_MODE, Mode: uint16(newMode)} + failureMask, failureErr, err := d.controlFDLisa.SetStat(ctx, &stat) + if err != nil { + return 0, offset, err + } + if failureMask != 0 { + return 0, offset, failureErr + } + } else { + if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { + return 0, offset, err + } } } } @@ -677,7 +697,7 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */) + return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 226790a11..5d4009832 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -15,7 +15,9 @@ package gofer import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -234,28 +236,54 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // Lock metadata on all dentries *before* getting attributes for them. state.lockAllMetadata() - stats, err := state.start.file.multiGetAttr(ctx, state.names) - if err != nil { - return err + + var ( + stats []p9.FullStat + statsLisa []linux.Statx + numStats int + ) + if fs.opts.lisaEnabled { + var err error + statsLisa, err = state.start.controlFDLisa.WalkStat(ctx, state.names) + if err != nil { + return err + } + numStats = len(statsLisa) + } else { + var err error + stats, err = state.start.file.multiGetAttr(ctx, state.names) + if err != nil { + return err + } + numStats = len(stats) } i := -1 for d := state.popFront(); d != nil; d = state.popFront() { i++ - found := i < len(stats) + found := i < numStats if i == 0 && len(state.names[0]) == 0 { if found && !d.isSynthetic() { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. + if fs.opts.lisaEnabled { + d.updateFromLisaStatLocked(&statsLisa[i]) // +checklocksforce: acquired by lockAllMetadata. + } else { + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. + } } d.metadataMu.Unlock() // +checklocksforce: see above. continue } - // Note that synthetic dentries will always fails the comparison check - // below. - if !found || d.qidPath != stats[i].QID.Path { + // Note that synthetic dentries will always fail this comparison check. 
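The invalidation decision implemented immediately below compares each cached dentry's identity against the fresh stat returned by the gofer: with lisafs the identity is the (device, inode) key, with p9 it is the QID path; a synthetic dentry has neither, so it always lands in the mismatch branch, which is what the comment above is pointing out. A compact standalone restatement of the check (illustrative types):

package main

import "fmt"

type inoKey struct {
	ino                uint64
	devMinor, devMajor uint32
}

// shouldInvalidate mirrors the check below: drop the cached dentry when the
// remote walk found nothing at that name, or when the file found there is a
// different inode than the one cached. A synthetic dentry carries a
// zero-valued key, so it never equals the key of a real remote file.
func shouldInvalidate(found bool, cached, fresh inoKey) bool {
	return !found || cached != fresh
}

func main() {
	k := inoKey{ino: 10, devMajor: 8}
	fmt.Println(shouldInvalidate(true, k, k))                            // false: same inode, keep it.
	fmt.Println(shouldInvalidate(true, k, inoKey{ino: 11, devMajor: 8})) // true: replaced remotely.
	fmt.Println(shouldInvalidate(false, k, inoKey{}))                    // true: nothing found remotely.
}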
+ var shouldInvalidate bool + if fs.opts.lisaEnabled { + shouldInvalidate = !found || d.inoKey != inoKeyFromStat(&statsLisa[i]) + } else { + shouldInvalidate = !found || d.qidPath != stats[i].QID.Path + } + if shouldInvalidate { d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace @@ -298,7 +326,11 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // The file at this path hasn't changed. Just update cached metadata. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. + if fs.opts.lisaEnabled { + d.updateFromLisaStatLocked(&statsLisa[i]) // +checklocksforce: see above. + } else { + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. + } d.metadataMu.Unlock() } diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go index 8dcbc61ed..475322527 100644 --- a/pkg/sentry/fsimpl/gofer/save_restore.go +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/safemem" @@ -112,10 +113,19 @@ func (d *dentry) prepareSaveRecursive(ctx context.Context) error { return err } } - if !d.readFile.isNil() || !d.writeFile.isNil() { - d.fs.savedDentryRW[d] = savedDentryRW{ - read: !d.readFile.isNil(), - write: !d.writeFile.isNil(), + if d.fs.opts.lisaEnabled { + if d.readFDLisa.Ok() || d.writeFDLisa.Ok() { + d.fs.savedDentryRW[d] = savedDentryRW{ + read: d.readFDLisa.Ok(), + write: d.writeFDLisa.Ok(), + } + } + } else { + if !d.readFile.isNil() || !d.writeFile.isNil() { + d.fs.savedDentryRW[d] = savedDentryRW{ + read: !d.readFile.isNil(), + write: !d.writeFile.isNil(), + } } } d.dirMu.Lock() @@ -177,25 +187,37 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID) } fs.opts.fd = fd - if err := fs.dial(ctx); err != nil { - return err - } fs.inoByQIDPath = make(map[uint64]uint64) + fs.inoByKey = make(map[inoKey]uint64) - // Restore the filesystem root. - ctx.UninterruptibleSleepStart(false) - attached, err := fs.client.Attach(fs.opts.aname) - ctx.UninterruptibleSleepFinish(false) - if err != nil { - return err - } - attachFile := p9file{attached} - qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) - if err != nil { - return err - } - if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil { - return err + if fs.opts.lisaEnabled { + rootInode, err := fs.initClientLisa(ctx) + if err != nil { + return err + } + if err := fs.root.restoreFileLisa(ctx, rootInode, &opts); err != nil { + return err + } + } else { + if err := fs.dial(ctx); err != nil { + return err + } + + // Restore the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := fs.client.Attach(fs.opts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + return err + } + if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil { + return err + } } // Restore remaining dentries. 
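restoreFileLisa in the next hunk reconciles two constraints: FD IDs and inode keys are not preserved across checkpoint/restore, yet the application must keep seeing the same inode numbers. It therefore binds the freshly observed (device, inode) key to the dentry's pre-existing d.ino in the rebuilt inoByKey table. A small sketch of that re-association, using illustrative structures rather than the gofer's actual fields:

package main

import (
	"fmt"
	"sync"
)

type inoKey struct {
	ino                uint64
	devMinor, devMajor uint32
}

type dentry struct {
	ino uint64 // Sentry-visible inode number, preserved across restore.
	key inoKey // New identity reported by the restored gofer.
}

type fs struct {
	mu       sync.Mutex
	inoByKey map[inoKey]uint64
}

// rebind makes future lookups of the new key resolve to the dentry's old,
// application-visible inode number.
func (f *fs) rebind(d *dentry, fresh inoKey) {
	d.key = fresh
	f.mu.Lock()
	f.inoByKey[fresh] = d.ino
	f.mu.Unlock()
}

func main() {
	f := &fs{inoByKey: make(map[inoKey]uint64)}
	d := &dentry{ino: 1234}
	f.rebind(d, inoKey{ino: 99, devMajor: 8, devMinor: 2})
	fmt.Println(f.inoByKey[d.key]) // 1234: the inode number survives restore.
}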
@@ -283,6 +305,55 @@ func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrM return nil } +func (d *dentry) restoreFileLisa(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions) error { + d.controlFDLisa = d.fs.clientLisa.NewFD(inode.ControlFD) + + // Gofers do not preserve inoKey across checkpoint/restore, so: + // + // - We must assume that the remote filesystem did not change in a way that + // would invalidate dentries, since we can't revalidate dentries by + // checking inoKey. + // + // - We need to associate the new inoKey with the existing d.ino. + d.inoKey = inoKeyFromStat(&inode.Stat) + d.fs.inoMu.Lock() + d.fs.inoByKey[d.inoKey] = d.ino + d.fs.inoMu.Unlock() + + // Check metadata stability before updating metadata. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.isRegularFile() { + if opts.ValidateFileSizes { + if inode.Stat.Mask&linux.STATX_SIZE == 0 { + return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d)) + } + if d.size != inode.Stat.Size { + return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, inode.Stat.Size) + } + } + if opts.ValidateFileModificationTimestamps { + if inode.Stat.Mask&linux.STATX_MTIME == 0 { + return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d)) + } + if want := dentryTimestampFromLisa(inode.Stat.Mtime); d.mtime != want { + return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want)) + } + } + } + if !d.cachedMetadataAuthoritative() { + d.updateFromLisaStatLocked(&inode.Stat) + } + + if rw, ok := d.fs.savedDentryRW[d]; ok { + if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { + return err + } + } + + return nil +} + // Preconditions: d is not synthetic. func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { for _, child := range d.children { @@ -305,19 +376,35 @@ func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.Comp // only be detected by checking filesystem.syncableDentries). d.parent has been // restored.
func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { - qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name) - if err != nil { - return err - } - if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil { - return err + if d.fs.opts.lisaEnabled { + inode, err := d.parent.controlFDLisa.Walk(ctx, d.name) + if err != nil { + return err + } + if err := d.restoreFileLisa(ctx, inode, opts); err != nil { + return err + } + } else { + qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name) + if err != nil { + return err + } + if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil { + return err + } } return d.restoreDescendantsRecursive(ctx, opts) } func (fd *specialFileFD) completeRestore(ctx context.Context) error { d := fd.dentry() - h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + var h handle + var err error + if d.fs.opts.lisaEnabled { + h, err = openHandleLisa(ctx, d.controlFDLisa, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + } else { + h, err = openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + } if err != nil { return err } diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index fe15f8583..86ab70453 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -59,11 +59,6 @@ func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { // BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { - cf, ok := sockTypeToP9(ce.Type()) - if !ok { - return syserr.ErrConnectionRefused - } - // No lock ordering required as only the ConnectingEndpoint has a mutex. ce.Lock() @@ -77,7 +72,7 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec return syserr.ErrInvalidEndpointState } - c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue()) + c, err := e.newConnectedEndpoint(ctx, ce.Type(), ce.WaiterQueue()) if err != nil { ce.Unlock() return err @@ -95,7 +90,7 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec // UnidirectionalConnect implements // transport.BoundEndpoint.UnidirectionalConnect. 
func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { - c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{}) + c, err := e.newConnectedEndpoint(ctx, linux.SOCK_DGRAM, &waiter.Queue{}) if err != nil { return nil, err } @@ -111,25 +106,39 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect return c, nil } -func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { - hostFile, err := e.dentry.file.connect(ctx, flags) - if err != nil { +func (e *endpoint) newConnectedEndpoint(ctx context.Context, sockType linux.SockType, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { + if e.dentry.fs.opts.lisaEnabled { + hostSockFD, err := e.dentry.controlFDLisa.Connect(ctx, sockType) + if err != nil { + return nil, syserr.ErrConnectionRefused + } + + c, serr := host.NewSCMEndpoint(ctx, hostSockFD, queue, e.path) + if serr != nil { + unix.Close(hostSockFD) + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v sockType %d: %v", e.dentry.file, sockType, serr) + return nil, serr + } + return c, nil + } + + flags, ok := sockTypeToP9(sockType) + if !ok { return nil, syserr.ErrConnectionRefused } - // Dup the fd so that the new endpoint can manage its lifetime. - hostFD, err := unix.Dup(hostFile.FD()) + hostFile, err := e.dentry.file.connect(ctx, flags) if err != nil { - log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) - return nil, syserr.FromError(err) + return nil, syserr.ErrConnectionRefused } - // After duplicating, we no longer need hostFile. - hostFile.Close() - c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) + c, serr := host.NewSCMEndpoint(ctx, hostFile.FD(), queue, e.path) if serr != nil { - log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr) + hostFile.Close() + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v sockType %d: %v", e.dentry.file, sockType, serr) return nil, serr } + // Ownership has been transferred to c. + hostFile.Release() return c, nil } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a8d47b65b..c568bbfd2 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" @@ -149,6 +150,9 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } + if fs := fd.filesystem(); fs.opts.lisaEnabled { + return fd.handle.fdLisa.Flush(ctx) + } return fd.handle.file.flush(ctx) } @@ -184,6 +188,9 @@ func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint if fd.isRegularFile { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { + if d.fs.opts.lisaEnabled { + return fd.handle.fdLisa.Allocate(ctx, mode, offset, length) + } return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) }) } @@ -371,10 +378,10 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( // Sync implements vfs.FileDescriptionImpl.Sync. 
func (fd *specialFileFD) Sync(ctx context.Context) error { - return fd.sync(ctx, false /* forFilesystemSync */) + return fd.sync(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */) } -func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { +func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error { // Locks to ensure it didn't race with fd.Release(). fd.releaseMu.RLock() defer fd.releaseMu.RUnlock() @@ -391,6 +398,13 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error ctx.UninterruptibleSleepFinish(false) return err } + if fs := fd.filesystem(); fs.opts.lisaEnabled { + if accFsyncFDIDsLisa != nil { + *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, fd.handle.fdLisa.ID()) + return nil + } + return fd.handle.fdLisa.Sync(ctx) + } return fd.handle.file.fsync(ctx) }() if err != nil { diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index dbd834c67..27d9be5c4 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -35,7 +35,13 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { return target, nil } } - target, err := d.file.readlink(ctx) + var target string + var err error + if d.fs.opts.lisaEnabled { + target, err = d.controlFDLisa.ReadLinkAt(ctx) + } else { + target, err = d.file.readlink(ctx) + } if d.fs.opts.interop != InteropModeShared { if err == nil { d.haveTarget = true diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 9cbe805b9..07940b225 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -17,6 +17,7 @@ package gofer import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -24,6 +25,10 @@ func dentryTimestampFromP9(s, ns uint64) int64 { return int64(s*1e9 + ns) } +func dentryTimestampFromLisa(t linux.StatxTimestamp) int64 { + return t.Sec*1e9 + int64(t.Nsec) +} + // Preconditions: d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtime(mnt *vfs.Mount) { if mnt.Flags.NoATime || mnt.ReadOnly() { diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 7fd7f000d..40aff2927 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -223,6 +223,12 @@ func (rp *ResolvingPath) Final() bool { return rp.curPart == 0 && !rp.pit.NextOk() } +// Pit returns a copy of rp's current path iterator. Modifying the iterator +// does not change rp. +func (rp *ResolvingPath) Pit() fspath.Iterator { + return rp.pit +} + // Component returns the current path component in the stream represented by // rp. // |
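For reference, the time.go hunk above converts a statx timestamp into the dentry's single int64 nanosecond representation. A standalone version of the same arithmetic with a usage example, assuming the same int64-nanoseconds convention (the struct here just mirrors the shape of linux.StatxTimestamp):

package main

import "fmt"

// statxTimestamp mirrors linux.StatxTimestamp: whole seconds plus a
// nanosecond remainder.
type statxTimestamp struct {
	Sec  int64
	Nsec uint32
}

// toNanos matches dentryTimestampFromLisa: seconds scaled to nanoseconds plus
// the fractional part.
func toNanos(t statxTimestamp) int64 {
	return t.Sec*1e9 + int64(t.Nsec)
}

func main() {
	ts := statxTimestamp{Sec: 1632250029, Nsec: 500000000}
	fmt.Println(toNanos(ts)) // 1632250029500000000
}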