From 05c89af6edde6158844d4debfe68bc598fec4418 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 19 May 2020 13:45:23 -0700 Subject: Implement mmap for host fs in vfs2. In VFS1, both fs/host and fs/gofer used the same utils for host file mappings. Refactor parts of fsimpl/gofer to create similar utils to share with fsimpl/host (memory accounting code moved to fsutil, page rounding arithmetic moved to usermem). Updates #1476. PiperOrigin-RevId: 312345090 --- pkg/sentry/fsimpl/gofer/BUILD | 1 - pkg/sentry/fsimpl/gofer/gofer.go | 4 +- pkg/sentry/fsimpl/gofer/pagemath.go | 31 -------- pkg/sentry/fsimpl/gofer/regular_file.go | 47 +++--------- pkg/sentry/fsimpl/host/BUILD | 4 + pkg/sentry/fsimpl/host/host.go | 70 ++++++++++++----- pkg/sentry/fsimpl/host/mmap.go | 132 ++++++++++++++++++++++++++++++++ 7 files changed, 197 insertions(+), 92 deletions(-) delete mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go create mode 100644 pkg/sentry/fsimpl/host/mmap.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 5ce82b793..67e916525 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -36,7 +36,6 @@ go_library( "gofer.go", "handle.go", "p9file.go", - "pagemath.go", "regular_file.go", "socket.go", "special_file.go", diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index ebf063a58..6295f6b54 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -928,8 +928,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // so we can't race with Write or another truncate.) d.dataMu.Unlock() if d.size < oldSize { - oldpgend := pageRoundUp(oldSize) - newpgend := pageRoundUp(d.size) + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go deleted file mode 100644 index 847cb0784..000000000 --- a/pkg/sentry/fsimpl/gofer/pagemath.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "gvisor.dev/gvisor/pkg/usermem" -) - -// This are equivalent to usermem.Addr.RoundDown/Up, but without the -// potentially truncating conversion to usermem.Addr. This is necessary because -// there is no way to define generic "PageRoundDown/Up" functions in Go. - -func pageRoundDown(x uint64) uint64 { - return x &^ (usermem.PageSize - 1) -} - -func pageRoundUp(x uint64) uint64 { - return pageRoundDown(x + usermem.PageSize - 1) -} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 857f7c74e..0d10cf7ac 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -148,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } // Remove touched pages from the cache. - pgstart := pageRoundDown(uint64(offset)) - pgend := pageRoundUp(uint64(offset + src.NumBytes())) - if pgend < pgstart { + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} @@ -306,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. + gapEnd, _ := usermem.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ - Start: pageRoundDown(gapMR.Start), - End: pageRoundUp(gapMR.End), + Start: usermem.PageRoundDown(gapMR.Start), + End: gapEnd, } optMR := gap.Range() err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) @@ -671,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. - pgend := pageRoundUp(d.size) + pgend, _ := usermem.PageRoundUp(d.size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { @@ -818,43 +819,15 @@ type dentryPlatformFile struct { // IncRef implements platform.File.IncRef. func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { d.dataMu.Lock() - seg, gap := d.fdRefs.Find(fr.Start) - for { - switch { - case seg.Ok() && seg.Start() < fr.End: - seg = d.fdRefs.Isolate(seg, fr) - seg.SetValue(seg.Value() + 1) - seg, gap = seg.NextNonEmpty() - case gap.Ok() && gap.Start() < fr.End: - newRange := gap.Range().Intersect(fr) - usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) - seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() - default: - d.fdRefs.MergeAdjacent(fr) - d.dataMu.Unlock() - return - } - } + d.fdRefs.IncRefAndAccount(fr) + d.dataMu.Unlock() } // DecRef implements platform.File.DecRef. func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { d.dataMu.Lock() - seg := d.fdRefs.FindSegment(fr.Start) - - for seg.Ok() && seg.Start() < fr.End { - seg = d.fdRefs.Isolate(seg, fr) - if old := seg.Value(); old == 1 { - usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) - seg = d.fdRefs.Remove(seg).NextSegment() - } else { - seg.SetValue(old - 1) - seg = seg.NextSegment() - } - } - d.fdRefs.MergeAdjacent(fr) + d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() - } // MapInternal implements platform.File.MapInternal. diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 39509f703..ca0fe6d2b 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -8,6 +8,7 @@ go_library( "control.go", "host.go", "ioctl_unsafe.go", + "mmap.go", "socket.go", "socket_iovec.go", "socket_unsafe.go", @@ -23,12 +24,15 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 8caf55a1b..65981197d 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -86,15 +86,16 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) i := &inode{ hostFD: hostFD, - seekable: seekable, + ino: fs.NextIno(), isTTY: opts.IsTTY, - canMap: canMap(uint32(fileType)), wouldBlock: wouldBlock(uint32(fileType)), - ino: fs.NextIno(), + seekable: seekable, // For simplicity, set offset to 0. Technically, we should use the existing // offset on the host if the file is seekable. offset: 0, + canMap: canMap(uint32(fileType)), } + i.pf.inode = i // Non-seekable files can't be memory mapped, assert this. if !i.seekable && i.canMap { @@ -189,11 +190,15 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int - // wouldBlock is true if the host FD would return EWOULDBLOCK for - // operations that would block. + // ino is an inode number unique within this filesystem. // // This field is initialized at creation time and is immutable. - wouldBlock bool + ino uint64 + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return @@ -202,29 +207,36 @@ type inode struct { // This field is initialized at creation time and is immutable. seekable bool - // isTTY is true if this file represents a TTY. + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. It is only meaningful when + // seekable is true. + offset int64 + + // wouldBlock is true if the host FD would return EWOULDBLOCK for + // operations that would block. // // This field is initialized at creation time and is immutable. - isTTY bool + wouldBlock bool + + // Event queue for blocking operations. + queue waiter.Queue // canMap specifies whether we allow the file to be memory mapped. // // This field is initialized at creation time and is immutable. canMap bool - // ino is an inode number unique within this filesystem. - // - // This field is initialized at creation time and is immutable. - ino uint64 - - // offsetMu protects offset. - offsetMu sync.Mutex + // mapsMu protects mappings. + mapsMu sync.Mutex - // offset specifies the current file offset. - offset int64 + // If canMap is true, mappings tracks mappings of hostFD into + // memmap.MappingSpaces. + mappings memmap.MappingSet - // Event queue for blocking operations. - queue waiter.Queue + // pf implements platform.File for mappings of hostFD. + pf inodePlatformFile } // CheckPermissions implements kernfs.Inode. @@ -388,6 +400,21 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } + oldSize := uint64(hostStat.Size) + if s.Size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(s.Size) + if oldpgend != newpgend { + i.mapsMu.Lock() + i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + i.mapsMu.Unlock() + } + } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { ts := [2]syscall.Timespec{ @@ -666,8 +693,9 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts if !f.inode.canMap { return syserror.ENODEV } - // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. - return syserror.ENODEV + i := f.inode + i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init) + return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go new file mode 100644 index 000000000..8545a82f0 --- /dev/null +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -0,0 +1,132 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// inodePlatformFile implements platform.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// +// inodePlatformFile should only be used if inode.canMap is true. +type inodePlatformFile struct { + *inode + + // fdRefsMu protects fdRefs. + fdRefsMu sync.Mutex + + // fdRefs counts references on platform.File offsets. It is used solely for + // memory accounting. + fdRefs fsutil.FrameRefSet + + // fileMapper caches mappings of the host file represented by this inode. + fileMapper fsutil.HostFileMapper + + // fileMapperInitOnce is used to lazily initialize fileMapper. + fileMapperInitOnce sync.Once +} + +// IncRef implements platform.File.IncRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) IncRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.IncRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// DecRef implements platform.File.DecRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) DecRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.DecRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// MapInternal implements platform.File.MapInternal. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) +} + +// FD implements platform.File.FD. +func (i *inodePlatformFile) FD() int { + return i.hostFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + i.mapsMu.Lock() + mapped := i.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + i.pf.fileMapper.IncRefOn(r) + } + i.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + i.mapsMu.Lock() + unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + i.pf.fileMapper.DecRefOn(r) + } + i.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return i.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + mr := optional + return []memmap.Translation{ + { + Source: mr, + File: &i.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) InvalidateUnsavable(ctx context.Context) error { + // We expect the same host fd across save/restore, so all translations + // should be valid. + return nil +} -- cgit v1.2.3 From 76369b6480b69bb7bf680db36ce8787fb324911c Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 20 May 2020 14:49:40 -0700 Subject: Move fsimpl/host file offset from inode to fileDescription. PiperOrigin-RevId: 312559861 --- pkg/sentry/fsimpl/host/host.go | 78 ++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 40 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 65981197d..18b127521 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -90,10 +90,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) isTTY: opts.IsTTY, wouldBlock: wouldBlock(uint32(fileType)), seekable: seekable, - // For simplicity, set offset to 0. Technically, we should use the existing - // offset on the host if the file is seekable. - offset: 0, - canMap: canMap(uint32(fileType)), + canMap: canMap(uint32(fileType)), } i.pf.inode = i @@ -118,6 +115,10 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // i.open will take a reference on d. defer d.DecRef() + + // For simplicity, fileDescription.offset is set to 0. Technically, we + // should only set to 0 on files that are not seekable (sockets, pipes, + // etc.), and use the offset from the host fd otherwise when importing. return i.open(ctx, d.VFSDentry(), mnt, flags) } @@ -207,13 +208,6 @@ type inode struct { // This field is initialized at creation time and is immutable. seekable bool - // offsetMu protects offset. - offsetMu sync.Mutex - - // offset specifies the current file offset. It is only meaningful when - // seekable is true. - offset int64 - // wouldBlock is true if the host FD would return EWOULDBLOCK for // operations that would block. // @@ -491,9 +485,6 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return vfsfd, nil } - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. fd := &fileDescription{inode: i} vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { @@ -514,6 +505,13 @@ type fileDescription struct { // // inode is immutable after fileDescription creation. inode *inode + + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. It is only meaningful when + // inode.seekable is true. + offset int64 } // SetStat implements vfs.FileDescriptionImpl. @@ -559,10 +557,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts return n, err } // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - i.offsetMu.Lock() - n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags) - i.offset += n - i.offsetMu.Unlock() + f.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() return n, err } @@ -599,10 +597,10 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt } // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. - i.offsetMu.Lock() - n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags) - i.offset += n - i.offsetMu.Unlock() + f.offsetMu.Lock() + n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() return n, err } @@ -627,41 +625,41 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return 0, syserror.ESPIPE } - i.offsetMu.Lock() - defer i.offsetMu.Unlock() + f.offsetMu.Lock() + defer f.offsetMu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - i.offset = offset + f.offset = offset case linux.SEEK_CUR: - // Check for overflow. Note that underflow cannot occur, since i.offset >= 0. - if offset > math.MaxInt64-i.offset { - return i.offset, syserror.EOVERFLOW + // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. + if offset > math.MaxInt64-f.offset { + return f.offset, syserror.EOVERFLOW } - if i.offset+offset < 0 { - return i.offset, syserror.EINVAL + if f.offset+offset < 0 { + return f.offset, syserror.EINVAL } - i.offset += offset + f.offset += offset case linux.SEEK_END: var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { - return i.offset, err + return f.offset, err } size := s.Size // Check for overflow. Note that underflow cannot occur, since size >= 0. if offset > math.MaxInt64-size { - return i.offset, syserror.EOVERFLOW + return f.offset, syserror.EOVERFLOW } if size+offset < 0 { - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - i.offset = size + offset + f.offset = size + offset case linux.SEEK_DATA, linux.SEEK_HOLE: // Modifying the offset in the host file table should not matter, since @@ -670,16 +668,16 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // For reading and writing, we always rely on our internal offset. n, err := unix.Seek(i.hostFD, offset, int(whence)) if err != nil { - return i.offset, err + return f.offset, err } - i.offset = n + f.offset = n default: // Invalid whence. - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - return i.offset, nil + return f.offset, nil } // Sync implements FileDescriptionImpl. -- cgit v1.2.3 From af3121a52383fb60579d769994be5d91bd788015 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 May 2020 21:42:07 -0700 Subject: Implement splice(2) and tee(2) for VFS2. Updates #138 PiperOrigin-RevId: 313326354 --- pkg/buffer/safemem.go | 82 ++++----- pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 2 + pkg/sentry/kernel/pipe/pipe.go | 6 + pkg/sentry/kernel/pipe/pipe_unsafe.go | 35 ++++ pkg/sentry/kernel/pipe/vfs.go | 219 ++++++++++++++++++++++- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/splice.go | 286 +++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 4 +- pkg/sentry/vfs/file_description.go | 5 + test/syscalls/linux/splice.cc | 49 ++++++ 11 files changed, 648 insertions(+), 43 deletions(-) create mode 100644 pkg/sentry/kernel/pipe/pipe_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/splice.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go index 0e5b86344..b789e56e9 100644 --- a/pkg/buffer/safemem.go +++ b/pkg/buffer/safemem.go @@ -28,12 +28,11 @@ func (b *buffer) ReadBlock() safemem.Block { return safemem.BlockFromSafeSlice(b.ReadSlice()) } -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -// -// This will advance the write index. -func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - need := int(srcs.NumBytes()) - if need == 0 { +// WriteFromSafememReader writes up to count bytes from r to v and advances the +// write index by the number of bytes written. It calls r.ReadToBlocks() at +// most once. +func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) { + if count == 0 { return 0, nil } @@ -50,32 +49,33 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { } // Does the last block have sufficient capacity alone? - if l := firstBuf.WriteSize(); l >= need { - dst = safemem.BlockSeqOf(firstBuf.WriteBlock()) + if l := uint64(firstBuf.WriteSize()); l >= count { + dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count)) } else { // Append blocks until sufficient. - need -= l + count -= l blocks = append(blocks, firstBuf.WriteBlock()) - for need > 0 { + for count > 0 { emptyBuf := bufferPool.Get().(*buffer) v.data.PushBack(emptyBuf) - need -= emptyBuf.WriteSize() - blocks = append(blocks, emptyBuf.WriteBlock()) + block := emptyBuf.WriteBlock().TakeFirst64(count) + count -= uint64(block.Len()) + blocks = append(blocks, block) } dst = safemem.BlockSeqFromSlice(blocks) } - // Perform the copy. - n, err := safemem.CopySeq(dst, srcs) + // Perform I/O. + n, err := r.ReadToBlocks(dst) v.size += int64(n) // Update all indices. - for left := int(n); left > 0; firstBuf = firstBuf.Next() { - if l := firstBuf.WriteSize(); left >= l { + for left := n; left > 0; firstBuf = firstBuf.Next() { + if l := firstBuf.WriteSize(); left >= uint64(l) { firstBuf.WriteMove(l) // Whole block. - left -= l + left -= uint64(l) } else { - firstBuf.WriteMove(left) // Partial block. + firstBuf.WriteMove(int(left)) // Partial block. left = 0 } } @@ -83,14 +83,16 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { return n, err } -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -// -// This will not advance the read index; the caller should follow -// this call with a call to TrimFront in order to remove the read -// data from the buffer. This is done to support pipe sematics. -func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - need := int(dsts.NumBytes()) - if need == 0 { +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the +// write index by the number of bytes written. +func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes()) +} + +// ReadToSafememWriter reads up to count bytes from v to w. It does not advance +// the read index. It calls w.WriteFromBlocks() at most once. +func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) { + if count == 0 { return 0, nil } @@ -105,25 +107,27 @@ func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { } // Is all the data in a single block? - if l := firstBuf.ReadSize(); l >= need { - src = safemem.BlockSeqOf(firstBuf.ReadBlock()) + if l := uint64(firstBuf.ReadSize()); l >= count { + src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count)) } else { // Build a list of all the buffers. - need -= l + count -= l blocks = append(blocks, firstBuf.ReadBlock()) - for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() { - need -= buf.ReadSize() - blocks = append(blocks, buf.ReadBlock()) + for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() { + block := buf.ReadBlock().TakeFirst64(count) + count -= uint64(block.Len()) + blocks = append(blocks, block) } src = safemem.BlockSeqFromSlice(blocks) } - // Perform the copy. - n, err := safemem.CopySeq(dsts, src) - - // See above: we would normally advance the read index here, but we - // don't do that in order to support pipe semantics. We rely on a - // separate call to TrimFront() in this case. + // Perform I/O. As documented, we don't advance the read index. + return w.WriteFromBlocks(src) +} - return n, err +// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the +// read index by the number of bytes read, such that it's only safe to call if +// the caller guarantees that ReadToBlocks will only be called once. +func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes()) } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 3f433d666..fee174375 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -312,7 +312,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off f := fd.inode().impl.(*regularFile) if end := offset + srclen; end < offset { // Overflow. - return 0, syserror.EFBIG + return 0, syserror.EINVAL } var err error diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index f29dc0472..7bfa9075a 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -8,6 +8,7 @@ go_library( "device.go", "node.go", "pipe.go", + "pipe_unsafe.go", "pipe_util.go", "reader.go", "reader_writer.go", @@ -20,6 +21,7 @@ go_library( "//pkg/amutex", "//pkg/buffer", "//pkg/context", + "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 62c8691f1..79645d7d2 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() + return p.readLocked(ctx, ops) +} +func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { // Is the pipe empty? if p.view.Size() == 0 { if !p.HasWriters() { @@ -246,7 +249,10 @@ type writeOps struct { func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() + return p.writeLocked(ctx, ops) +} +func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { // Can't write to a pipe with no readers. if !p.HasReaders() { return 0, syscall.EPIPE diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go new file mode 100644 index 000000000..dd60cba24 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "unsafe" +) + +// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be +// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that +// concurrent calls cannot deadlock. +// +// Preconditions: x != y. +func lockTwoPipes(x, y *Pipe) { + // Lock the two pipes in order of increasing address. + if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { + x.mu.Lock() + y.mu.Lock() + } else { + y.mu.Lock() + x.mu.Lock() + } +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index b54f08a30..2602bed72 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,7 +16,9 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -150,7 +152,9 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) * return &fd.vfsfd } -// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements +// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to +// other FileDescriptions for splice(2) and tee(2). type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -229,3 +233,216 @@ func (fd *VFSPipeFD) PipeSize() int64 { func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } + +// IOSequence returns a useremm.IOSequence that reads up to count bytes from, +// or writes up to count bytes to, fd. +func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { + return usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(dst)) + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return int64(len(dst)) + }, + limit: func(l int64) { + dst = dst[:l] + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadAt(dst, 0) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// CopyOut implements usermem.IO.CopyOut. +func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(src)) + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return int64(len(src)) + }, + limit: func(l int64) { + src = src[:l] + }, + write: func(view *buffer.View) (int64, error) { + view.Append(src) + return int64(len(src)), nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + origCount := toZero + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return toZero + }, + limit: func(l int64) { + toZero = l + }, + write: func(view *buffer.View) (int64, error) { + view.Grow(view.Size()+toZero, true /* zero */) + return toZero, nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToSafememWriter(dst, uint64(count)) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromSafememReader(src, uint64(count)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + // How did a pipe get passed as the virtual address space to futex(2)? + panic("VFSPipeFD.SwapUint32 called unexpectedly") +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") +} + +// LoadUint32 implements usermem.IO.LoadUint32. +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.LoadUint32 called unexpectedly") +} + +// Splice reads up to count bytes from src and writes them to dst. It returns +// the number of bytes moved. +// +// Preconditions: count > 0. +func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */) +} + +// Tee reads up to count bytes from src and writes them to dst, without +// removing the read bytes from src. It returns the number of bytes copied. +// +// Preconditions: count > 0. +func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */) +} + +// Preconditions: count > 0. +func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { + if dst.pipe == src.pipe { + return 0, syserror.EINVAL + } + + lockTwoPipes(dst.pipe, src.pipe) + defer dst.pipe.mu.Unlock() + defer src.pipe.mu.Unlock() + + n, err := dst.pipe.writeLocked(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(dstView *buffer.View) (int64, error) { + return src.pipe.readLocked(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(srcView *buffer.View) (int64, error) { + n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) + if n > 0 && removeFromSrc { + srcView.TrimFront(int64(n)) + } + return int64(n), err + }, + }) + }, + }) + if n > 0 { + dst.pipe.Notify(waiter.EventIn) + src.pipe.Notify(waiter.EventOut) + } + return n, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index f882ef840..d56927ff5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -22,6 +22,7 @@ go_library( "setstat.go", "signal.go", "socket.go", + "splice.go", "stat.go", "stat_amd64.go", "stat_arm64.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go new file mode 100644 index 000000000..8f3c22a02 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -0,0 +1,286 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Splice implements Linux syscall splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + inOffsetPtr := args[1].Pointer() + outFD := args[2].Int() + outOffsetPtr := args[3].Pointer() + count := int64(args[4].SizeT()) + flags := args[5].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // At least one file description must represent a pipe. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe && !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy in offsets. + inOffset := int64(-1) + if inOffsetPtr != 0 { + if inIsPipe { + return 0, nil, syserror.ESPIPE + } + if inFile.Options().DenyPRead { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + if inOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + outOffset := int64(-1) + if outOffsetPtr != 0 { + if outIsPipe { + return 0, nil, syserror.ESPIPE + } + if outFile.Options().DenyPWrite { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + if outOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + + // Move data. + var ( + n int64 + err error + inCh chan struct{} + outCh chan struct{} + ) + for { + // If both input and output are pipes, delegate to the pipe + // implementation. Otherwise, exactly one end is a pipe, which we + // ensure is consistently ordered after the non-pipe FD's locks by + // passing the pipe FD as usermem.IO to the non-pipe end. + switch { + case inIsPipe && outIsPipe: + n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) + case inIsPipe: + if outOffset != -1 { + n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{}) + outOffset += n + } else { + n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{}) + } + case outIsPipe: + if inOffset != -1 { + n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{}) + inOffset += n + } else { + n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) + } + } + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + break + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the splice operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err = t.Block(inCh); err != nil { + break + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err = t.Block(outCh); err != nil { + break + } + } + } + + // Copy updated offsets out. + if inOffsetPtr != 0 { + if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + } + if outOffsetPtr != 0 { + if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + } + + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Tee implements Linux syscall tee(2). +func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + outFD := args[1].Int() + count := int64(args[2].SizeT()) + flags := args[3].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // Both file descriptions must represent pipes. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe || !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy data. + var ( + inCh chan struct{} + outCh chan struct{} + ) + for { + n, err := pipe.Tee(t, outPipeFD, inPipeFD, count) + if n != 0 { + return uintptr(n), nil, nil + } + if err != syserror.ErrWouldBlock || nonBlock { + return 0, nil, err + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the tee operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err := t.Block(inCh); err != nil { + return 0, nil, err + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err := t.Block(outCh); err != nil { + return 0, nil, err + } + } + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index a332d01bd..083fdcf82 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -134,8 +134,8 @@ func Override() { s.Table[269] = syscalls.Supported("faccessat", Faccessat) s.Table[270] = syscalls.Supported("pselect", Pselect) s.Table[271] = syscalls.Supported("ppoll", Ppoll) - delete(s.Table, 275) // splice - delete(s.Table, 276) // tee + s.Table[275] = syscalls.Supported("splice", Splice) + s.Table[276] = syscalls.Supported("tee", Tee) s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) s.Table[280] = syscalls.Supported("utimensat", Utimensat) s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index cfabd936c..bb294563d 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -210,6 +210,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry { return fd.vd } +// Options returns the options passed to fd.Init(). +func (fd *FileDescription) Options() FileDescriptionOptions { + return fd.opts +} + // StatusFlags returns file description status flags, as for fcntl(F_GETFL). func (fd *FileDescription) StatusFlags() uint32 { return atomic.LoadUint32(&fd.statusFlags) diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc index f103e2e56..08fc4b1b7 100644 --- a/test/syscalls/linux/splice.cc +++ b/test/syscalls/linux/splice.cc @@ -430,6 +430,55 @@ TEST(SpliceTest, TwoPipes) { EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); } +TEST(SpliceTest, TwoPipesCircular) { + // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is + // based on fs.File.UniqueID, which does not prevent circular ordering between + // e.g. inode-level locks taken by fs.FileOperations. + SKIP_IF(IsRunningWithVFS1()); + + // Create two pipes. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor first_rfd(fds[0]); + const FileDescriptor first_wfd(fds[1]); + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor second_rfd(fds[0]); + const FileDescriptor second_wfd(fds[1]); + + // On Linux, each pipe is normally limited to + // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data. + constexpr size_t PIPE_DEF_BUFFERS = 16; + + // Write some data to each pipe. Below we splice 1 byte at a time between + // pipes, which very quickly causes each byte to be stored in a separate + // buffer, so we must ensure that the total amount of data in the system is <= + // PIPE_DEF_BUFFERS bytes. + std::vector buf(PIPE_DEF_BUFFERS / 2); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Have another thread splice from the second pipe to the first, while we + // splice from the first to the second. The test passes if this does not + // deadlock. + const int kIterations = 1000; + DisableSave ds; + ScopedThread t([&]() { + for (int i = 0; i < kIterations; i++) { + ASSERT_THAT( + splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0), + SyscallSucceedsWithValue(1)); + } + }); + for (int i = 0; i < kIterations; i++) { + ASSERT_THAT( + splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0), + SyscallSucceedsWithValue(1)); + } +} + TEST(SpliceTest, Blocking) { // Create two new pipes. int first[2], second[2]; -- cgit v1.2.3 From e028714a0dd390b2321c4beeac62c5b2904cd917 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 May 2020 22:42:54 -0700 Subject: Support dfltuid and dfltgid mount options in the VFS2 gofer client. PiperOrigin-RevId: 313332542 --- pkg/sentry/fsimpl/gofer/gofer.go | 48 +++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 6295f6b54..131da332f 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -84,12 +84,6 @@ type filesystem struct { // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 - // uid and gid are the effective KUID and KGID of the filesystem's creator, - // and are used as the owner and group for files that don't specify one. - // uid and gid are immutable. - uid auth.KUID - gid auth.KGID - // renameMu serves two purposes: // // - It synchronizes path resolution with renaming initiated by this @@ -122,6 +116,8 @@ type filesystemOptions struct { fd int aname string interop InteropMode // derived from the "cache" mount option + dfltuid auth.KUID + dfltgid auth.KGID msize uint32 version string @@ -230,6 +226,15 @@ type InternalFilesystemOptions struct { OpenSocketsByConnecting bool } +// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default +// UIDs and GIDs used for files that do not provide a specific owner or group +// respectively. +const ( + // uint32(-2) doesn't work in Go. + _V9FS_DEFUID = auth.KUID(4294967294) + _V9FS_DEFGID = auth.KGID(4294967294) +) + // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name @@ -315,6 +320,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + // Parse the default UID and GID. + fsopts.dfltuid = _V9FS_DEFUID + if dfltuidstr, ok := mopts["dfltuid"]; ok { + delete(mopts, "dfltuid") + dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr) + return nil, nil, syserror.EINVAL + } + // In Linux, dfltuid is interpreted as a UID and is converted to a KUID + // in the caller's user namespace, but goferfs isn't + // application-mountable. + fsopts.dfltuid = auth.KUID(dfltuid) + } + fsopts.dfltgid = _V9FS_DEFGID + if dfltgidstr, ok := mopts["dfltgid"]; ok { + delete(mopts, "dfltgid") + dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr) + return nil, nil, syserror.EINVAL + } + fsopts.dfltgid = auth.KGID(dfltgid) + } + // Parse the 9P message size. fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M if msizestr, ok := mopts["msize"]; ok { @@ -422,8 +452,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt client: client, clock: ktime.RealtimeClockFromContext(ctx), devMinor: devMinor, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -672,8 +700,8 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma file: file, ino: qid.Path, mode: uint32(attr.Mode), - uid: uint32(fs.uid), - gid: uint32(fs.gid), + uid: uint32(fs.opts.dfltuid), + gid: uint32(fs.opts.dfltgid), blockSize: usermem.PageSize, handle: handle{ fd: -1, -- cgit v1.2.3