summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs')
-rw-r--r--pkg/sentry/fs/copy_up.go13
-rw-r--r--pkg/sentry/fs/dev/BUILD1
-rw-r--r--pkg/sentry/fs/dev/net_tun.go69
-rw-r--r--pkg/sentry/fs/dirent.go12
-rw-r--r--pkg/sentry/fs/file_operations.go5
-rw-r--r--pkg/sentry/fs/fsutil/file_range_set.go31
-rw-r--r--pkg/sentry/fs/fsutil/host_file_mapper.go19
-rw-r--r--pkg/sentry/fs/fsutil/inode_cached.go21
-rw-r--r--pkg/sentry/fs/g3doc/fuse.md99
-rw-r--r--pkg/sentry/fs/host/BUILD1
-rw-r--r--pkg/sentry/fs/host/socket_unsafe.go4
-rw-r--r--pkg/sentry/fs/host/tty.go42
-rw-r--r--pkg/sentry/fs/inode.go2
-rw-r--r--pkg/sentry/fs/inode_overlay.go11
-rw-r--r--pkg/sentry/fs/overlay.go20
-rw-r--r--pkg/sentry/fs/proc/BUILD1
-rw-r--r--pkg/sentry/fs/proc/sys_net.go120
-rw-r--r--pkg/sentry/fs/proc/sys_net_state.go15
-rw-r--r--pkg/sentry/fs/proc/sys_net_test.go73
-rw-r--r--pkg/sentry/fs/proc/task.go46
-rw-r--r--pkg/sentry/fs/tmpfs/inode_file.go2
-rw-r--r--pkg/sentry/fs/tmpfs/tmpfs.go12
-rw-r--r--pkg/sentry/fs/tty/BUILD3
-rw-r--r--pkg/sentry/fs/tty/dir.go46
-rw-r--r--pkg/sentry/fs/tty/fs.go4
-rw-r--r--pkg/sentry/fs/tty/line_discipline.go55
-rw-r--r--pkg/sentry/fs/tty/master.go37
-rw-r--r--pkg/sentry/fs/tty/queue.go23
-rw-r--r--pkg/sentry/fs/tty/replica.go (renamed from pkg/sentry/fs/tty/slave.go)88
-rw-r--r--pkg/sentry/fs/tty/terminal.go39
-rw-r--r--pkg/sentry/fs/tty/tty_test.go4
-rw-r--r--pkg/sentry/fs/user/path.go1
-rw-r--r--pkg/sentry/fs/user/user.go1
33 files changed, 652 insertions, 268 deletions
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index 735452b07..ff2fe6712 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -107,8 +107,7 @@ func copyUp(ctx context.Context, d *Dirent) error {
// leave the upper filesystem filled with any number of parent directories
// but the upper filesystem will never be in an inconsistent state.
//
-// Preconditions:
-// - d.Inode.overlay is non-nil.
+// Preconditions: d.Inode.overlay is non-nil.
func copyUpLockedForRename(ctx context.Context, d *Dirent) error {
for {
// Did we race with another copy up or does there
@@ -183,12 +182,12 @@ func doCopyUp(ctx context.Context, d *Dirent) error {
// Returns a generic error on failure.
//
// Preconditions:
-// - parent.Inode.overlay.upper must be non-nil.
-// - next.Inode.overlay.copyMu must be locked writable.
-// - next.Inode.overlay.lower must be non-nil.
-// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
+// * parent.Inode.overlay.upper must be non-nil.
+// * next.Inode.overlay.copyMu must be locked writable.
+// * next.Inode.overlay.lower must be non-nil.
+// * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
// or Symlink.
-// - upper filesystem must support setting file ownership and timestamps.
+// * upper filesystem must support setting file ownership and timestamps.
func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
// Extract the attributes of the file we wish to copy.
attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx)
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9379a4d7b..6b7b451b8 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -34,6 +34,7 @@ go_library(
"//pkg/sentry/socket/netstack",
"//pkg/syserror",
"//pkg/tcpip/link/tun",
+ "//pkg/tcpip/network/arp",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index ec474e554..19ffdec47 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -15,6 +15,8 @@
package dev
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -25,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -60,7 +63,7 @@ func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMod
}
// GetFile implements fs.InodeOperations.GetFile.
-func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+func (*netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
}
@@ -80,21 +83,22 @@ type netTunFileOperations struct {
var _ fs.FileOperations = (*netTunFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (fops *netTunFileOperations) Release(ctx context.Context) {
- fops.device.Release(ctx)
+func (n *netTunFileOperations) Release(ctx context.Context) {
+ n.device.Release(ctx)
}
// Ioctl implements fs.FileOperations.Ioctl.
-func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
request := args[1].Uint()
data := args[2].Pointer()
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("Ioctl should be called from a task context")
+ }
+
switch request {
case linux.TUNSETIFF:
- t := kernel.TaskFromContext(ctx)
- if t == nil {
- panic("Ioctl should be called from a task context")
- }
if !t.HasCapability(linux.CAP_NET_ADMIN) {
return 0, syserror.EPERM
}
@@ -104,27 +108,32 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
}
var req linux.IFReq
- if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := req.CopyIn(t, data); err != nil {
return 0, err
}
flags := usermem.ByteOrder.Uint16(req.Data[:])
- return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+ created, err := n.device.SetIff(stack.Stack, req.Name(), flags)
+ if err == nil && created {
+ // Always start with an ARP address for interfaces so they can handle ARP
+ // packets.
+ nicID := n.device.NICID()
+ if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID))
+ }
+ }
+ return 0, err
case linux.TUNGETIFF:
var req linux.IFReq
- copy(req.IFName[:], fops.device.Name())
+ copy(req.IFName[:], n.device.Name())
// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
// there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
- flags := fops.device.Flags() | linux.IFF_NOFILTER
+ flags := n.device.Flags() | linux.IFF_NOFILTER
usermem.ByteOrder.PutUint16(req.Data[:], flags)
- _, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := req.CopyOut(t, data)
return 0, err
default:
@@ -133,41 +142,41 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
}
// Write implements fs.FileOperations.Write.
-func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+func (n *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
data := make([]byte, src.NumBytes())
if _, err := src.CopyIn(ctx, data); err != nil {
return 0, err
}
- return fops.device.Write(data)
+ return n.device.Write(data)
}
// Read implements fs.FileOperations.Read.
-func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
- data, err := fops.device.Read()
+func (n *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ data, err := n.device.Read()
if err != nil {
return 0, err
}
- n, err := dst.CopyOut(ctx, data)
- if n > 0 && n < len(data) {
+ bytesCopied, err := dst.CopyOut(ctx, data)
+ if bytesCopied > 0 && bytesCopied < len(data) {
// Not an error for partial copying. Packet truncated.
err = nil
}
- return int64(n), err
+ return int64(bytesCopied), err
}
// Readiness implements waiter.Waitable.Readiness.
-func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
- return fops.device.Readiness(mask)
+func (n *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return n.device.Readiness(mask)
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- fops.device.EventRegister(e, mask)
+func (n *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ n.device.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
- fops.device.EventUnregister(e)
+func (n *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+ n.device.EventUnregister(e)
}
// isNetTunSupported returns whether /dev/net/tun device is supported for s.
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index a2f751068..00c526b03 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -413,9 +413,9 @@ func (d *Dirent) descendantOf(p *Dirent) bool {
// Inode.Lookup, otherwise walk will keep d.mu locked.
//
// Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must not contain "/"s.
func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
if !IsDir(d.Inode.StableAttr) {
return nil, syscall.ENOTDIR
@@ -577,9 +577,9 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent,
// exists returns true if name exists in relation to d.
//
// Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must not contain "/"s.
func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
child, err := d.walk(ctx, root, name, false /* may unlock */)
if err != nil {
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index 305c0f840..6ec721022 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -159,8 +159,9 @@ type FileOperations interface {
// io provides access to the virtual memory space to which pointers in args
// refer.
//
- // Preconditions: The AddressSpace (if any) that io refers to is activated.
- // Must only be called from a task goroutine.
+ // Preconditions:
+ // * The AddressSpace (if any) that io refers to is activated.
+ // * Must only be called from a task goroutine.
Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error)
}
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index bbafebf03..1dc409d38 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -70,7 +70,9 @@ func (seg FileRangeIterator) FileRange() memmap.FileRange {
// FileRangeOf returns the FileRange mapped by mr.
//
-// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
+// Preconditions:
+// * seg.Range().IsSupersetOf(mr).
+// * mr.Length() != 0.
func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
frstart := seg.Value() + (mr.Start - seg.Start())
return memmap.FileRange{frstart, frstart + mr.Length()}
@@ -82,15 +84,18 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRan
// returns a successful partial read, Fill will call it repeatedly until all
// bytes have been read.) EOF is handled consistently with the requirements of
// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are
-// invalid.
+// invalid. fileSize is an upper bound on the file's size; bytes after fileSize
+// will be zeroed without calling readAt.
//
// Fill may read offsets outside of required, but will never read offsets
// outside of optional. It returns a non-nil error if any error occurs, even
// if the error only affects offsets in optional, but not in required.
//
-// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
-// required and optional must be page-aligned.
-func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
+// Preconditions:
+// * required.Length() > 0.
+// * optional.IsSupersetOf(required).
+// * required and optional must be page-aligned.
+func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
gap := frs.LowerBoundGap(required.Start)
for gap.Ok() && gap.Start() < required.End {
if gap.Range().Length() == 0 {
@@ -103,7 +108,21 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
var done uint64
for !dsts.IsEmpty() {
- n, err := readAt(ctx, dsts, gr.Start+done)
+ n, err := func() (uint64, error) {
+ off := gr.Start + done
+ if off >= fileSize {
+ return 0, io.EOF
+ }
+ if off+dsts.NumBytes() > fileSize {
+ rd := fileSize - off
+ n, err := readAt(ctx, dsts.TakeFirst64(rd), off)
+ if n == rd && err == nil {
+ return n, io.EOF
+ }
+ return n, err
+ }
+ return readAt(ctx, dsts, off)
+ }()
done += n
dsts = dsts.DropFirst64(n)
if err != nil {
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index ef0113b52..4468f5dd2 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -70,6 +70,13 @@ func (f *HostFileMapper) Init() {
f.mappings = make(map[uint64]mapping)
}
+// IsInited returns true if f.Init() has been called. This is used when
+// restoring a checkpoint that contains a HostFileMapper that may or may not
+// have been initialized.
+func (f *HostFileMapper) IsInited() bool {
+ return f.refs != nil
+}
+
// NewHostFileMapper returns an initialized HostFileMapper allocated on the
// heap with no references or cached mappings.
func NewHostFileMapper() *HostFileMapper {
@@ -80,7 +87,9 @@ func NewHostFileMapper() *HostFileMapper {
// IncRefOn increments the reference count on all offsets in mr.
//
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
f.refsMu.Lock()
defer f.refsMu.Unlock()
@@ -97,7 +106,9 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
// DecRefOn decrements the reference count on all offsets in mr.
//
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
f.refsMu.Lock()
defer f.refsMu.Unlock()
@@ -204,7 +215,9 @@ func (f *HostFileMapper) UnmapAll() {
}
}
-// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m.
+// Preconditions:
+// * f.mapsMu must be locked.
+// * f.mappings[chunkStart] == m.
func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) {
if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 {
// This leaks address space and is unexpected, but is otherwise
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index fe8b0b6ac..82eda3e43 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel/time"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -444,7 +443,7 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.
// time.
//
// Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) {
c.attr.AccessTime = now
c.dirtyAttr.AccessTime = true
}
@@ -461,7 +460,7 @@ func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx contex
// and status change times in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) {
c.attr.ModificationTime = now
c.dirtyAttr.ModificationTime = true
c.attr.StatusChangeTime = now
@@ -480,7 +479,7 @@ func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) {
// in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) {
c.attr.StatusChangeTime = now
c.dirtyAttr.StatusChangeTime = true
}
@@ -645,7 +644,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
End: fs.OffsetPageEnd(int64(gapMR.End)),
}
optMR := gap.Range()
- err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
+ err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.c.cache.Find(uint64(rw.offset))
if !seg.Ok() {
@@ -672,9 +671,6 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
// Continue.
seg, gap = gap.NextSegment(), FileRangeGapIterator{}
}
-
- default:
- break
}
}
unlock()
@@ -684,7 +680,9 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
// maybeGrowFile grows the file's size if data has been written past the old
// size.
//
-// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked.
+// Preconditions:
+// * rw.c.attrMu must be locked.
+// * rw.c.dataMu must be locked.
func (rw *inodeReadWriter) maybeGrowFile() {
// If the write ends beyond the file's previous size, it causes the
// file to grow.
@@ -766,9 +764,6 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
// Continue.
seg, gap = gap.NextSegment(), FileRangeGapIterator{}
-
- default:
- break
}
}
rw.maybeGrowFile()
@@ -875,7 +870,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option
}
mf := c.mfp.MemoryFile()
- cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
+ cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
index 2ca84dd74..05e043583 100644
--- a/pkg/sentry/fs/g3doc/fuse.md
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -79,7 +79,7 @@ ops can be implemented in parallel.
- Implement `/dev/fuse` - a character device used to establish an FD for
communication between the sentry and the server daemon.
-- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+- Implement basic FUSE ops like `FUSE_INIT`.
#### Read-only mount with basic file operations
@@ -95,6 +95,103 @@ ops can be implemented in parallel.
- Implement the remaining FUSE ops and decide if we can omit rarely used
operations like ioctl.
+### Design Details
+
+#### Lifecycle for a FUSE Request
+
+- User invokes a syscall
+- Sentry prepares corresponding request
+ - If FUSE device is available
+ - Write the request in binary
+ - If FUSE device is full
+ - Kernel task blocked until available
+- Sentry notifies the readers of fuse device that it's ready for read
+- FUSE daemon reads the request and processes it
+- Sentry waits until a reply is written to the FUSE device
+ - but returns directly for async requests
+- FUSE daemon writes to the fuse device
+- Sentry processes the reply
+ - For sync requests, unblock blocked kernel task
+ - For async requests, execute pre-specified callback if any
+- Sentry returns the syscall to the user
+
+#### Channels and Queues for Requests in Different Stages
+
+`connection.initializedChan`
+
+- a channel on which requests issued before connection initialization
+ block.
+
+`fd.queue`
+
+- a queue of requests that haven’t been read by the FUSE daemon yet.
+
+`fd.completions`
+
+- a map of the requests that have been prepared but not yet received a
+ response, including the ones on the `fd.queue`.
+
+`fd.waitQueue`
+
+- a queue of waiters, such as the FUSE daemon, that are waiting for the FUSE
+ device fd to become available.
+
+`fd.fullQueueCh`
+
+- a channel that the kernel task will be blocked on when the fd is not
+ available.
+
+#### Basic I/O Implementation
+
+Currently we have implemented basic read and write functionality for our FUSE
+implementation. We describe the design and ways to improve it here:
+
+##### Basic FUSE Read
+
+The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and
+`vfs.FileDescriptionImpl.PRead()`. When a syscall is made, it will eventually
+reach our implementation of those interface functions located at
+`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files.
+
+After validation checks of the input, sentry sends `FUSE_READ` requests to the
+FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` as the
+responses. For the first version, we create a copy in kernel memory of those
+data. They are represented as a byte slice in the marshalled struct. This
+happens as a common process for all the FUSE responses at this moment at
+`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly copy from this
+intermediate buffer to the input buffer provided by the read syscall.
+
+There is an extra requirement for FUSE: when mounting the FUSE fs, the mounter
+or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. These
+set an upper bound on the number of bytes to read in each `FUSE_READ` request.
+We implemented the code to handle the resulting fragmented reads.
+
+To improve performance, ideally we should have a buffer cache into which the
+data from the FUSE daemon's responses is copied, as several other existing file
+system implementations for the sentry do, instead of a single-use temporary
+buffer. Directly mapping the memory of one process into another could also
+boost performance, but we chose not to do so in order to keep the processes
+isolated.
+
+##### Basic FUSE Write
+
+The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and
+`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a
+user makes a write(2) or pwrite(2) syscall.
+
+For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header
+(which can be regarded as a request with two payloads) to the FUSE daemon. For
+the first version, we allocate a buffer inside kernel memory to store the bytes
+from the user, and copy directly from that buffer to the FUSE daemon's memory.
+This happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`.
+
+The parameters `max_write` and `max_pages` restrict the number of bytes in one
+`FUSE_WRITE`. The current implementation includes code to handle fragmented
+writes.
+
+To improve performance, the extra copy created to store the bytes to write
+can be replaced by the buffer cache as well.
+
# Appendix
## FUSE Protocol
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index d41d23a43..1368014c4 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -32,6 +32,7 @@ go_library(
"//pkg/fdnotifier",
"//pkg/iovec",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/secio",
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index 5d4f312cf..c8231e0aa 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index e29ae00f2..1183727ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -17,6 +17,7 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -53,7 +54,7 @@ type TTYFileOperations struct {
func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
fileOperations: fileOperations{iops: iops},
- termios: linux.DefaultSlaveTermios,
+ termios: linux.DefaultReplicaTermios,
})
}
@@ -123,6 +124,11 @@ func (t *TTYFileOperations) Release(ctx context.Context) {
// Ioctl implements fs.FileOperations.Ioctl.
func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ return 0, syserror.ENOTTY
+ }
+
// Ignore arg[0]. This is the real FD:
fd := t.fileOperations.iops.fileState.FD()
ioctl := args[1].Uint64()
@@ -132,9 +138,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = termios.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -146,9 +150,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
}
var termios linux.Termios
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetTermios(fd, ioctl, &termios)
@@ -173,10 +175,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
// Map the ProcessGroup into a ProcessGroupID in the task's PID
// namespace.
- pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+ _, err := pgID.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSPGRP:
@@ -184,11 +184,6 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
// Equivalent to tcsetpgrp(fd, *argp).
// Set the foreground process group ID of this terminal.
- task := kernel.TaskFromContext(ctx)
- if task == nil {
- return 0, syserror.ENOTTY
- }
-
t.mu.Lock()
defer t.mu.Unlock()
@@ -208,12 +203,11 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
return 0, syserror.ENOTTY
}
- var pgID kernel.ProcessGroupID
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgIDP primitive.Int32
+ if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
+ pgID := kernel.ProcessGroupID(pgIDP)
// pgID must be non-negative.
if pgID < 0 {
@@ -242,9 +236,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = winsize.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSWINSZ:
@@ -255,9 +247,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
// background ones) can set the winsize.
var winsize linux.Winsize
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetWinsize(fd, &winsize)
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b79cd9877..004910453 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string,
// SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error {
if i.overlay != nil {
- return overlaySetxattr(ctx, i.overlay, d, name, value, flags)
+ return overlaySetXattr(ctx, i.overlay, d, name, value, flags)
}
return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index dc2e353d9..b16ab08ba 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -16,7 +16,6 @@ package fs
import (
"fmt"
- "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
// Don't forward the value of the extended attribute if it would
// unexpectedly change the behavior of a wrapping overlay layer.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return "", syserror.ENODATA
}
@@ -553,9 +552,9 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
return s, err
}
-func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
+func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
// Don't allow changes to overlay xattrs through a setxattr syscall.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return syserror.EPERM
}
@@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
for name := range names {
// Same as overlayGetXattr, we shouldn't forward along
// overlay attributes.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
delete(names, name)
}
}
@@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
// Don't allow changes to overlay xattrs through a removexattr syscall.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return syserror.EPERM
}
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 35013a21b..01a1235b8 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -86,13 +86,12 @@ func isXattrOverlay(name string) bool {
// NewOverlayRoot produces the root of an overlay.
//
// Preconditions:
-//
-// - upper and lower must be non-nil.
-// - upper must not be an overlay.
-// - lower should not expose character devices, pipes, or sockets, because
+// * upper and lower must be non-nil.
+// * upper must not be an overlay.
+// * lower should not expose character devices, pipes, or sockets, because
// copying up these types of files is not supported.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) {
if !IsDir(upper.StableAttr) {
return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type)
@@ -117,12 +116,11 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount
// NewOverlayRootFile produces the root of an overlay that points to a file.
//
// Preconditions:
-//
-// - lower must be non-nil.
-// - lower should not expose character devices, pipes, or sockets, because
+// * lower must be non-nil.
+// * lower should not expose character devices, pipes, or sockets, because
// copying up these types of files is not supported. Neither it can be a dir.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) {
if !IsRegular(lower.StableAttr) {
return nil, fmt.Errorf("lower Inode is not a regular file")
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 77c2c5c0e..b8b2281a8 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -50,6 +50,7 @@ go_library(
"//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
+ "//pkg/tcpip/network/ipv4",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 8615b60f0..e555672ad 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -54,7 +55,7 @@ type tcpMemInode struct {
// size stores the tcp buffer size during save, and sets the buffer
// size in netstack in restore. We must save/restore this here, since
- // netstack itself is stateless.
+ // a netstack instance is created on restore.
size inet.TCPBufferSize
// mu protects against concurrent reads/writes to files based on this
@@ -258,6 +259,9 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
if src.NumBytes() == 0 {
return 0, nil
}
+
+ // Only consider up to one memory page of input, for performance reasons.
+ // We only check whether the value is zero or not anyway.
src = src.TakeFirst(usermem.PageSize - 1)
var v int32
@@ -383,11 +387,125 @@ func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.S
return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
}
+// ipForwarding implements fs.InodeOperations.
+//
+// ipForwarding is used to enable/disable packet forwarding of netstack.
+//
+// +stateify savable
+type ipForwarding struct {
+ fsutil.SimpleFileInode
+
+ stack inet.Stack `state:"wait"`
+
+ // enabled stores the IPv4 forwarding state on save.
+ // We must save/restore this here, since a netstack instance
+ // is created on restore.
+ enabled *bool
+}
+
+func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+ ipf := &ipForwarding{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+ stack: s,
+ }
+ sattr := fs.StableAttr{
+ DeviceID: device.ProcDevice.DeviceID(),
+ InodeID: device.ProcDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ Type: fs.SpecialFile,
+ }
+ return fs.NewInode(ctx, ipf, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*ipForwarding) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// +stateify savable
+type ipForwardingFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ ipf *ipForwarding
+
+ stack inet.Stack `state:"wait"`
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (ipf *ipForwarding) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ flags.Pread = true
+ flags.Pwrite = true
+ return fs.NewFile(ctx, dirent, flags, &ipForwardingFile{
+ stack: ipf.stack,
+ ipf: ipf,
+ }), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ return 0, io.EOF
+ }
+
+ if f.ipf.enabled == nil {
+ enabled := f.stack.Forwarding(ipv4.ProtocolNumber)
+ f.ipf.enabled = &enabled
+ }
+
+ val := "0\n"
+ if *f.ipf.enabled {
+ // Technically, this is not quite compatible with Linux. Linux
+ // stores these as an integer, so if you write "2" into
+ // ip_forward, you should get 2 back.
+ val = "1\n"
+ }
+ n, err := dst.CopyOut(ctx, []byte(val))
+ return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+//
+// Offset is ignored, multiple writes are not supported.
+func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Only consider up to one memory page of input, for performance reasons.
+ // We only check whether the value is zero or not anyway.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return n, err
+ }
+ if f.ipf.enabled == nil {
+ f.ipf.enabled = new(bool)
+ }
+ *f.ipf.enabled = v != 0
+ return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled)
+}
+
func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
contents := map[string]*fs.Inode{
// Add tcp_sack.
"tcp_sack": newTCPSackInode(ctx, msrc, s),
+ // Add ip_forward.
+ "ip_forward": newIPForwardingInode(ctx, msrc, s),
+
// The following files are simple stubs until they are
// implemented in netstack, most of these files are
// configuration related. We use the value closest to the
diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go
index 6eba709c6..4cb4741af 100644
--- a/pkg/sentry/fs/proc/sys_net_state.go
+++ b/pkg/sentry/fs/proc/sys_net_state.go
@@ -14,7 +14,11 @@
package proc
-import "fmt"
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+)
// beforeSave is invoked by stateify.
func (t *tcpMemInode) beforeSave() {
@@ -40,3 +44,12 @@ func (s *tcpSack) afterLoad() {
}
}
}
+
+// afterLoad is invoked by stateify.
+func (ipf *ipForwarding) afterLoad() {
+ if ipf.enabled != nil {
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+ panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err))
+ }
+ }
+}
diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go
index 355e83d47..6ef5738e7 100644
--- a/pkg/sentry/fs/proc/sys_net_test.go
+++ b/pkg/sentry/fs/proc/sys_net_test.go
@@ -123,3 +123,76 @@ func TestConfigureRecvBufferSize(t *testing.T) {
}
}
}
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forward.
+func TestIPForwarding(t *testing.T) {
+ ctx := context.Background()
+ s := inet.NewTestStack()
+
+ var cases = []struct {
+ comment string
+ initial bool
+ str string
+ final bool
+ }{
+ {
+ comment: `Forwarding is disabled; write 1 and enable forwarding`,
+ initial: false,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is disabled; write 0 and disable forwarding`,
+ initial: false,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is enabled; write 1 and enable forwarding`,
+ initial: true,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 0 and disable forwarding`,
+ initial: true,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+ initial: false,
+ str: "2404",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+ initial: true,
+ str: "2404",
+ final: true,
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.comment, func(t *testing.T) {
+ s.IPForwarding = c.initial
+ ipf := &ipForwarding{stack: s}
+ file := &ipForwardingFile{
+ stack: s,
+ ipf: ipf,
+ }
+
+ // Write the values.
+ src := usermem.BytesIOSequence([]byte(c.str))
+ if n, err := file.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil {
+ t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+ }
+
+ // Read the values from the stack and check them.
+ if got, want := s.IPForwarding, c.final; got != want {
+ t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+ }
+
+ })
+ }
+}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 9cf7f2a62..22d658acf 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
"auxv": newAuxvec(t, msrc),
"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
"comm": newComm(t, msrc),
+ "cwd": newCwd(t, msrc),
"environ": newExecArgInode(t, msrc, environExecArg),
"exe": newExe(t, msrc),
"fd": newFdDir(t, msrc),
@@ -300,6 +301,49 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
return exec.PathnameWithDeleted(ctx), nil
}
+// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file.
+//
+// +stateify savable
+type cwd struct {
+ ramfs.Symlink
+
+ t *kernel.Task
+}
+
+func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+ cwdSymlink := &cwd{
+ Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""),
+ t: t,
+ }
+ return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t)
+}
+
+// Readlink implements fs.InodeOperations.
+func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+ if !kernel.ContextCanTrace(ctx, e.t, false) {
+ return "", syserror.EACCES
+ }
+ if err := checkTaskState(e.t); err != nil {
+ return "", err
+ }
+ cwd := e.t.FSContext().WorkingDirectory()
+ if cwd == nil {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer cwd.DecRef(ctx)
+
+ root := fs.RootFromContext(ctx)
+ if root == nil {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
+
+ name, _ := cwd.FullName(root)
+ return name, nil
+}
+
// namespaceSymlink represents a symlink in the namespacefs, such as the files
// in /proc/<pid>/ns.
//
@@ -604,7 +648,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
var vss, rss, data uint64
s.t.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ fds = fdTable.CurrentMaxFDs()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 1dc75291d..fc0498f17 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -613,7 +613,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional
}
mf := f.kernel.MemoryFile()
- cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index b095312fe..998b697ca 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -16,6 +16,8 @@
package tmpfs
import (
+ "math"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -32,9 +34,15 @@ import (
var fsInfo = fs.Info{
Type: linux.TMPFS_MAGIC,
+ // tmpfs currently does not support configurable size limits. In Linux,
+ // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+ // statfs(2). However, many applications treat this as having a size limit
+ // of 0. To work around this, claim to have a very large but non-zero size,
+ // chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+ // applications may also handle incorrectly).
// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
- TotalBlocks: 0,
- FreeBlocks: 0,
+ TotalBlocks: math.MaxInt64 / usermem.PageSize,
+ FreeBlocks: math.MaxInt64 / usermem.PageSize,
}
// rename implements fs.InodeOperations.Rename for tmpfs nodes.
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5cb0e0417..e6d0eb359 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -10,13 +10,14 @@ go_library(
"line_discipline.go",
"master.go",
"queue.go",
- "slave.go",
+ "replica.go",
"terminal.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 463f6189e..c2da80bc2 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -37,14 +37,14 @@ import (
// This indirectly manages all terminals within the mount.
//
// New Terminals are created by masterInodeOperations.GetFile, which registers
-// the slave Inode in the this directory for discovery via Lookup/Readdir. The
-// slave inode is unregistered when the master file is Released, as the slave
+// the replica Inode in this directory for discovery via Lookup/Readdir. The
+// replica inode is unregistered when the master file is Released, as the replica
// is no longer discoverable at that point.
//
// References on the underlying Terminal are held by masterFileOperations and
-// slaveInodeOperations.
+// replicaInodeOperations.
//
-// masterInodeOperations and slaveInodeOperations hold a pointer to
+// masterInodeOperations and replicaInodeOperations hold a pointer to
// dirInodeOperations, which is reference counted by the refcount their
// corresponding Dirents hold on their parent (this directory).
//
@@ -76,16 +76,16 @@ type dirInodeOperations struct {
// master is the master PTY inode.
master *fs.Inode
- // slaves contains the slave inodes reachable from the directory.
+ // replicas contains the replica inodes reachable from the directory.
//
- // A new slave is added by allocateTerminal and is removed by
+ // A new replica is added by allocateTerminal and is removed by
// masterFileOperations.Release.
//
- // A reference is held on every slave in the map.
- slaves map[uint32]*fs.Inode
+ // A reference is held on every replica in the map.
+ replicas map[uint32]*fs.Inode
// dentryMap is a SortedDentryMap used to implement Readdir containing
- // the master and all entries in slaves.
+ // the master and all entries in replicas.
dentryMap *fs.SortedDentryMap
// next is the next pty index to use.
@@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
d := &dirInodeOperations{
InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC),
msrc: m,
- slaves: make(map[uint32]*fs.Inode),
+ replicas: make(map[uint32]*fs.Inode),
dentryMap: fs.NewSortedDentryMap(nil),
}
// Linux devpts uses a default mode of 0000 for ptmx which can be
@@ -133,7 +133,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) {
defer d.mu.Unlock()
d.master.DecRef(ctx)
- if len(d.slaves) != 0 {
+ if len(d.replicas) != 0 {
panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
}
}
@@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str
return fs.NewDirent(ctx, d.master, name), nil
}
- // Slave number?
+ // Replica number?
n, err := strconv.ParseUint(name, 10, 32)
if err != nil {
// Not found.
return nil, syserror.ENOENT
}
- s, ok := d.slaves[uint32(n)]
+ s, ok := d.replicas[uint32(n)]
if !ok {
return nil, syserror.ENOENT
}
@@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
return nil, syserror.ENOMEM
}
- if _, ok := d.slaves[n]; ok {
+ if _, ok := d.replicas[n]; ok {
panic(fmt.Sprintf("pty index collision; index %d already exists", n))
}
@@ -244,19 +244,19 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
d.next++
// The reference returned by newTerminal is returned to the caller.
- // Take another for the slave inode.
+ // Take another for the replica inode.
t.IncRef()
// Create a pts node. The owner is based on the context that opens
// ptmx.
creds := auth.CredentialsFromContext(ctx)
uid, gid := creds.EffectiveKUID, creds.EffectiveKGID
- slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
+ replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
- d.slaves[n] = slave
+ d.replicas[n] = replica
d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{
- Type: slave.StableAttr.Type,
- InodeID: slave.StableAttr.InodeID,
+ Type: replica.StableAttr.Type,
+ InodeID: replica.StableAttr.InodeID,
})
return t, nil
@@ -267,18 +267,18 @@ func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) {
d.mu.Lock()
defer d.mu.Unlock()
- // The slave end disappears from the directory when the master end is
- // closed, even if the slave end is open elsewhere.
+ // The replica end disappears from the directory when the master end is
+ // closed, even if the replica end is open elsewhere.
//
// N.B. since we're using a backdoor method to remove a directory entry
// we won't properly fire inotify events like Linux would.
- s, ok := d.slaves[t.n]
+ s, ok := d.replicas[t.n]
if !ok {
panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d))
}
s.DecRef(ctx)
- delete(d.slaves, t.n)
+ delete(d.replicas, t.n)
d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10))
}
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index 2d4d44bf3..13f4901db 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -79,8 +79,8 @@ type superOperations struct{}
//
// It always returns true, forcing a Lookup for all entries.
//
-// Slave entries are dropped from dir when their master is closed, so an
-// existing slave Dirent in the tree is not sufficient to guarantee that it
+// Replica entries are dropped from dir when their master is closed, so an
+// existing replica Dirent in the tree is not sufficient to guarantee that it
// still exists on the filesystem.
func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool {
return true
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 2e9dd2d55..b34f4a0eb 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -43,7 +44,7 @@ const (
)
// lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
// pages are good resources for how to affect the line discipline:
//
@@ -54,8 +55,8 @@ const (
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
@@ -64,7 +65,7 @@ const (
// | (inputQueueWrite) +-------------+ (inputQueueRead) |
// | |
// | v
-// masterFD slaveFD
+// masterFD replicaFD
// ^ |
// | |
// | output to terminal +--------------+ output from process |
@@ -103,8 +104,8 @@ type lineDiscipline struct {
// masterWaiter is used to wait on the master end of the TTY.
masterWaiter waiter.Queue `state:"zerovalue"`
- // slaveWaiter is used to wait on the slave end of the TTY.
- slaveWaiter waiter.Queue `state:"zerovalue"`
+ // replicaWaiter is used to wait on the replica end of the TTY.
+ replicaWaiter waiter.Queue `state:"zerovalue"`
}
func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
}
// getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
// We must copy a Termios struct, not KernelTermios.
t := l.termios.ToTermios()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyOut(task, args[2].Pointer())
return 0, err
}
// setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.Lock()
defer l.termiosMu.Unlock()
oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
// We must copy a Termios struct, not KernelTermios.
var t linux.Termios
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyIn(task, args[2].Pointer())
l.termios.FromTermios(t)
// If canonical mode is turned off, move bytes from inQueue's wait
@@ -146,27 +143,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
l.inQueue.pushWaitBufLocked(l)
l.inQueue.readable = true
l.inQueue.mu.Unlock()
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return 0, err
}
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyOut(t, args[2].Pointer())
return err
}
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyIn(t, args[2].Pointer())
return err
}
@@ -176,14 +169,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
}
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
}
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+ return l.inQueue.readableSize(t, args)
}
func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -196,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
if n > 0 {
l.masterWaiter.Notify(waiter.EventOut)
if pushed {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return n, nil
}
@@ -211,14 +204,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
return n, nil
}
return 0, syserror.ErrWouldBlock
}
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+ return l.outQueue.readableSize(t, args)
}
func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -229,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventOut)
+ l.replicaWaiter.Notify(waiter.EventOut)
if pushed {
l.masterWaiter.Notify(waiter.EventIn)
}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index e00746017..b91184b1b 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -17,9 +17,11 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm
// Ioctl implements fs.FileOperations.Ioctl.
func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the output queue read buffer.
- return 0, mf.t.ld.outputQueueReadSize(ctx, io, args)
+ return 0, mf.t.ld.outputQueueReadSize(t, args)
case linux.TCGETS:
// N.B. TCGETS on the master actually returns the configuration
- // of the slave end.
- return mf.t.ld.getTermios(ctx, io, args)
+ // of the replica end.
+ return mf.t.ld.getTermios(t, args)
case linux.TCSETS:
// N.B. TCSETS on the master actually affects the configuration
- // of the slave end.
- return mf.t.ld.setTermios(ctx, io, args)
+ // of the replica end.
+ return mf.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return mf.t.ld.setTermios(ctx, io, args)
+ return mf.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(mf.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCSPTLCK:
// TODO(b/29356795): Implement pty locking. For now just pretend we do.
return 0, nil
case linux.TIOCGWINSZ:
- return 0, mf.t.ld.windowSize(ctx, io, args)
+ return 0, mf.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, mf.t.ld.setWindowSize(ctx, io, args)
+ return 0, mf.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index ceabb9b1e..79975d812 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -17,8 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -32,7 +34,7 @@ import (
const waitBufMaxBytes = 131072
// queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
// full, at which point they are written to the wait buffer. Bytes are
// processed (i.e. undergo termios transformations) as they are added to the
// read buffer. The read buffer is readable when its length is nonzero and
@@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
}
// readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error {
q.mu.Lock()
defer q.mu.Unlock()
- var size int32
+ size := primitive.Int32(0)
if q.readable {
- size = int32(len(q.readBuf))
+ size = primitive.Int32(len(q.readBuf))
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := size.CopyOut(t, args[2].Pointer())
return err
}
@@ -104,8 +104,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
// as whether the read caused more readable data to become available (whether
// data was pushed from the wait buffer to the read buffer).
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -145,8 +144,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
// write writes to q from userspace.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -188,8 +186,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
// writeBytes writes to q from b.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
q.mu.Lock()
defer q.mu.Unlock()
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/replica.go
index 7c7292687..385d230fb 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/replica.go
@@ -17,9 +17,11 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
@@ -27,11 +29,11 @@ import (
// LINT.IfChange
-// slaveInodeOperations are the fs.InodeOperations for the slave end of the
+// replicaInodeOperations are the fs.InodeOperations for the replica end of the
// Terminal (pts file).
//
// +stateify savable
-type slaveInodeOperations struct {
+type replicaInodeOperations struct {
fsutil.SimpleFileInode
// d is the containing dir.
@@ -41,13 +43,13 @@ type slaveInodeOperations struct {
t *Terminal
}
-var _ fs.InodeOperations = (*slaveInodeOperations)(nil)
+var _ fs.InodeOperations = (*replicaInodeOperations)(nil)
-// newSlaveInode creates an fs.Inode for the slave end of a terminal.
+// newReplicaInode creates an fs.Inode for the replica end of a terminal.
//
-// newSlaveInode takes ownership of t.
-func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
- iops := &slaveInodeOperations{
+// newReplicaInode takes ownership of t.
+func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
+ iops := &replicaInodeOperations{
SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC),
d: d,
t: t,
@@ -64,18 +66,18 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne
Type: fs.CharacterDevice,
// See fs/devpts/inode.c:devpts_fill_super.
BlockSize: 1024,
- DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR,
+ DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR,
DeviceFileMinor: t.n,
})
}
// Release implements fs.InodeOperations.Release.
-func (si *slaveInodeOperations) Release(ctx context.Context) {
+func (si *replicaInodeOperations) Release(ctx context.Context) {
si.t.DecRef(ctx)
}
// Truncate implements fs.InodeOperations.Truncate.
-func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
return nil
}
@@ -83,14 +85,15 @@ func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
//
// This may race with destruction of the terminal. If the terminal is gone, it
// returns ENOENT.
-func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
- return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil
+func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil
}
-// slaveFileOperations are the fs.FileOperations for the slave end of a terminal.
+// replicaFileOperations are the fs.FileOperations for the replica end of a
+// terminal.
//
// +stateify savable
-type slaveFileOperations struct {
+type replicaFileOperations struct {
fsutil.FilePipeSeek `state:"nosave"`
fsutil.FileNotDirReaddir `state:"nosave"`
fsutil.FileNoFsync `state:"nosave"`
@@ -100,79 +103,84 @@ type slaveFileOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
// si is the inode operations.
- si *slaveInodeOperations
+ si *replicaInodeOperations
}
-var _ fs.FileOperations = (*slaveFileOperations)(nil)
+var _ fs.FileOperations = (*replicaFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (sf *slaveFileOperations) Release(context.Context) {
+func (sf *replicaFileOperations) Release(context.Context) {
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sf.si.t.ld.slaveWaiter.EventRegister(e, mask)
+func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ sf.si.t.ld.replicaWaiter.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) {
- sf.si.t.ld.slaveWaiter.EventUnregister(e)
+func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) {
+ sf.si.t.ld.replicaWaiter.EventUnregister(e)
}
// Readiness implements waiter.Waitable.Readiness.
-func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sf.si.t.ld.slaveReadiness()
+func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return sf.si.t.ld.replicaReadiness()
}
// Read implements fs.FileOperations.Read.
-func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
return sf.si.t.ld.inputQueueRead(ctx, dst)
}
// Write implements fs.FileOperations.Write.
-func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
return sf.si.t.ld.outputQueueWrite(ctx, src)
}
// Ioctl implements fs.FileOperations.Ioctl.
-func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the input queue read buffer.
- return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args)
+ return 0, sf.si.t.ld.inputQueueReadSize(t, args)
case linux.TCGETS:
- return sf.si.t.ld.getTermios(ctx, io, args)
+ return sf.si.t.ld.getTermios(t, args)
case linux.TCSETS:
- return sf.si.t.ld.setTermios(ctx, io, args)
+ return sf.si.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return sf.si.t.ld.setTermios(ctx, io, args)
+ return sf.si.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(sf.si.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCGWINSZ:
- return 0, sf.si.t.ld.windowSize(ctx, io, args)
+ return 0, sf.si.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, sf.si.t.ld.setWindowSize(ctx, io, args)
+ return 0, sf.si.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
}
}
-// LINT.ThenChange(../../fsimpl/devpts/slave.go)
+// LINT.ThenChange(../../fsimpl/devpts/replica.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ddcccf4da..4f431d74d 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,10 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
)
// LINT.IfChange
@@ -44,19 +44,19 @@ type Terminal struct {
// this terminal. This field is immutable.
masterKTTY *kernel.TTY
- // slaveKTTY contains the controlling process of the slave end of this
+ // replicaKTTY contains the controlling process of the replica end of this
// terminal. This field is immutable.
- slaveKTTY *kernel.TTY
+ replicaKTTY *kernel.TTY
}
func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
- termios := linux.DefaultSlaveTermios
+ termios := linux.DefaultReplicaTermios
t := Terminal{
- d: d,
- n: n,
- ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{Index: n},
- slaveKTTY: &kernel.TTY{Index: n},
+ d: d,
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{Index: n},
+ replicaKTTY: &kernel.TTY{Index: n},
}
t.EnableLeakCheck("tty.Terminal")
return &t
@@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("releaseControllingTTY must be called from a task context")
@@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
}
// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("foregroundProcessGroup must be called from a task context")
@@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
}
// Write it out to *arg.
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ retP := primitive.Int32(ret)
+ _, err = retP.CopyOut(task, args[2].Pointer())
return 0, err
}
// setForegroundProcessGroup sets tm's foreground process group.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setForegroundProcessGroup must be called from a task context")
}
// Read in the process group ID.
- var pgid int32
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgid primitive.Int32
+ if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
@@ -126,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
if isMaster {
return tm.masterKTTY
}
- return tm.slaveKTTY
+ return tm.replicaKTTY
}
// LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go
index 2cbc05678..49edee83d 100644
--- a/pkg/sentry/fs/tty/tty_test.go
+++ b/pkg/sentry/fs/tty/tty_test.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func TestSimpleMasterToSlave(t *testing.T) {
- ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+ ld := newLineDiscipline(linux.DefaultReplicaTermios)
ctx := contexttest.Context(t)
inBytes := []byte("hello, tty\n")
src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
index 2f5a43b84..124bc95ed 100644
--- a/pkg/sentry/fs/user/path.go
+++ b/pkg/sentry/fs/user/path.go
@@ -121,6 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s
func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
for _, p := range paths {
if !path.IsAbs(p) {
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index 936fd3932..1f8684dc6 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -105,6 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
const defaultHome = "/"
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
creds := auth.CredentialsFromContext(ctx)