diff options
Diffstat (limited to 'pkg')
89 files changed, 1703 insertions, 656 deletions
diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 89f9a793f..fa3ae5f18 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -36,6 +36,10 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) { // // See Documentations/devices.txt and uapi/linux/major.h. const ( + // UNNAMED_MAJOR is the major device number for "unnamed" devices, whose + // minor numbers are dynamically allocated by the kernel. + UNNAMED_MAJOR = 0 + // MEM_MAJOR is the major device number for "memory" character devices. MEM_MAJOR = 1 diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index a8d4f9d69..46d8b0b42 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -146,7 +146,7 @@ type IPTIP struct { // OutputInterface is the output network interface. OutputInterface [IFNAMSIZ]byte - // InputInterfaceMask is the intput interface mask. + // InputInterfaceMask is the input interface mask. InputInterfaceMask [IFNAMSIZ]byte // OuputInterfaceMask is the output interface mask. diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 9612f072e..ffc918846 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -6,6 +6,7 @@ go_library( name = "amutex", srcs = ["amutex.go"], visibility = ["//:sandbox"], + deps = ["//pkg/syserror"], ) go_test( diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 1c4fd1784..a078a31db 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -18,6 +18,8 @@ package amutex import ( "sync/atomic" + + "gvisor.dev/gvisor/pkg/syserror" ) // Sleeper must be implemented by users of the abortable mutex to allow for @@ -53,6 +55,21 @@ func (NoopSleeper) SleepFinish(success bool) {} // Interrupted implements Sleeper.Interrupted. func (NoopSleeper) Interrupted() bool { return false } +// Block blocks until either receiving from ch succeeds (in which case it +// returns nil) or sleeper is interrupted (in which case it returns +// syserror.ErrInterrupted). +func Block(sleeper Sleeper, ch <-chan struct{}) error { + cancel := sleeper.SleepStart() + select { + case <-ch: + sleeper.SleepFinish(true) + return nil + case <-cancel: + sleeper.SleepFinish(false) + return syserror.ErrInterrupted + } +} + // AbortableMutex is an abortable mutex. It allows Lock() to be aborted while it // waits to acquire the mutex. type AbortableMutex struct { diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 529980267..343f81f59 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -274,7 +274,7 @@ const ( func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < ptraceRegsSize { + if maxlen < registersSize { return 0, syserror.EFAULT } return s.PtraceGetRegs(dst) @@ -287,7 +287,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < ptraceRegsSize { + if maxlen < registersSize { return 0, syserror.EFAULT } return s.PtraceSetRegs(src) diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 1cb1adf8c..642c79dda 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -134,6 +135,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt c.Regs.Regs[1] = uint64(infoAddr) c.Regs.Regs[2] = uint64(ucAddr) c.Regs.Regs[30] = uint64(act.Restorer) + + // Save the thread's floating point state. + c.sigFPState = append(c.sigFPState, c.aarch64FPState) + // Signal handler gets a clean floating point state. + c.aarch64FPState = newAarch64FPState() return nil } @@ -155,5 +161,21 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt c.Regs.Sp = uc.MContext.Sp c.Regs.Pstate = uc.MContext.Pstate + // Restore floating point state. + l := len(c.sigFPState) + if l > 0 { + c.aarch64FPState = c.sigFPState[l-1] + // NOTE(cl/133042258): State save requires that any slice + // elements from '[len:cap]' to be zero value. + c.sigFPState[l-1] = nil + c.sigFPState = c.sigFPState[0 : l-1] + } else { + // This might happen if sigreturn(2) calls are unbalanced with + // respect to signal handler entries. This is not expected so + // don't bother to do anything fancy with the floating point + // state. + log.Warningf("sigreturn unable to restore application fpstate") + } + return uc.Sigset, uc.Stack, nil } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index f6b3ef178..b5efc86f2 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -190,9 +190,9 @@ type session struct { // be socket/pipe files. This allows unix domain sockets and named pipes to // be used with paths that belong to a gofer. // - // TODO(gvisor.dev/issue/1200): there are few possible races with someone - // stat'ing the file and another deleting it concurrently, where the file - // will not be reported as socket file. + // There are a few possible races with someone stat'ing the file and another + // deleting it concurrently, where the file will not be reported as socket + // file. overrides *overrideMaps `state:"wait"` } diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 94db8fe5c..c03c65445 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -51,21 +51,37 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } - fs, root := fstype.newFilesystem(vfsObj, creds) - return fs.VFSFilesystem(), root.VFSDentry(), nil + fs, root, err := fstype.newFilesystem(vfsObj, creds) + if err != nil { + return nil, nil, err + } + return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil +} + +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 } // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. -func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, fstype, fs) +func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ slaves: make(map[uint32]*slaveInode), } - root.InodeAttrs.Init(creds, 1, linux.ModeDirectory|0555) + root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) root.dentry.Init(root) @@ -74,7 +90,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds master := &masterInode{ root: root, } - master.InodeAttrs.Init(creds, 2, linux.ModeCharacterDevice|0666) + master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) master.dentry.Init(master) // Add the master as a child of the root. @@ -83,7 +99,13 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds }) root.IncLinks(links) - return fs, &root.dentry + return fs, &root.dentry, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() } // rootInode is the root directory inode for the devpts mounts. @@ -140,7 +162,7 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). - slave.InodeAttrs.Init(creds, uint64(idx+3), linux.ModeCharacterDevice|0600) + slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) slave.dentry.Init(slave) i.slaves[idx] = slave diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD new file mode 100644 index 000000000..ea167d38c --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/log", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/vfs/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index f39dacacf..c573d7935 100644 --- a/pkg/sentry/vfs/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -package vfs +// Package eventfd implements event fds. +package eventfd import ( "math" @@ -23,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -32,9 +34,9 @@ import ( // notification (eventfd). Eventfds are usually internal to the Sentry but in // certain situations they may be converted into a host-backed eventfd. type EventFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl // queue is used to notify interested parties when the event object // becomes readable or writable. @@ -53,18 +55,18 @@ type EventFileDescription struct { hostfd int } -var _ FileDescriptionImpl = (*EventFileDescription)(nil) +var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) -// NewEventFD creates a new event fd. -func (vfs *VirtualFilesystem) NewEventFD(initVal uint64, semMode bool, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[eventfd]") +// New creates a new event fd. +func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[eventfd]") defer vd.DecRef() efd := &EventFileDescription{ val: initVal, semMode: semMode, hostfd: -1, } - if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, @@ -117,7 +119,7 @@ func (efd *EventFileDescription) Release() { } // Read implements FileDescriptionImpl.Read. -func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ ReadOptions) (int64, error) { +func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { if dst.NumBytes() < 8 { return 0, syscall.EINVAL } @@ -128,7 +130,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc } // Write implements FileDescriptionImpl.Write. -func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ WriteOptions) (int64, error) { +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { if src.NumBytes() < 8 { return 0, syscall.EINVAL } diff --git a/pkg/sentry/vfs/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go index 2dff2d10b..20e3adffc 100644 --- a/pkg/sentry/vfs/eventfd_test.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -package vfs +package eventfd import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -34,15 +35,15 @@ func TestEventFD(t *testing.T) { for _, initVal := range initVals { ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} + vfsObj := &vfs.VirtualFilesystem{} if err := vfsObj.Init(); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(initVal, false, linux.O_RDWR) + eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) if err != nil { - t.Fatalf("NewEventFD failed: %v", err) + t.Fatalf("New() failed: %v", err) } defer eventfd.DecRef() @@ -53,7 +54,7 @@ func TestEventFD(t *testing.T) { data := []byte("00000124") // Create and submit a write request. - n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), WriteOptions{}) + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) if err != nil { t.Fatal(err) } @@ -72,19 +73,19 @@ func TestEventFD(t *testing.T) { func TestEventFDStat(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} + vfsObj := &vfs.VirtualFilesystem{} if err := vfsObj.Init(); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(0, false, linux.O_RDWR) + eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) if err != nil { - t.Fatalf("NewEventFD failed: %v", err) + t.Fatalf("New() failed: %v", err) } defer eventfd.DecRef() - statx, err := eventfd.Stat(ctx, StatOptions{ + statx, err := eventfd.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, }) if err != nil { diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 7176af6d1..dac6effbf 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -105,36 +105,50 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + dev, err := getDeviceFd(source, opts) if err != nil { return nil, nil, err } - fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} + fs := filesystem{ + dev: dev, + inodeCache: make(map[uint32]*inode), + devMinor: devMinor, + } fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. + fs.vfsfs.DecRef() return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. if !isCompatible(fs.sb) { + fs.vfsfs.DecRef() return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 77b644275..557963e03 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -64,6 +64,10 @@ type filesystem struct { // bgs represents all the block group descriptors for the filesystem. // Immutable after initialization. bgs []disklayout.BlockGroup + + // devMinor is this filesystem's device minor number. Immutable after + // initialization. + devMinor uint32 } // Compiles only if filesystem implements vfs.FilesystemImpl. @@ -366,7 +370,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() {} +func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) +} // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index a98512350..485f86f4b 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -204,6 +204,8 @@ func (in *inode) statTo(stat *linux.Statx) { stat.Atime = in.diskInode.AccessTime().StatxTimestamp() stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = in.fs.devMinor // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks // (including metadata blocks) required to represent this file. } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 4a32821bd..7f2181216 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -835,6 +837,9 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if d.isSynthetic() { return nil, syserror.ENXIO } + if d.fs.iopts.OpenSocketsByConnecting { + return d.connectSocketLocked(ctx, opts) + } case linux.S_IFIFO: if d.isSynthetic() { return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) @@ -843,10 +848,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return d.openSpecialFileLocked(ctx, mnt, opts) } +func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + fdObj.Close() + return nil, err + } + fdObj.Release() + return fd, nil +} + func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - // Treat as a special file. This is done for non-synthetic pipes as well as - // regular files when d.fs.opts.regularFilesUseSpecialFileFD is true. if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } @@ -854,10 +877,15 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts if err != nil { return nil, err } + seekable := d.fileType() == linux.S_IFREG fd := &specialFileFD{ - handle: h, + handle: h, + seekable: seekable, } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { h.close(ctx) return nil, err } @@ -888,7 +916,11 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } creds := rp.Credentials() name := rp.Component() - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // Filter file creation flags and O_LARGEFILE out; the create RPC already + // has the semantics of O_CREAT|O_EXCL, while some servers will choke on + // O_LARGEFILE. + createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE)) + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) return nil, err @@ -896,7 +928,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // Then we need to walk to the file we just created to get a non-open fid // representing it, and to get its metadata. This must use d.file since, as // explained above, dirfile was invalidated by dirfile.Create(). - walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) if err != nil { openFile.close(ctx) if fdobj != nil { @@ -904,17 +936,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } - // Sanity-check that we walked to the file we created. - if createQID.Path != walkQID.Path { - // Probably due to concurrent remote filesystem mutation? - ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) - nonOpenFile.close(ctx) - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, syserror.EAGAIN - } // Construct the new dentry. child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) @@ -960,16 +981,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } else { + seekable := child.fileType() == linux.S_IFREG fd := &specialFileFD{ handle: handle{ file: openFile, fd: -1, }, + seekable: seekable, } if fdobj != nil { fd.handle.fd = int32(fdobj.Release()) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { fd.handle.close(ctx) return nil, err } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 9ab8fdc65..1da8d5d82 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -81,6 +81,9 @@ type filesystem struct { // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + // uid and gid are the effective KUID and KGID of the filesystem's creator, // and are used as the owner and group for files that don't specify one. // uid and gid are immutable. @@ -218,6 +221,10 @@ type InternalFilesystemOptions struct { // which servers can handle only a single client and report failure if that // client disconnects. LeakConnection bool + + // If OpenSocketsByConnecting is true, silently translate attempts to open + // files identifying as sockets to connect RPCs. + OpenSocketsByConnecting bool } // Name implements vfs.FilesystemType.Name. @@ -399,14 +406,21 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Construct the filesystem object. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } fs := &filesystem{ mfp: mfp, opts: fsopts, iopts: iopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, client: client, clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -464,6 +478,8 @@ func (fs *filesystem) Release() { // Close the connection to the server. This implicitly clunks all fids. fs.client.Close() } + + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. @@ -796,7 +812,8 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(gvisor.dev/issue/1198): device number + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = d.fs.devMinor } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 507e0e276..a464e6a94 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -33,13 +33,14 @@ import ( type specialFileFD struct { fileDescription - // handle is immutable. + // handle is used for file I/O. handle is immutable. handle handle - // off is the file offset. off is protected by mu. (POSIX 2.9.7 only - // requires operations using the file offset to be atomic for regular files - // and symlinks; however, since specialFileFD may be used for regular - // files, we apply this atomicity unconditionally.) + // seekable is true if this file description represents a file for which + // file offset is significant, i.e. a regular file. seekable is immutable. + seekable bool + + // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex off int64 } @@ -63,7 +64,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { @@ -91,6 +92,10 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Read implements vfs.FileDescriptionImpl.Read. func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if !fd.seekable { + return fd.PRead(ctx, dst, -1, opts) + } + fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n @@ -100,14 +105,14 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } - if fd.dentry().fileType() == linux.S_IFREG { + if fd.seekable { limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, err @@ -130,6 +135,10 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Write implements vfs.FileDescriptionImpl.Write. func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + if !fd.seekable { + return fd.PWrite(ctx, src, -1, opts) + } + fd.mu.Lock() n, err := fd.PWrite(ctx, src, fd.off, opts) fd.off += n @@ -139,6 +148,9 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if !fd.seekable { + return 0, syserror.ESPIPE + } fd.mu.Lock() defer fd.mu.Unlock() switch whence { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 144e04905..8caf55a1b 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -40,8 +40,20 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { +// NewFDOptions contains options to NewFD. +type NewFDOptions struct { + // If IsTTY is true, the file descriptor is a TTY. + IsTTY bool + + // If HaveFlags is true, use Flags for the new file description. Otherwise, + // the new file description will inherit flags from hostFD. + HaveFlags bool + Flags uint32 +} + +// NewFD returns a vfs.FileDescription representing the given host file +// descriptor. mnt must be Kernel.HostMount(). +func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { fs, ok := mnt.Filesystem().Impl().(*filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) @@ -53,6 +65,16 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs return nil, err } + flags := opts.Flags + if !opts.HaveFlags { + // Get flags for the imported FD. + flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) + if err != nil { + return nil, err + } + flags = uint32(flagsInt) + } + fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() @@ -65,7 +87,7 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs i := &inode{ hostFD: hostFD, seekable: seekable, - isTTY: isTTY, + isTTY: opts.IsTTY, canMap: canMap(uint32(fileType)), wouldBlock: wouldBlock(uint32(fileType)), ino: fs.NextIno(), @@ -95,7 +117,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // i.open will take a reference on d. defer d.DecRef() - return i.open(ctx, d.VFSDentry(), mnt) + return i.open(ctx, d.VFSDentry(), mnt, flags) +} + +// ImportFD sets up and returns a vfs.FileDescription from a donated fd. +func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { + return NewFD(ctx, mnt, hostFD, &NewFDOptions{ + IsTTY: isTTY, + }) } // filesystemType implements vfs.FilesystemType. @@ -115,15 +144,28 @@ func (filesystemType) Name() string { // // Note that there should only ever be one instance of host.filesystem, // a global mount for host fds. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() + return fs.VFSFilesystem(), nil } // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem + + devMinor uint32 +} + +func (fs *filesystem) Release() { + fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -185,19 +227,6 @@ type inode struct { queue waiter.Queue } -// Note that these flags may become out of date, since they can be modified -// on the host, e.g. with fcntl. -func fileFlagsFromHostFD(fd int) (uint32, error) { - flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0) - if err != nil { - log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err) - return 0, err - } - // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. - flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND - return uint32(flags), nil -} - // CheckPermissions implements kernfs.Inode. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s syscall.Stat_t @@ -219,7 +248,7 @@ func (i *inode) Mode() linux.FileMode { } // Stat implements kernfs.Inode. -func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL } @@ -227,73 +256,73 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro return linux.Statx{}, syserror.EINVAL } + fs := vfsfs.Impl().(*filesystem) + // Limit our host call only to known flags. mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) - // Fallback to fstat(2), if statx(2) is not supported on the host. - // - // TODO(b/151263641): Remove fallback. if err == syserror.ENOSYS { - return i.fstat(opts) - } else if err != nil { + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + return i.fstat(fs) + } + if err != nil { return linux.Statx{}, err } - ls := linux.Statx{Mask: mask} - // Unconditionally fill blksize, attributes, and device numbers, as indicated - // by /include/uapi/linux/stat.h. - // - // RdevMajor/RdevMinor are left as zero, so as not to expose host device - // numbers. - // - // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined - // device numbers. If we use the device number from the host, it may collide - // with another sentry-internal device number. We handle device/inode - // numbers without relying on the host to prevent collisions. - ls.Blksize = s.Blksize - ls.Attributes = s.Attributes - ls.AttributesMask = s.Attributes_mask - - if mask&linux.STATX_TYPE != 0 { + // Unconditionally fill blksize, attributes, and device numbers, as + // indicated by /include/uapi/linux/stat.h. Inode number is always + // available, since we use our own rather than the host's. + ls := linux.Statx{ + Mask: linux.STATX_INO, + Blksize: s.Blksize, + Attributes: s.Attributes, + Ino: i.ino, + AttributesMask: s.Attributes_mask, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + } + + // Copy other fields that were returned by the host. RdevMajor/RdevMinor + // are never copied (and therefore left as zero), so as not to expose host + // device numbers. + ls.Mask |= s.Mask & linux.STATX_ALL + if s.Mask&linux.STATX_TYPE != 0 { ls.Mode |= s.Mode & linux.S_IFMT } - if mask&linux.STATX_MODE != 0 { + if s.Mask&linux.STATX_MODE != 0 { ls.Mode |= s.Mode &^ linux.S_IFMT } - if mask&linux.STATX_NLINK != 0 { + if s.Mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } - if mask&linux.STATX_UID != 0 { + if s.Mask&linux.STATX_UID != 0 { ls.UID = s.Uid } - if mask&linux.STATX_GID != 0 { + if s.Mask&linux.STATX_GID != 0 { ls.GID = s.Gid } - if mask&linux.STATX_ATIME != 0 { + if s.Mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) } - if mask&linux.STATX_BTIME != 0 { + if s.Mask&linux.STATX_BTIME != 0 { ls.Btime = unixToLinuxStatxTimestamp(s.Btime) } - if mask&linux.STATX_CTIME != 0 { + if s.Mask&linux.STATX_CTIME != 0 { ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) } - if mask&linux.STATX_MTIME != 0 { + if s.Mask&linux.STATX_MTIME != 0 { ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) } - if mask&linux.STATX_SIZE != 0 { + if s.Mask&linux.STATX_SIZE != 0 { ls.Size = s.Size } - if mask&linux.STATX_BLOCKS != 0 { + if s.Mask&linux.STATX_BLOCKS != 0 { ls.Blocks = s.Blocks } - // Use our own internal inode number. - if mask&linux.STATX_INO != 0 { - ls.Ino = i.ino - } - return ls, nil } @@ -305,36 +334,30 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro // of a mask or sync flags. fstat(2) does not provide any metadata // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so // those fields remain empty. -func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { var s unix.Stat_t if err := unix.Fstat(i.hostFD, &s); err != nil { return linux.Statx{}, err } - // Note that rdev numbers are left as 0; do not expose host device numbers. - ls := linux.Statx{ - Mask: linux.STATX_BASIC_STATS, - Blksize: uint32(s.Blksize), - Nlink: uint32(s.Nlink), - UID: s.Uid, - GID: s.Gid, - Mode: uint16(s.Mode), - Size: uint64(s.Size), - Blocks: uint64(s.Blocks), - Atime: timespecToStatxTimestamp(s.Atim), - Ctime: timespecToStatxTimestamp(s.Ctim), - Mtime: timespecToStatxTimestamp(s.Mtim), - } - - // Use our own internal inode number. - // - // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. - // If we use the device number from the host, it may collide with another - // sentry-internal device number. We handle device/inode numbers without - // relying on the host to prevent collisions. - ls.Ino = i.ino - - return ls, nil + // As with inode.Stat(), we always use internal device and inode numbers, + // and never expose the host's represented device numbers. + return linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + UID: s.Uid, + GID: s.Gid, + Mode: uint16(s.Mode), + Ino: i.ino, + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + }, nil } // SetStat implements kernfs.Inode. @@ -399,19 +422,19 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr if i.Mode().FileType() == linux.S_IFSOCK { return nil, syserror.ENXIO } - return i.open(ctx, vfsd, rp.Mount()) + return i.open(ctx, vfsd, rp.Mount(), opts.Flags) } -func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err } fileType := s.Mode & linux.FileTypeMask - flags, err := fileFlagsFromHostFD(i.hostFD) - if err != nil { - return nil, err - } + + // Constrain flags to a subset we can handle. + // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. + flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND if fileType == syscall.S_IFSOCK { if i.isTTY { @@ -453,8 +476,6 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. -// -// TODO(gvisor.dev/issue/1672): Implement Waitable interface. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -471,12 +492,12 @@ type fileDescription struct { // SetStat implements vfs.FileDescriptionImpl. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return f.inode.SetStat(ctx, nil, creds, opts) + return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } // Stat implements vfs.FileDescriptionImpl. func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { - return f.inode.Stat(nil, opts) + return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts) } // Release implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c7779fc11..1568a9d49 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -44,11 +44,11 @@ type DynamicBytesFile struct { var _ Inode = (*DynamicBytesFile)(nil) // Init initializes a dynamic bytes file. -func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { +func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) f.data = data } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 615592d5f..982daa2e6 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -192,15 +192,17 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // // Must be initialized by Init prior to first use. type InodeAttrs struct { - ino uint64 - mode uint32 - uid uint32 - gid uint32 - nlink uint32 + devMajor uint32 + devMinor uint32 + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 } // Init initializes this InodeAttrs. -func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) { +func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } @@ -209,6 +211,8 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo if mode.FileType() == linux.ModeDirectory { nlink = 2 } + a.devMajor = devMajor + a.devMinor = devMinor atomic.StoreUint64(&a.ino, ino) atomic.StoreUint32(&a.mode, uint32(mode)) atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) @@ -216,6 +220,16 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo atomic.StoreUint32(&a.nlink, nlink) } +// DevMajor returns the device major number. +func (a *InodeAttrs) DevMajor() uint32 { + return a.devMajor +} + +// DevMinor returns the device minor number. +func (a *InodeAttrs) DevMinor() uint32 { + return a.devMinor +} + // Ino returns the inode id. func (a *InodeAttrs) Ino() uint64 { return atomic.LoadUint64(&a.ino) @@ -232,6 +246,8 @@ func (a *InodeAttrs) Mode() linux.FileMode { func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.DevMajor = a.devMajor + stat.DevMinor = a.devMinor stat.Ino = atomic.LoadUint64(&a.ino) stat.Mode = uint16(a.Mode()) stat.UID = atomic.LoadUint32(&a.uid) @@ -544,9 +560,9 @@ type StaticDirectory struct { var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { inode := &StaticDirectory{} - inode.Init(creds, ino, perm) + inode.Init(creds, devMajor, devMinor, ino, perm) dentry := &Dentry{} dentry.Init(inode) @@ -559,11 +575,11 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil } // Init initializes StaticDirectory. -func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } // Open implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 1c5d3e7e7..412cf6ac9 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -75,7 +75,7 @@ type file struct { func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { f := &file{} f.content = content - f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777) + f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) d := &kernfs.Dentry{} d.Init(f) @@ -107,7 +107,7 @@ type readonlyDir struct { func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &readonlyDir{} - dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) dir.dentry.Init(dir) @@ -137,7 +137,7 @@ type dir struct { func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &dir{} dir.fs = fs - dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) dir.dentry.Init(dir) diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 0aa6dc979..2ab3f53fd 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -35,9 +35,9 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry { +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { inode := &StaticSymlink{} - inode.Init(creds, ino, target) + inode.Init(creds, devMajor, devMinor, ino, target) d := &Dentry{} d.Init(inode) @@ -45,9 +45,9 @@ func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentr } // Init initializes the instance. -func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) { +func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { s.target = target - s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777) + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } // Readlink implements Inode. diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 0d411606f..5950a2d59 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 5375e5e75..cab771211 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -17,8 +17,11 @@ package pipefs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" @@ -40,20 +43,36 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile panic("pipefs.filesystemType.GetFilesystem should never be called") } -// TODO(gvisor.dev/issue/1193): -// -// - kernfs does not provide a way to implement statfs, from which we -// should indicate PIPEFS_MAGIC. -// -// - kernfs does not provide a way to override names for -// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic -// name fmt.Sprintf("pipe:[%d]", inode.ino). +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} // NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} } // inode implements kernfs.Inode. @@ -71,11 +90,11 @@ type inode struct { ctime ktime.Time } -func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { +func newInode(ctx context.Context, fs *filesystem) *inode { creds := auth.CredentialsFromContext(ctx) return &inode{ pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), - ino: fs.NextIno(), + ino: fs.Filesystem.NextIno(), uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, ctime: ktime.NowFromContext(ctx), @@ -98,19 +117,20 @@ func (i *inode) Mode() linux.FileMode { func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) return linux.Statx{ - Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, - Blksize: usermem.PageSize, - Nlink: 1, - UID: uint32(i.uid), - GID: uint32(i.gid), - Mode: pipeMode, - Ino: i.ino, - Size: 0, - Blocks: 0, - Atime: ts, - Ctime: ts, - Mtime: ts, - // TODO(gvisor.dev/issue/1197): Device number. + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: vfsfs.Impl().(*filesystem).devMinor, }, nil } @@ -122,6 +142,9 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. return syserror.EPERM } +// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement +// statfs, from which we should indicate PIPEFS_MAGIC. + // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) @@ -132,7 +155,7 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 104fc9030..609210253 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -41,6 +41,12 @@ func (FilesystemType) Name() string { return Name } +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) @@ -51,8 +57,13 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF if pidns == nil { return nil, nil, fmt.Errorf("procfs requires a PID namespace") } - - procfs := &kernfs.Filesystem{} + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + procfs := &filesystem{ + devMinor: devMinor, + } procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var cgroups map[string]string @@ -61,21 +72,27 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF cgroups = data.Cgroups } - _, dentry := newTasksInode(procfs, k, pidns, cgroups) + _, dentry := procfs.newTasksInode(k, pidns, cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource - Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) + Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } -func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { - inode.Init(creds, ino, inode, perm) +func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) d := &kernfs.Dentry{} d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index a5cfa8333..36a911db4 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -37,23 +37,23 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace - inoGen InoGenerator cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) -func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { subInode := &subtasksInode{ + fs: fs, task: task, pidns: pidns, - inoGen: inoGen, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. - subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) inode := &taskOwnedInode{Inode: subInode, owner: task} @@ -78,7 +78,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e return nil, syserror.ENOENT } - subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers) + subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) return subTaskDentry.VFSDentry(), nil } @@ -102,7 +102,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 66419d91b..482055db1 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -43,45 +43,45 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. contents := map[string]*kernfs.Dentry{ - "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), - "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": newComm(task, inoGen.NextIno(), 0444), - "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - "exe": newExeSymlink(task, inoGen.NextIno()), - "fd": newFDDirInode(task, inoGen), - "fdinfo": newFDInfoDirInode(task, inoGen), - "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), - "mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}), - "net": newTaskNetDir(task, inoGen), - "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ - "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), - "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), - "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), + "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": fs.newComm(task, fs.NextIno(), 0444), + "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": fs.newExeSymlink(task, fs.NextIno()), + "fd": fs.newFDDirInode(task), + "fdinfo": fs.newFDInfoDirInode(task), + "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "net": fs.newTaskNetDir(task), + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), + "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), + "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), }), - "oom_score": newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) + contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) } if len(cgroupControllers) > 0 { - contents["cgroup"] = newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers)) + contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) } taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. - taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -126,9 +126,9 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, inode, perm) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) taskInode := &taskOwnedInode{Inode: inode, owner: task} d := &kernfs.Dentry{} @@ -136,11 +136,11 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode return d } -func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &kernfs.StaticDirectory{} // Note: credentials are overridden by taskOwnedInode. - dir.Init(task.Credentials(), ino, perm) + dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 8ad976073..44ccc9e4a 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -53,8 +53,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { - inoGen InoGenerator - task *kernel.Task + fs *filesystem + task *kernel.Task // When produceSymlinks is set, dirents produces for the FDs are reported // as symlink. Otherwise, they are reported as regular files. @@ -85,7 +85,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(fd), 10), Type: typ, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -110,15 +110,15 @@ type fdDirInode struct { var _ kernfs.Inode = (*fdDirInode)(nil) -func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { inode := &fdDirInode{ fdDir: fdDir{ - inoGen: inoGen, + fs: fs, task: task, produceSymlink: true, }, } - inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -137,7 +137,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro if !taskFDExists(i.task, fd) { return nil, syserror.ENOENT } - taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno()) + taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) return taskDentry.VFSDentry(), nil } @@ -186,12 +186,12 @@ type fdSymlink struct { var _ kernfs.Inode = (*fdSymlink)(nil) -func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { inode := &fdSymlink{ task: task, fd: fd, } - inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -234,14 +234,14 @@ type fdInfoDirInode struct { var _ kernfs.Inode = (*fdInfoDirInode)(nil) -func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { inode := &fdInfoDirInode{ fdDir: fdDir{ - inoGen: inoGen, - task: task, + fs: fs, + task: task, }, } - inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -264,7 +264,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, task: i.task, fd: fd, } - dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) + dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data) return dentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 515f25327..2f297e48a 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -241,9 +241,9 @@ type commInode struct { task *kernel.Task } -func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { inode := &commInode{task: task} - inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm) + inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) d := &kernfs.Dentry{} d.Init(inode) @@ -596,9 +596,9 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { inode := &exeSymlink{task: task} - inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -729,7 +729,7 @@ type namespaceSymlink struct { task *kernel.Task } -func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -737,7 +737,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, target) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} d := &kernfs.Dentry{} @@ -780,11 +780,11 @@ type namespaceInode struct { var _ kernfs.Inode = (*namespaceInode)(nil) // Init initializes a namespace inode. -func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { +func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - i.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } // Open implements Inode.Open. diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 9c329341a..6bde27376 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -37,7 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) @@ -57,37 +57,37 @@ func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. contents = map[string]*kernfs.Dentry{ - "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), - "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), + "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. - "arp": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)), - "netlink": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)), - "netstat": newDentry(root, inoGen.NextIno(), 0444, &netStatData{}), - "packet": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)), - "protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)), + "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), + "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), + "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), + "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), + "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)), - "ptype": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)), - "route": newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}), - "tcp": newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}), - "udp": newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}), - "unix": newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}), + "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), + "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), + "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")) - contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) } } - return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents) + return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 5aeda8c9b..b51d43954 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -33,11 +33,6 @@ const ( threadSelfName = "thread-self" ) -// InoGenerator generates unique inode numbers for a given filesystem. -type InoGenerator interface { - NextIno() uint64 -} - // tasksInode represents the inode for /proc/ directory. // // +stateify savable @@ -48,8 +43,8 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - inoGen InoGenerator - pidns *kernel.PIDNamespace + fs *filesystem + pidns *kernel.PIDNamespace // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. @@ -64,29 +59,29 @@ type tasksInode struct { var _ kernfs.Inode = (*tasksInode)(nil) -func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { +func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), - "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), - "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen, k), - "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), - "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), - "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), + "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), + "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}), + "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}), + "sys": fs.newSysDir(root, k), + "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), + "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}), + "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}), + "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, - inoGen: inoGen, - selfSymlink: newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), - threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), + fs: fs, + selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), + threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } - inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -118,7 +113,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers) + taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) return taskDentry.VFSDentry(), nil } @@ -144,7 +139,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: selfName, Type: linux.DT_LNK, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -156,7 +151,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: threadSelfName, Type: linux.DT_LNK, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -189,7 +184,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } if err := cb.Handle(dirent); err != nil { diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index e5f13b69e..7d8983aa5 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -41,9 +41,9 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &selfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|0777) + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -83,9 +83,9 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|0777) + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 0e90e02fe..6dac2afa4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -30,89 +30,89 @@ import ( ) // newSysDir returns the dentry corresponding to /proc/sys directory. -func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}), - "shmall": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)), - "shmmax": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)), - "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), +func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { + return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}), + "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)), + "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)), + "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)), }), - "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{k: k}), - "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), + "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}), + "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")), }), - "net": newSysNetDir(root, inoGen, k), + "net": fs.newSysNetDir(root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. -func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { +func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ - "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), + "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). - "ip_local_port_range": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000 65535")), - "ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "ipfrag_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")), - "ip_nonlocal_bind": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "ip_no_pmtu_disc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")), + "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), // tcp_allowed_congestion_control tell the user what they are able to // do as an unprivledged process so we leave it empty. - "tcp_allowed_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), - "tcp_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), + "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled. - "tcp_base_mss": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")), - "tcp_dsack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_early_retrans": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen_key": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "tcp_invalid_ratelimit": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_intvl": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_probes": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")), - "tcp_mtu_probing": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_no_metrics_save": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_probe_interval": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_probe_threshold": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_retries1": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), - "tcp_retries2": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")), - "tcp_rfc1337": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_synack_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), - "tcp_syn_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), - "tcp_timestamps": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")), + "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")), + "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), + "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")), + "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), + "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), + "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), }), - "core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")), - "message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")), - "message_cost": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), - "optmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "rmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "rmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "somaxconn": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")), - "wmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "wmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), + "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), + "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), + "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")), + "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), }), } } - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) + return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD new file mode 100644 index 000000000..067c1657f --- /dev/null +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/kernel", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go new file mode 100644 index 000000000..d29ef3f83 --- /dev/null +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package signalfd + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalFileDescription implements FileDescriptionImpl for signal fds. +type SignalFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // target is the original signal target task. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects mask. + mu sync.Mutex + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) + +// New creates a new signal fd. +func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[signalfd]") + defer vd.DecRef() + sfd := &SignalFileDescription{ + target: target, + mask: mask, + } + if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &sfd.vfsfd, nil +} + +// Mask returns the signal mask. +func (sfd *SignalFileDescription) Mask() linux.SignalSet { + sfd.mu.Lock() + defer sfd.mu.Unlock() + return sfd.mask +} + +// SetMask sets the signal mask. +func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { + sfd.mu.Lock() + defer sfd.mu.Unlock() + sfd.mask = mask +} + +// Read implements FileDescriptionImpl.Read. +func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. + var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + sfd.mu.Lock() + defer sfd.mu.Unlock() + if mask&waiter.EventIn != 0 && sfd.target.PendingSignals()&sfd.mask != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sfd *SignalFileDescription) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + sfd.mu.Lock() + defer sfd.mu.Unlock() + // Register for the signal set; ignore the passed events. + sfd.target.SignalRegister(entry, waiter.EventMask(sfd.mask)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. + sfd.target.SignalUnregister(entry) +} + +// Release implements FileDescriptionImpl.Release() +func (sfd *SignalFileDescription) Release() {} diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 52084ddb5..9453277b8 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 239a9f4b4..ee0828a15 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -16,8 +16,11 @@ package sockfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -41,19 +44,42 @@ func (filesystemType) Name() string { return "sockfs" } +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + // NewFilesystem sets up and returns a new sockfs filesystem. // // Note that there should only ever be one instance of sockfs.Filesystem, // backing a global socket mount. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino())) + return vfs.PrependPathSyntheticError{} } // inode implements kernfs.Inode. -// -// TODO(gvisor.dev/issue/1193): Device numbers. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -67,11 +93,15 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } // NewDentry constructs and returns a sockfs dentry. -func NewDentry(creds *auth.Credentials, ino uint64) *vfs.Dentry { +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { + fs := mnt.Filesystem().Impl().(*filesystem) + // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) i := &inode{} - i.Init(creds, ino, filemode) + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} d.Init(i) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 00f7d6214..0af373604 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -37,6 +37,8 @@ type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem + + devMinor uint32 } // Name implements vfs.FilesystemType.Name. @@ -46,7 +48,14 @@ func (FilesystemType) Name() string { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - fs := &filesystem{} + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() @@ -77,6 +86,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + // dir implements kernfs.Inode. type dir struct { kernfs.InodeAttrs @@ -90,7 +105,7 @@ type dir struct { func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { d := &dir{} - d.InodeAttrs.Init(creds, fs.NextIno(), linux.ModeDirectory|0755) + d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.dentry.Init(d) @@ -127,7 +142,7 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { c := &cpuFile{maxCores: maxCores} - c.DynamicBytesFile.Init(creds, fs.NextIno(), c, mode) + c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) d := &kernfs.Dentry{} d.Init(c) return d diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD new file mode 100644 index 000000000..fbb02a271 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "timerfd", + srcs = ["timerfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 42b880656..60c92d626 100644 --- a/pkg/sentry/vfs/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -package vfs +// Package timerfd implements timer fds. +package timerfd import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -27,9 +29,9 @@ import ( // TimerFileDescription implements FileDescriptionImpl for timer fds. It also // implements ktime.TimerListener. type TimerFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl events waiter.Queue timer *ktime.Timer @@ -40,20 +42,19 @@ type TimerFileDescription struct { val uint64 } -var _ FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) var _ ktime.TimerListener = (*TimerFileDescription)(nil) -// NewTimerFD returns a new timer fd. -func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[timerfd]") +// New returns a new timer fd. +func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[timerfd]") defer vd.DecRef() tfd := &TimerFileDescription{} tfd.timer = ktime.NewTimer(clock, tfd) - if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, - InvalidWrite: true, }); err != nil { return nil, err } @@ -61,7 +62,7 @@ func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*File } // Read implements FileDescriptionImpl.Read. -func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { +func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { return 0, syserror.EINVAL diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index efc931468..405928bd0 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -63,6 +63,9 @@ type filesystem struct { // clock is a realtime clock used to set timestamps in file operations. clock time.Clock + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + // mu serializes changes to the Dentry tree. mu sync.RWMutex @@ -96,11 +99,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } - clock := time.RealtimeClockFromContext(ctx) - fs := filesystem{ - memFile: memFileProvider.MemoryFile(), - clock: clock, - } rootFileType := uint16(linux.S_IFDIR) newFSType := vfs.FilesystemType(&fstype) @@ -114,6 +112,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + clock := time.RealtimeClockFromContext(ctx) + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + clock: clock, + devMinor: devMinor, + } fs.vfsfs.Init(vfsObj, newFSType, &fs) var root *dentry @@ -125,6 +133,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = &fs.newDirectory(creds, 01777).dentry default: + fs.vfsfs.DecRef() return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &root.vfsd, nil @@ -132,6 +141,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. @@ -188,8 +198,8 @@ func (d *dentry) DecRef() { // inode represents a filesystem object. type inode struct { - // clock is a realtime clock used to set timestamps in file operations. - clock time.Clock + // fs is the owning filesystem. fs is immutable. + fs *filesystem // refs is a reference count. refs is accessed using atomic memory // operations. @@ -232,7 +242,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, if mode.FileType() == 0 { panic("file type is required in FileMode") } - i.clock = fs.clock + i.fs = fs i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) @@ -327,7 +337,8 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) - // TODO(gvisor.dev/issue/1197): Device number. + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = i.fs.devMinor switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS @@ -401,7 +412,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EINVAL } } - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.atime, now) @@ -518,7 +529,7 @@ func (i *inode) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.atime, now) i.mu.Unlock() @@ -527,7 +538,7 @@ func (i *inode) touchAtime(mnt *vfs.Mount) { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCtime() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() @@ -535,7 +546,7 @@ func (i *inode) touchCtime() { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCMtime() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) @@ -545,7 +556,7 @@ func (i *inode) touchCMtime() { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds // inode.mu. func (i *inode) touchCMtimeLocked() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go index 5e9e60fc4..cd4dc67fb 100644 --- a/pkg/sentry/hostfd/hostfd_unsafe.go +++ b/pkg/sentry/hostfd/hostfd_unsafe.go @@ -34,24 +34,13 @@ func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint6 n uintptr e syscall.Errno ) - // Avoid preadv2(2) if possible, since it's relatively new and thus least - // likely to be supported by the host kernel. - if flags == 0 { - if dsts.NumBlocks() == 1 { - // Use read() or pread() to avoid iovec allocation and copying. - dst := dsts.Head() - if offset == -1 { - n, _, e = syscall.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len())) - } else { - n, _, e = syscall.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) - } + if flags == 0 && dsts.NumBlocks() == 1 { + // Use read() or pread() to avoid iovec allocation and copying. + dst := dsts.Head() + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len())) } else { - iovs := safemem.IovecsFromBlockSeq(dsts) - if offset == -1 { - n, _, e = syscall.Syscall(unix.SYS_READV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs))) - } else { - n, _, e = syscall.Syscall6(unix.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */) - } + n, _, e = syscall.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) } } else { iovs := safemem.IovecsFromBlockSeq(dsts) @@ -77,24 +66,13 @@ func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint n uintptr e syscall.Errno ) - // Avoid pwritev2(2) if possible, since it's relatively new and thus least - // likely to be supported by the host kernel. - if flags == 0 { - if srcs.NumBlocks() == 1 { - // Use write() or pwrite() to avoid iovec allocation and copying. - src := srcs.Head() - if offset == -1 { - n, _, e = syscall.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len())) - } else { - n, _, e = syscall.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) - } + if flags == 0 && srcs.NumBlocks() == 1 { + // Use write() or pwrite() to avoid iovec allocation and copying. + src := srcs.Head() + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len())) } else { - iovs := safemem.IovecsFromBlockSeq(srcs) - if offset == -1 { - n, _, e = syscall.Syscall(unix.SYS_WRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs))) - } else { - n, _, e = syscall.Syscall6(unix.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */) - } + n, _, e = syscall.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) } } else { iovs := safemem.IovecsFromBlockSeq(srcs) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e47af66d6..8104f50f3 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -172,6 +172,7 @@ go_library( "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c91b9dce2..3617da8c6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -48,10 +48,11 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -372,7 +373,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("failed to initialize VFS: %v", err) } - pipeFilesystem := pipefs.NewFilesystem(&k.vfs) + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create pipefs filesystem: %v", err) + } defer pipeFilesystem.DecRef() pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { @@ -380,7 +384,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount - socketFilesystem := sockfs.NewFilesystem(&k.vfs) + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create sockfs filesystem: %v", err) + } defer socketFilesystem.DecRef() socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { @@ -1068,11 +1075,11 @@ func (k *Kernel) pauseTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.PauseTimer() } } @@ -1104,11 +1111,11 @@ func (k *Kernel) resumeTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.ResumeTimer() } } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index deedd35f7..e82d6cd1e 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,7 +26,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index a8278bffc..677743113 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -50,8 +49,7 @@ var _ = socket.SocketVFS2(&socketVFS2{}) func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) s := &socketVFS2{ socketOpsCommon: socketOpsCommon{ diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 40736fb38..789bb94c8 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,6 +17,7 @@ package netfilter import ( + "bytes" "errors" "fmt" @@ -58,6 +59,13 @@ type metadata struct { // developing iptables, but can pollute sentry logs otherwise. const enableLogging = false +// emptyFilter is for comparison with a rule's filters to determine whether it +// is also empty. It is immutable. +var emptyFilter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00", +} + // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { if enableLogging && log.IsLogging(log.Debug) { @@ -204,6 +212,16 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI TargetOffset: linux.SizeOfIPTEntry, }, } + copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst) + copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + if rule.Filter.DstInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP + } + if rule.Filter.OutputInterfaceInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT + } for _, matcher := range rule.Matchers { // Serialize the matcher and add it to the @@ -473,7 +491,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { } if offset == replace.Underflow[hook] { if !validUnderflow(table.Rules[ruleIdx]) { - nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP") + nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx) return syserr.ErrInvalidArgument } table.Underflows[hk] = ruleIdx @@ -536,7 +554,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // make sure all other chains point to ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { if hook == stack.Forward || hook == stack.Postrouting { - if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok { + if !isUnconditionalAccept(table.Rules[ruleIdx]) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } @@ -719,11 +737,27 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) + } + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + return stack.IPHeaderFilter{ - Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), - Dst: tcpip.Address(iptip.Dst[:]), - DstMask: tcpip.Address(iptip.DstMask[:]), - DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, }, nil } @@ -732,16 +766,15 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool { // - Protocol // - Dst and DstMask // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. var emptyInetAddr = linux.InetAddr{} var emptyInterface = [linux.IFNAMSIZ]byte{} // Disable any supported inverse flags. - inverseMask := uint8(linux.IPT_INV_DSTIP) + inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_VIA_OUT) return iptip.Src != emptyInetAddr || iptip.SrcMask != emptyInetAddr || iptip.InputInterface != emptyInterface || - iptip.OutputInterface != emptyInterface || iptip.InputInterfaceMask != emptyInterface || - iptip.OutputInterfaceMask != emptyInterface || iptip.Flags != 0 || iptip.InverseFlags&^inverseMask != 0 } @@ -750,6 +783,9 @@ func validUnderflow(rule stack.Rule) bool { if len(rule.Matchers) != 0 { return false } + if rule.Filter != emptyFilter { + return false + } switch rule.Target.(type) { case stack.AcceptTarget, stack.DropTarget: return true @@ -758,6 +794,14 @@ func validUnderflow(rule stack.Rule) bool { } } +func isUnconditionalAccept(rule stack.Rule) bool { + if !validUnderflow(rule) { + return false + } + _, ok := rule.Target.(stack.AcceptTarget) + return ok +} + func hookFromLinux(hook int) stack.Hook { switch hook { case linux.NF_INET_PRE_ROUTING: diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 09ca00a4a..7212d8644 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go index dcd92b5cd..bb205be0d 100644 --- a/pkg/sentry/socket/netlink/provider_vfs2.go +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -16,7 +16,6 @@ package netlink import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -52,8 +51,7 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol vfsfd := &s.vfsfd mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index ccf9fcf5c..333e0042e 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -18,6 +18,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/amutex", "//pkg/binary", "//pkg/context", "//pkg/log", @@ -27,7 +28,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 81053d8ef..9d032f052 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -34,6 +34,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -553,11 +554,9 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } if resCh != nil { - t := kernel.TaskFromContext(ctx) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) } @@ -626,11 +625,9 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader } if resCh != nil { - t := kernel.TaskFromContext(ctx) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{ Atomic: true, // See above. }) diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index f7d9b2ff4..fcd8013c0 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -16,9 +16,9 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -53,8 +53,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu } mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) s := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ @@ -91,11 +90,6 @@ func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { s.socketOpsCommon.EventUnregister(e) } -// PRead implements vfs.FileDescriptionImpl. -func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Read implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. @@ -117,11 +111,6 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. return int64(n), nil } -// PWrite implements vfs.FileDescriptionImpl. -func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Write implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. @@ -137,11 +126,9 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs } if resCh != nil { - t := kernel.TaskFromContext(ctx) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) } diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 941a91097..de2cc4bdf 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -21,7 +21,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 06d838868..45e109361 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -50,8 +49,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{}) // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) if err != nil { diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 245e8fe1e..217fcfef2 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -49,7 +49,8 @@ go_library( "sys_time.go", "sys_timer.go", "sys_timerfd.go", - "sys_tls.go", + "sys_tls_amd64.go", + "sys_tls_arm64.go", "sys_utsname.go", "sys_write.go", "sys_xattr.go", diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 2ddb2b146..434559b80 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -21,9 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and +// CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and // STOP are clear. -func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { +// +// TODO(gvisor.dev/issue/1624): This is only exported because +// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2 +// syscalls are moved into this package, then they can be unexported. +func CopyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { return 0, syserror.EINVAL } diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 51bf205cf..7f460d30b 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -131,7 +131,7 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy maskSize := uint(args[5].Uint()) if maskAddr != 0 { - mask, err := copyInSigSet(t, maskAddr, maskSize) + mask, err := CopyInSigSet(t, maskAddr, maskSize) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 4f8762d7d..f0198141c 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -443,7 +443,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if maskAddr != 0 { - mask, err := copyInSigSet(t, maskAddr, maskSize) + mask, err := CopyInSigSet(t, maskAddr, maskSize) if err != nil { return 0, nil, err } @@ -525,7 +525,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } if maskAddr != 0 { - mask, err := copyInSigSet(t, maskAddr, size) + mask, err := CopyInSigSet(t, maskAddr, size) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 582d37e03..d2b0012ae 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -295,7 +295,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel } oldmask := t.SignalMask() if setaddr != 0 { - mask, err := copyInSigSet(t, setaddr, sigsetsize) + mask, err := CopyInSigSet(t, setaddr, sigsetsize) if err != nil { return 0, nil, err } @@ -366,7 +366,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne timespec := args[2].Pointer() sigsetsize := args[3].SizeT() - mask, err := copyInSigSet(t, sigset, sigsetsize) + mask, err := CopyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } @@ -518,7 +518,7 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // sharedSignalfd is shared between the two calls. func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { // Copy in the signal mask. - mask, err := copyInSigSet(t, sigset, sigsetsize) + mask, err := CopyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index b3eb96a1c..b3eb96a1c 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go new file mode 100644 index 000000000..fb08a356e --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build arm64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ArchPrctl is not defined for ARM64. +func ArchPrctl(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index a393e28c1..e9d702e8e 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 arm64 - package linux import ( diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 4c7b8f819..c32f942fb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -19,6 +19,7 @@ go_library( "poll.go", "read_write.go", "setstat.go", + "signal.go", "socket.go", "stat.go", "stat_amd64.go", @@ -38,7 +39,10 @@ go_library( "//pkg/gohacks", "//pkg/sentry/arch", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", + "//pkg/sentry/fsimpl/signalfd", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index bd2194972..aff1a2070 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,12 +32,13 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EINVAL } + vfsObj := t.Kernel().VFS() fileFlags := uint32(linux.O_RDWR) if flags&linux.EFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := t.Kernel().VFS().NewEventFD(initVal, semMode, fileFlags) + eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go new file mode 100644 index 000000000..623992f6f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// sharedSignalfd is shared between the two calls. +func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { + // Copy in the signal mask. + mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + // Always check for valid flags, even if not creating. + if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Is this a change to an existing signalfd? + // + // The spec indicates that this should adjust the mask. + if fd != -1 { + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is this a signalfd? + if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { + sfd.SetMask(mask) + return 0, nil, nil + } + + // Not a signalfd. + return 0, nil, syserror.EINVAL + } + + fileFlags := uint32(linux.O_RDWR) + if flags&linux.SFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + + // Create a new file. + vfsObj := t.Kernel().VFS() + file, err := signalfd.New(vfsObj, t, mask, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + // Create a new descriptor. + fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.SFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + // Done. + return uintptr(fd), nil, nil +} + +// Signalfd implements the linux syscall signalfd(2). +func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + return sharedSignalfd(t, fd, sigset, sigsetsize, 0) +} + +// Signalfd4 implements the linux syscall signalfd4(2). +func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + flags := args[3].Int() + return sharedSignalfd(t, fd, sigset, sigsetsize, flags) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 7938a5249..5ac79bc09 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -17,9 +17,9 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +32,12 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } - var fileFlags uint32 + // Timerfds aren't writable per se (their implementation of Write just + // returns EINVAL), but they are "opened for writing", which is necessary + // to actually reach said implementation of Write. + fileFlags := uint32(linux.O_RDWR) if flags&linux.TFD_NONBLOCK != 0 { - fileFlags = linux.O_NONBLOCK + fileFlags |= linux.O_NONBLOCK } var clock ktime.Clock @@ -46,7 +49,8 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel default: return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewTimerFD(clock, fileFlags) + vfsObj := t.Kernel().VFS() + file, err := timerfd.New(vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } @@ -77,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } @@ -111,7 +115,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index f1b697844..9c04677f1 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -139,14 +139,14 @@ func Override() { s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) s.Table[280] = syscalls.Supported("utimensat", Utimensat) s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) - delete(s.Table, 282) // signalfd + s.Table[282] = syscalls.Supported("signalfd", Signalfd) s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) s.Table[284] = syscalls.Supported("eventfd", Eventfd) delete(s.Table, 285) // fallocate s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[288] = syscalls.Supported("accept4", Accept4) - delete(s.Table, 289) // signalfd4 + s.Table[289] = syscalls.Supported("signalfd4", Signalfd4) s.Table[290] = syscalls.Supported("eventfd2", Eventfd2) s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) s.Table[292] = syscalls.Supported("dup3", Dup3) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index c62505fe2..94d69c1cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -25,7 +25,6 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", - "eventfd.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -37,12 +36,12 @@ go_library( "pathname.go", "permissions.go", "resolving_path.go", - "timerfd.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", @@ -70,7 +69,6 @@ go_test( name = "vfs_test", size = "small", srcs = [ - "eventfd_test.go", "file_description_impl_util_test.go", "mount_test.go", ], @@ -82,6 +80,5 @@ go_test( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", - "//pkg/waiter", ], ) diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 981bd8caa..caf770fd5 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -91,8 +91,6 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error { } // AccessAt implements vfs.Filesystem.Impl.AccessAt. -// -// TODO(gvisor.dev/issue/1965): Implement access permissions. func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { if !rp.Done() { return syserror.ENOTDIR @@ -204,7 +202,7 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St Ino: 1, Size: 0, Blocks: 0, - DevMajor: 0, + DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, }, nil } diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index bda5576fa..1e9dffc8f 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -103,7 +103,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo } // GetAnonBlockDevMinor allocates and returns an unused minor device number for -// an "anonymous" block device with major number 0. +// an "anonymous" block device with major number UNNAMED_MAJOR. func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { vfs.anonBlockDevMinorMu.Lock() defer vfs.anonBlockDevMinorMu.Unlock() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 418d69b96..cfabd936c 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -91,10 +91,6 @@ type FileDescriptionOptions struct { // ESPIPE. DenyPWrite bool - // if InvalidWrite is true, calls to FileDescription.Write() return - // EINVAL. - InvalidWrite bool - // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling @@ -570,9 +566,6 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - if fd.opts.InvalidWrite { - return 0, syserror.EINVAL - } if !fd.writable { return 0, syserror.EBADF } diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index f01217c91..9a3c5d6c3 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -59,6 +59,9 @@ func (v *View) Reader() bytes.Reader { // ToVectorisedView returns a VectorisedView containing the receiver. func (v View) ToVectorisedView() VectorisedView { + if len(v) == 0 { + return VectorisedView{} + } return NewVectorisedView(len(v), []View{v}) } @@ -229,6 +232,9 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) { // AppendView appends the given view into this vectorised view. func (vv *VectorisedView) AppendView(v View) { + if len(v) == 0 { + return + } vv.views = append(vv.views, v) vv.size += len(v) } diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index c56795c7b..726e54de9 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -483,3 +483,39 @@ func TestPullUp(t *testing.T) { } } } + +func TestToVectorisedView(t *testing.T) { + testCases := []struct { + in View + want VectorisedView + }{ + {nil, VectorisedView{}}, + {View{}, VectorisedView{}}, + {View{'a'}, VectorisedView{size: 1, views: []View{{'a'}}}}, + } + for _, tc := range testCases { + if got, want := tc.in.ToVectorisedView(), tc.want; !reflect.DeepEqual(got, want) { + t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want) + } + } +} + +func TestAppendView(t *testing.T) { + testCases := []struct { + vv VectorisedView + in View + want VectorisedView + }{ + {VectorisedView{}, nil, VectorisedView{}}, + {VectorisedView{}, View{}, VectorisedView{}}, + {VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, nil, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}}, + {VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}}, + {VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{'e'}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}, {'e'}}, 5}}, + } + for _, tc := range testCases { + tc.vv.AppendView(tc.in) + if got, want := tc.vv, tc.want; !reflect.DeepEqual(got, want) { + t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want) + } + } +} diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 21581257b..29454c4b9 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -609,5 +609,8 @@ func Acceptable(segSeq seqnum.Value, segLen seqnum.Size, rcvNxt, rcvAcc seqnum.V } // Page 70 of RFC 793 allows packets that can be made "acceptable" by trimming // the payload, so we'll accept any payload that overlaps the receieve window. - return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThan(rcvAcc) + // segSeq < rcvAcc is more correct according to RFC, however, Linux does it + // differently, it uses segSeq <= rcvAcc, we'd want to keep the same behavior + // as Linux. + return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThanEq(rcvAcc) } diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 880ea7de2..78420d6e6 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -34,5 +34,6 @@ go_test( "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/waiter", + "@com_github_google_go-cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 9db42b2a4..64046cbbf 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -249,10 +249,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) pkt.NetworkHeader = buffer.View(ip) + nicName := e.stack.FindNICNameFromID(e.NICID()) // iptables filtering. All packets that reach here are locally // generated. ipt := e.stack.IPTables() - if ok := ipt.Check(stack.Output, &pkt, gso, r, ""); !ok { + if ok := ipt.Check(stack.Output, &pkt, gso, r, "", nicName); !ok { // iptables is telling us to drop the packet. return nil } @@ -319,10 +320,11 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe pkt = pkt.Next() } + nicName := e.stack.FindNICNameFromID(e.NICID()) // iptables filtering. All packets that reach here are locally // generated. ipt := e.stack.IPTables() - dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r) + dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName) if len(dropped) == 0 && len(natPkts) == 0 { // Fast path: If no packets are to be dropped then we can just invoke the // faster WritePackets API directly. @@ -445,7 +447,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. ipt := e.stack.IPTables() - if ok := ipt.Check(stack.Input, &pkt, nil, nil, ""); !ok { + if ok := ipt.Check(stack.Input, &pkt, nil, nil, "", ""); !ok { // iptables is telling us to drop the packet. return } diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 5a864d832..36035c820 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -20,6 +20,7 @@ import ( "math/rand" "testing" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -473,3 +474,252 @@ func TestInvalidFragments(t *testing.T) { }) } } + +// TestReceiveFragments feeds fragments in through the incoming packet path to +// test reassembly +func TestReceiveFragments(t *testing.T) { + const addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1 + const addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2 + const nicID = 1 + + // Build and return a UDP header containing payload. + udpGen := func(payloadLen int, multiplier uint8) buffer.View { + payload := buffer.NewView(payloadLen) + for i := 0; i < len(payload); i++ { + payload[i] = uint8(i) * multiplier + } + + udpLength := header.UDPMinimumSize + len(payload) + + hdr := buffer.NewPrependable(udpLength) + u := header.UDP(hdr.Prepend(udpLength)) + u.Encode(&header.UDPFields{ + SrcPort: 5555, + DstPort: 80, + Length: uint16(udpLength), + }) + copy(u.Payload(), payload) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength)) + sum = header.Checksum(payload, sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + return hdr.View() + } + + // UDP header plus a payload of 0..256 + ipv4Payload1 := udpGen(256, 1) + udpPayload1 := ipv4Payload1[header.UDPMinimumSize:] + // UDP header plus a payload of 0..256 in increments of 2. + ipv4Payload2 := udpGen(128, 2) + udpPayload2 := ipv4Payload2[header.UDPMinimumSize:] + + type fragmentData struct { + id uint16 + flags uint8 + fragmentOffset uint16 + payload buffer.View + } + + tests := []struct { + name string + fragments []fragmentData + expectedPayloads [][]byte + }{ + { + name: "No fragmentation", + fragments: []fragmentData{ + { + id: 1, + flags: 0, + fragmentOffset: 0, + payload: ipv4Payload1, + }, + }, + expectedPayloads: [][]byte{udpPayload1}, + }, + { + name: "More fragments without payload", + fragments: []fragmentData{ + { + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1, + }, + }, + expectedPayloads: nil, + }, + { + name: "Non-zero fragment offset without payload", + fragments: []fragmentData{ + { + id: 1, + flags: 0, + fragmentOffset: 8, + payload: ipv4Payload1, + }, + }, + expectedPayloads: nil, + }, + { + name: "Two fragments", + fragments: []fragmentData{ + { + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1[:64], + }, + { + id: 1, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload1[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload1}, + }, + { + name: "Second fragment has MoreFlags set", + fragments: []fragmentData{ + { + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1[:64], + }, + { + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 64, + payload: ipv4Payload1[64:], + }, + }, + expectedPayloads: nil, + }, + { + name: "Two fragments with different IDs", + fragments: []fragmentData{ + { + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1[:64], + }, + { + id: 2, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload1[64:], + }, + }, + expectedPayloads: nil, + }, + { + name: "Two interleaved fragmented packets", + fragments: []fragmentData{ + { + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1[:64], + }, + { + id: 2, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload2[:64], + }, + { + id: 1, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload1[64:], + }, + { + id: 2, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload2[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload1, udpPayload2}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Setup a stack and endpoint. + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, + TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + }) + e := channel.New(0, 1280, tcpip.LinkAddress("\xf0\x00")) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, addr2); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err) + } + + wq := waiter.Queue{} + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + defer close(ch) + ep, err := s.NewEndpoint(udp.ProtocolNumber, header.IPv4ProtocolNumber, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, header.IPv4ProtocolNumber, err) + } + defer ep.Close() + + bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80} + if err := ep.Bind(bindAddr); err != nil { + t.Fatalf("Bind(%+v): %s", bindAddr, err) + } + + // Prepare and send the fragments. + for _, frag := range test.fragments { + hdr := buffer.NewPrependable(header.IPv4MinimumSize) + + // Serialize IPv4 fixed header. + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: header.IPv4MinimumSize + uint16(len(frag.payload)), + ID: frag.id, + Flags: frag.flags, + FragmentOffset: frag.fragmentOffset, + TTL: 64, + Protocol: uint8(header.UDPProtocolNumber), + SrcAddr: addr1, + DstAddr: addr2, + }) + + vv := hdr.View().ToVectorisedView() + vv.AppendView(frag.payload) + + e.InjectInbound(header.IPv4ProtocolNumber, stack.PacketBuffer{ + Data: vv, + }) + } + + if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want { + t.Errorf("got UDP Rx Packets = %d, want = %d", got, want) + } + + for i, expectedPayload := range test.expectedPayloads { + gotPayload, _, err := ep.Read(nil) + if err != nil { + t.Fatalf("(i=%d) Read(nil): %s", i, err) + } + if diff := cmp.Diff(buffer.View(expectedPayload), gotPayload); diff != "" { + t.Errorf("(i=%d) got UDP payload mismatch (-want +got):\n%s", i, diff) + } + } + + if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock { + t.Fatalf("(last) got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock) + } + }) + } +} diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go index 8b4213eec..d199ded6a 100644 --- a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go +++ b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Code generated by "stringer -type=DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT. +// Code generated by "stringer -type DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT. package stack @@ -22,9 +22,9 @@ func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} - _ = x[DHCPv6NoConfiguration-0] - _ = x[DHCPv6ManagedAddress-1] - _ = x[DHCPv6OtherConfigurations-2] + _ = x[DHCPv6NoConfiguration-1] + _ = x[DHCPv6ManagedAddress-2] + _ = x[DHCPv6OtherConfigurations-3] } const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAddressDHCPv6OtherConfigurations" @@ -32,8 +32,9 @@ const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAd var _DHCPv6ConfigurationFromNDPRA_index = [...]uint8{0, 21, 41, 66} func (i DHCPv6ConfigurationFromNDPRA) String() string { + i -= 1 if i < 0 || i >= DHCPv6ConfigurationFromNDPRA(len(_DHCPv6ConfigurationFromNDPRA_index)-1) { - return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i), 10) + ")" + return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i+1), 10) + ")" } return _DHCPv6ConfigurationFromNDPRA_name[_DHCPv6ConfigurationFromNDPRA_index[i]:_DHCPv6ConfigurationFromNDPRA_index[i+1]] } diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 7c3c47d50..443423b3c 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -16,6 +16,7 @@ package stack import ( "fmt" + "strings" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -178,7 +179,7 @@ const ( // dropped. // // Precondition: pkt.NetworkHeader is set. -func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address) bool { +func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool { // Packets are manipulated only if connection and matching // NAT rule exists. it.connections.HandlePacket(pkt, hook, gso, r) @@ -187,7 +188,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr for _, tablename := range it.Priorities[hook] { table := it.Tables[tablename] ruleIdx := table.BuiltinChains[hook] - switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address); verdict { + switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: continue @@ -228,10 +229,10 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // // NOTE: unlike the Check API the returned map contains packets that should be // dropped. -func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *Route) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { +func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *Route, nicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { if !pkt.NatDone { - if ok := it.Check(hook, pkt, gso, r, ""); !ok { + if ok := it.Check(hook, pkt, gso, r, "", nicName); !ok { if drop == nil { drop = make(map[*PacketBuffer]struct{}) } @@ -251,11 +252,11 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r * // Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a // precondition. -func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address) chainVerdict { +func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { - switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address); verdict { + switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { case RuleAccept: return chainAccept @@ -272,7 +273,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId ruleIdx++ continue } - switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address); verdict { + switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict { case chainAccept: return chainAccept case chainDrop: @@ -298,7 +299,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId // Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a // precondition. -func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) { +func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // If pkt.NetworkHeader hasn't been set yet, it will be contained in @@ -313,7 +314,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx } // Check whether the packet matches the IP header filter. - if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader)) { + if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader), hook, nicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } @@ -335,7 +336,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) } -func filterMatch(filter IPHeaderFilter, hdr header.IPv4) bool { +func filterMatch(filter IPHeaderFilter, hdr header.IPv4, hook Hook, nicName string) bool { // TODO(gvisor.dev/issue/170): Support other fields of the filter. // Check the transport protocol. if filter.Protocol != 0 && filter.Protocol != hdr.TransportProtocol() { @@ -355,5 +356,26 @@ func filterMatch(filter IPHeaderFilter, hdr header.IPv4) bool { return false } + // Check the output interface. + // TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING + // hooks after supported. + if hook == Output { + n := len(filter.OutputInterface) + if n == 0 { + return true + } + + // If the interface name ends with '+', any interface which begins + // with the name should be matched. + ifName := filter.OutputInterface + matches = true + if strings.HasSuffix(ifName, "+") { + matches = strings.HasPrefix(nicName, ifName[:n-1]) + } else { + matches = nicName == ifName + } + return filter.OutputInterfaceInvert != matches + } + return true } diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index 1bb0ba1bd..fe06007ae 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -158,6 +158,19 @@ type IPHeaderFilter struct { // true the filter will match packets that fail the destination // comparison. DstInvert bool + + // OutputInterface matches the name of the outgoing interface for the + // packet. + OutputInterface string + + // OutputInterfaceMask masks the characters of the interface name when + // comparing with OutputInterface. + OutputInterfaceMask string + + // OutputInterfaceInvert inverts the meaning of outgoing interface check, + // i.e. when true the filter will match packets that fail the outgoing + // interface comparison. + OutputInterfaceInvert bool } // A Matcher is the interface for matching packets. diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 15343acbc..526c7d6ff 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -199,9 +199,11 @@ var ( type DHCPv6ConfigurationFromNDPRA int const ( + _ DHCPv6ConfigurationFromNDPRA = iota + // DHCPv6NoConfiguration indicates that no configurations are available via // DHCPv6. - DHCPv6NoConfiguration DHCPv6ConfigurationFromNDPRA = iota + DHCPv6NoConfiguration // DHCPv6ManagedAddress indicates that addresses are available via DHCPv6. // @@ -315,9 +317,6 @@ type NDPDispatcher interface { // OnDHCPv6Configuration will be called with an updated configuration that is // available via DHCPv6 for a specified NIC. // - // NDPDispatcher assumes that the initial configuration available by DHCPv6 is - // DHCPv6NoConfiguration. - // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA) @@ -1808,6 +1807,8 @@ func (ndp *ndpState) cleanupState(hostOnly bool) { if got := len(ndp.defaultRouters); got != 0 { panic(fmt.Sprintf("ndp: still have discovered default routers after cleaning up; found = %d", got)) } + + ndp.dhcpv6Configuration = 0 } // startSolicitingRouters starts soliciting routers, as per RFC 4861 section diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 67f012840..b3d174cdd 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -4888,7 +4888,12 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { } } - // The initial DHCPv6 configuration should be stack.DHCPv6NoConfiguration. + // Even if the first RA reports no DHCPv6 configurations are available, the + // dispatcher should get an event. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false)) + expectDHCPv6Event(stack.DHCPv6NoConfiguration) + // Receiving the same update again should not result in an event to the + // dispatcher. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false)) expectNoDHCPv6Event() @@ -4896,8 +4901,6 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { // Configurations. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) expectDHCPv6Event(stack.DHCPv6OtherConfigurations) - // Receiving the same update again should not result in an event to the - // NDPDispatcher. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) expectNoDHCPv6Event() @@ -4933,6 +4936,21 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { expectDHCPv6Event(stack.DHCPv6OtherConfigurations) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) expectNoDHCPv6Event() + + // Cycling the NIC should cause the last DHCPv6 configuration to be cleared. + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("s.DisableNIC(%d): %s", nicID, err) + } + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + + // Receive an RA that updates the DHCPv6 configuration to Other + // Configurations. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) + expectDHCPv6Event(stack.DHCPv6OtherConfigurations) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) + expectNoDHCPv6Event() } // TestRouterSolicitation tests the initial Router Solicitations that are sent diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 8f4c1fe42..54103fdb3 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1233,7 +1233,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link // iptables filtering. ipt := n.stack.IPTables() address := n.primaryAddress(protocol) - if ok := ipt.Check(Prerouting, &pkt, nil, nil, address.Address); !ok { + if ok := ipt.Check(Prerouting, &pkt, nil, nil, address.Address, ""); !ok { // iptables is telling us to drop the packet. return } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index e33fae4eb..b39ffa9fb 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -1898,9 +1898,23 @@ func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, addres nic.mu.RLock() defer nic.mu.RUnlock() - // An endpoint with this id exists, check if it can be used and return it. + // An endpoint with this id exists, check if it can be + // used and return it. return ref.ep, nil } } return nil, tcpip.ErrBadAddress } + +// FindNICNameFromID returns the name of the nic for the given NICID. +func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { + s.mu.Lock() + defer s.mu.Unlock() + + nic, ok := s.nics[id] + if !ok { + return "" + } + + return nic.Name() +} diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 6fe97fefd..dd89a292a 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -70,7 +70,16 @@ func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale // acceptable checks if the segment sequence number range is acceptable // according to the table on page 26 of RFC 793. func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { - return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvAcc) + // r.rcvWnd could be much larger than the window size we advertised in our + // outgoing packets, we should use what we have advertised for acceptability + // test. + scaledWindowSize := r.rcvWnd >> r.rcvWndScale + if scaledWindowSize > 0xffff { + // This is what we actually put in the Window field. + scaledWindowSize = 0xffff + } + advertisedWindowSize := scaledWindowSize << r.rcvWndScale + return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize)) } // getSendParams returns the parameters needed by the sender when building @@ -259,7 +268,14 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo // If we are in one of the shutdown states then we need to do // additional checks before we try and process the segment. switch state { - case StateCloseWait, StateClosing, StateLastAck: + case StateCloseWait: + // If the ACK acks something not yet sent then we send an ACK. + if r.ep.snd.sndNxt.LessThan(s.ackNumber) { + r.ep.snd.sendAck() + return true, nil + } + fallthrough + case StateClosing, StateLastAck: if !s.sequenceNumber.LessThanEq(r.rcvNxt) { // Just drop the segment as we have // already received a FIN and this @@ -276,7 +292,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo // SHUT_RD) then any data past the rcvNxt should // trigger a RST. endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size())) - if rcvClosed && r.rcvNxt.LessThan(endDataSeq) { + if state != StateCloseWait && rcvClosed && r.rcvNxt.LessThan(endDataSeq) { return true, tcpip.ErrConnectionAborted } if state == StateFinWait1 { @@ -329,13 +345,6 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { state := r.ep.EndpointState() closed := r.ep.closed - if state != StateEstablished { - drop, err := r.handleRcvdSegmentClosing(s, state, closed) - if drop || err != nil { - return drop, err - } - } - segLen := seqnum.Size(s.data.Size()) segSeq := s.sequenceNumber @@ -347,6 +356,13 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { return true, nil } + if state != StateEstablished { + drop, err := r.handleRcvdSegmentClosing(s, state, closed) + if drop || err != nil { + return drop, err + } + } + // Store the time of the last ack. r.lastRcvdAckTime = time.Now() diff --git a/pkg/tcpip/transport/tcp/rcv_test.go b/pkg/tcpip/transport/tcp/rcv_test.go index c9eeff935..8a026ec46 100644 --- a/pkg/tcpip/transport/tcp/rcv_test.go +++ b/pkg/tcpip/transport/tcp/rcv_test.go @@ -30,7 +30,7 @@ func TestAcceptable(t *testing.T) { }{ // The segment is smaller than the window. {105, 2, 100, 104, false}, - {105, 2, 101, 105, false}, + {105, 2, 101, 105, true}, {105, 2, 102, 106, true}, {105, 2, 103, 107, true}, {105, 2, 104, 108, true}, @@ -39,7 +39,7 @@ func TestAcceptable(t *testing.T) { {105, 2, 107, 111, false}, // The segment is larger than the window. - {105, 4, 103, 105, false}, + {105, 4, 103, 105, true}, {105, 4, 104, 106, true}, {105, 4, 105, 107, true}, {105, 4, 106, 108, true}, diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 7712ce652..074edded6 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -96,6 +96,8 @@ func (s *segment) clone() *segment { route: s.route.Clone(), viewToDeliver: s.viewToDeliver, rcvdTime: s.rcvdTime, + xmitTime: s.xmitTime, + xmitCount: s.xmitCount, } t.data = s.data.Clone(t.views[:]) return t diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index a3018914b..9e547a221 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -598,22 +598,34 @@ func (s *sender) splitSeg(seg *segment, size int) { seg.data.CapLength(size) } -// NextSeg implements the RFC6675 NextSeg() operation. It returns segments that -// match rule 1, 3 and 4 of the NextSeg() operation defined in RFC6675. Rule 2 -// is handled by the normal send logic. -func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { +// NextSeg implements the RFC6675 NextSeg() operation. +// +// NextSeg starts scanning the writeList starting from nextSegHint and returns +// the hint to be passed on the next call to NextSeg. This is required to avoid +// iterating the write list repeatedly when NextSeg is invoked in a loop during +// recovery. The returned hint will be nil if there are no more segments that +// can match rules defined by NextSeg operation in RFC6675. +// +// rescueRtx will be true only if nextSeg is a rescue retransmission as +// described by Step 4) of the NextSeg algorithm. +func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { var s3 *segment var s4 *segment - smss := s.ep.scoreboard.SMSS() // Step 1. - for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { - if !s.isAssignedSequenceNumber(seg) { + for seg := nextSegHint; seg != nil; seg = seg.Next() { + // Stop iteration if we hit a segment that has never been + // transmitted (i.e. either it has no assigned sequence number + // or if it does have one, it's >= the next sequence number + // to be sent [i.e. >= s.sndNxt]). + if !s.isAssignedSequenceNumber(seg) || s.sndNxt.LessThanEq(seg.sequenceNumber) { + hint = nil break } segSeq := seg.sequenceNumber - if seg.data.Size() > int(smss) { + if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) { s.splitSeg(seg, int(smss)) } + // See RFC 6675 Section 4 // // 1. If there exists a smallest unSACKED sequence number @@ -630,8 +642,9 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { // NextSeg(): // (1.c) IsLost(S2) returns true. if s.ep.scoreboard.IsLost(segSeq) { - return seg, s3, s4 + return seg, seg.Next(), false } + // NextSeg(): // // (3): If the conditions for rules (1) and (2) @@ -643,6 +656,7 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { // SHOULD be returned. if s3 == nil { s3 = seg + hint = seg.Next() } } // NextSeg(): @@ -651,10 +665,12 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { // but there exists outstanding unSACKED data, we // provide the opportunity for a single "rescue" // retransmission per entry into loss recovery. If - // HighACK is greater than RescueRxt, the one - // segment of upto SMSS octects that MUST include - // the highest outstanding unSACKed sequence number - // SHOULD be returned. + // HighACK is greater than RescueRxt (or RescueRxt + // is undefined), then one segment of upto SMSS + // octects that MUST include the highest outstanding + // unSACKed sequence number SHOULD be returned, and + // RescueRxt set to RecoveryPoint. HighRxt MUST NOT + // be updated. if s.fr.rescueRxt.LessThan(s.sndUna - 1) { if s4 != nil { if s4.sequenceNumber.LessThan(segSeq) { @@ -663,12 +679,31 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { } else { s4 = seg } - s.fr.rescueRxt = s.fr.last } } } - return nil, s3, s4 + // If we got here then no segment matched step (1). + // Step (2): "If no sequence number 'S2' per rule (1) + // exists but there exists available unsent data and the + // receiver's advertised window allows, the sequence + // range of one segment of up to SMSS octets of + // previously unsent data starting with sequence number + // HighData+1 MUST be returned." + for seg := s.writeNext; seg != nil; seg = seg.Next() { + if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { + continue + } + // We do not split the segment here to <= smss as it has + // potentially not been assigned a sequence number yet. + return seg, nil, false + } + + if s3 != nil { + return s3, hint, false + } + + return s4, nil, true } // maybeSendSegment tries to send the specified segment and either coalesces @@ -792,64 +827,47 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // section 5, step C. func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { s.SetPipe() + + if smss := int(s.ep.scoreboard.SMSS()); limit > smss { + // Cap segment size limit to s.smss as SACK recovery requires + // that all retransmissions or new segments send during recovery + // be of <= SMSS. + limit = smss + } + + nextSegHint := s.writeList.Front() for s.outstanding < s.sndCwnd { - nextSeg, s3, s4 := s.NextSeg() - if nextSeg == nil { - // NextSeg(): - // - // Step (2): "If no sequence number 'S2' per rule (1) - // exists but there exists available unsent data and the - // receiver's advertised window allows, the sequence - // range of one segment of up to SMSS octets of - // previously unsent data starting with sequence number - // HighData+1 MUST be returned." - for seg := s.writeNext; seg != nil; seg = seg.Next() { - if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { - continue - } - // Step C.3 described below is handled by - // maybeSendSegment which increments sndNxt when - // a segment is transmitted. - // - // Step C.3 "If any of the data octets sent in - // (C.1) are above HighData, HighData must be - // updated to reflect the transmission of - // previously unsent data." - if sent := s.maybeSendSegment(seg, limit, end); !sent { - break - } - dataSent = true - s.outstanding++ - s.writeNext = seg.Next() - nextSeg = seg - break - } - if nextSeg != nil { - continue - } - } - rescueRtx := false - if nextSeg == nil && s3 != nil { - nextSeg = s3 - } - if nextSeg == nil && s4 != nil { - nextSeg = s4 - rescueRtx = true - } + var nextSeg *segment + var rescueRtx bool + nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint) if nextSeg == nil { - break + return dataSent } - segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) - if !rescueRtx && nextSeg.sequenceNumber.LessThan(s.sndNxt) { - // RFC 6675, Step C.2 + if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) { + // New data being sent. + + // Step C.3 described below is handled by + // maybeSendSegment which increments sndNxt when + // a segment is transmitted. // - // "If any of the data octets sent in (C.1) are below - // HighData, HighRxt MUST be set to the highest sequence - // number of the retransmitted segment unless NextSeg () - // rule (4) was invoked for this retransmission." - s.fr.highRxt = segEnd - 1 + // Step C.3 "If any of the data octets sent in + // (C.1) are above HighData, HighData must be + // updated to reflect the transmission of + // previously unsent data." + // + // We pass s.smss as the limit as the Step 2) requires that + // new data sent should be of size s.smss or less. + if sent := s.maybeSendSegment(nextSeg, limit, end); !sent { + return dataSent + } + dataSent = true + s.outstanding++ + s.writeNext = nextSeg.Next() + continue } + // Now handle the retransmission case where we matched either step 1,3 or 4 + // of the NextSeg algorithm. // RFC 6675, Step C.4. // // "The estimate of the amount of data outstanding in the network @@ -858,6 +876,22 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) s.outstanding++ dataSent = true s.sendSegment(nextSeg) + + segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) + if rescueRtx { + // We do the last part of rule (4) of NextSeg here to update + // RescueRxt as until this point we don't know if we are going + // to use the rescue transmission. + s.fr.rescueRxt = s.fr.last + } else { + // RFC 6675, Step C.2 + // + // "If any of the data octets sent in (C.1) are below + // HighData, HighRxt MUST be set to the highest sequence + // number of the retransmitted segment unless NextSeg () + // rule (4) was invoked for this retransmission." + s.fr.highRxt = segEnd - 1 + } } return dataSent } @@ -903,7 +937,7 @@ func (s *sender) sendData() { // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding // the retrasmission timeout." - if !s.fr.active && time.Now().Sub(s.lastSendTime) > s.rto { + if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto { if s.sndCwnd > InitialCwnd { s.sndCwnd = InitialCwnd } @@ -921,6 +955,9 @@ func (s *sender) sendData() { limit = cwndLimit } if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { + // Move writeNext along so that we don't try and scan data that + // has already been SACKED. + s.writeNext = seg.Next() continue } if sent := s.maybeSendSegment(seg, limit, end); !sent { @@ -966,6 +1003,8 @@ func (s *sender) enterFastRecovery() { s.fr.first = s.sndUna s.fr.last = s.sndNxt - 1 s.fr.maxCwnd = s.sndCwnd + s.outstanding + s.fr.highRxt = s.sndUna + s.fr.rescueRxt = s.sndUna if s.ep.sackPermitted { s.state = SACKRecovery s.ep.stack.Stats().TCP.SACKRecovery.Increment() @@ -1258,6 +1297,7 @@ func (s *sender) handleRcvdSegment(seg *segment) { if s.writeNext == seg { s.writeNext = seg.Next() } + s.writeList.Remove(seg) // if SACK is enabled then Only reduce outstanding if @@ -1329,7 +1369,23 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error { } seg.xmitTime = time.Now() seg.xmitCount++ - return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) + err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) + + // Every time a packet containing data is sent (including a + // retransmission), if SACK is enabled and we are retransmitting data + // then use the conservative timer described in RFC6675 Section 6.0, + // otherwise follow the standard time described in RFC6298 Section 5.1. + if err != nil && seg.data.Size() != 0 { + if s.fr.active && seg.xmitCount > 1 && s.ep.sackPermitted { + s.resendTimer.enable(s.rto) + } else { + if !s.resendTimer.enabled() { + s.resendTimer.enable(s.rto) + } + } + } + + return err } // sendSegmentFromView sends a new segment containing the given payload, flags @@ -1345,19 +1401,5 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq // Remember the max sent ack. s.maxSentAck = rcvNxt - // Every time a packet containing data is sent (including a - // retransmission), if SACK is enabled then use the conservative timer - // described in RFC6675 Section 4.0, otherwise follow the standard time - // described in RFC6298 Section 5.2. - if data.Size() != 0 { - if s.ep.sackPermitted { - s.resendTimer.enable(s.rto) - } else { - if !s.resendTimer.enabled() { - s.resendTimer.enable(s.rto) - } - } - } - return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 49e4ba214..d2c90ebd5 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -4905,6 +4905,8 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) { } for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { t.Parallel() @@ -5007,6 +5009,8 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) { } for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { t.Parallel() |