From b915a25597a961311ceb57f89d18eaee9c9461d8 Mon Sep 17 00:00:00 2001 From: Josh Gao Date: Thu, 13 Jun 2019 14:26:26 -0700 Subject: Fix use of "2 ^ 30". 2 ^ 30 is 28, not 1073741824. --- runsc/boot/loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'runsc') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c1dea736f..b2a91bec9 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -259,7 +259,7 @@ func New(args Args) (*Loader, error) { // Adjust the total memory returned by the Sentry so that applications that // use /proc/meminfo can make allocations based on this limit. usage.MinimumTotalMemoryBytes = args.TotalMem - log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30)) + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1 << 30)) } // Initiate the Kernel object, which is required by the Context passed -- cgit v1.2.3 From 8a625ceeb173307094e81d273458b6651e54220a Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 28 Jun 2019 11:48:27 -0700 Subject: runsc: allow openat for runsc-race I see that runsc-race is killed by SIGSYS, because openat isn't allowed by seccomp filters: 60052 openat(AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY|O_CLOEXEC 60052 <... openat resumed> ) = 257 60052 --- SIGSYS {si_signo=SIGSYS, si_code=SYS_SECCOMP, si_call_addr=0xfaacf1, si_syscall=__NR_openat, si_arch=AUDIT_ARCH_X86_64} --- PiperOrigin-RevId: 255640808 --- runsc/boot/filter/extra_filters_race.go | 1 + 1 file changed, 1 insertion(+) (limited to 'runsc') diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index d5bee4453..9ff80276a 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -33,6 +33,7 @@ func instrumentationFilters() seccomp.SyscallRules { syscall.SYS_MUNLOCK: {}, syscall.SYS_NANOSLEEP: {}, syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, syscall.SYS_SET_ROBUST_LIST: {}, // Used within glibc's malloc. syscall.SYS_TIME: {}, -- cgit v1.2.3 From 295078fa7ae7893f3deb0e6042e2aecfaba0dd1c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 28 Jun 2019 15:27:25 -0700 Subject: Automated rollback of changelist 255263686 PiperOrigin-RevId: 255679453 --- pkg/sentry/fs/inode_overlay.go | 6 ++++++ runsc/boot/fs.go | 13 ------------- test/syscalls/BUILD | 2 +- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'runsc') diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index b247fa514..24b769cfc 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -567,6 +567,12 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { if o.upper != nil { err = o.upper.check(ctx, p) } else { + if p.Write { + // Since writes will be redirected to the upper filesystem, the lower + // filesystem need not be writable, but must be readable for copy-up. + p.Write = false + p.Read = true + } err = o.lower.check(ctx, p) } o.copyMu.RUnlock() diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index af52286a6..9da0c7067 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -85,19 +85,6 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, if err != nil { return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } - - // Replicate permissions and owner from lower to upper mount point. - attr, err := lower.UnstableAttr(ctx) - if err != nil { - return nil, fmt.Errorf("reading attributes from lower mount point: %v", err) - } - if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) { - return nil, fmt.Errorf("error setting permission to upper mount point") - } - if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil { - return nil, fmt.Errorf("setting owner to upper mount point: %v", err) - } - return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index b5f89033b..4ac511740 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -240,7 +240,7 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:munmap_test") syscall_test( - add_overlay = True, + add_overlay = False, # TODO(gvisor.dev/issue/316): enable when fixed. test = "//test/syscalls/linux:open_create_test", ) -- cgit v1.2.3 From 45566fa4e4bc55e870ae1d7f80778be8432bdef5 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 28 Jun 2019 20:06:33 -0700 Subject: Add finalizer on AtomicRefCount to check for leaks. PiperOrigin-RevId: 255711454 --- pkg/refs/BUILD | 3 + pkg/refs/refcounter.go | 69 ++++++++++++++++++++++ pkg/sentry/fs/dirent.go | 4 +- pkg/sentry/fs/file.go | 5 +- pkg/sentry/fs/file_overlay.go | 5 +- pkg/sentry/fs/gofer/handles.go | 5 +- pkg/sentry/fs/gofer/path.go | 7 ++- pkg/sentry/fs/gofer/session.go | 7 ++- pkg/sentry/fs/gofer/session_state.go | 1 - pkg/sentry/fs/host/socket.go | 8 ++- pkg/sentry/fs/inode.go | 4 +- pkg/sentry/fs/mount.go | 4 +- pkg/sentry/fs/mounts.go | 6 +- pkg/sentry/fs/tty/terminal.go | 4 +- pkg/sentry/kernel/fd_map.go | 4 +- pkg/sentry/kernel/fs_context.go | 4 +- pkg/sentry/kernel/sessions.go | 16 +++-- pkg/sentry/kernel/shm/shm.go | 1 + pkg/sentry/mm/aio_context.go | 4 +- pkg/sentry/mm/special_mappable.go | 4 +- pkg/sentry/socket/unix/transport/connectioned.go | 4 ++ pkg/sentry/socket/unix/transport/connectionless.go | 4 +- pkg/sentry/socket/unix/unix.go | 7 ++- runsc/boot/BUILD | 1 + runsc/boot/loader.go | 6 ++ 25 files changed, 152 insertions(+), 35 deletions(-) (limited to 'runsc') diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 0652fbbe6..9c08452fc 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -24,6 +24,9 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/refs", visibility = ["//:sandbox"], + deps = [ + "//pkg/log", + ], ) go_test( diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 20f515391..6d6c3a782 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -18,8 +18,11 @@ package refs import ( "reflect" + "runtime" "sync" "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" ) // RefCounter is the interface to be implemented by objects that are reference @@ -189,6 +192,11 @@ type AtomicRefCount struct { // how these fields are used. refCount int64 + // name is the name of the type which owns this ref count. + // + // name is immutable after EnableLeakCheck is called. + name string + // mu protects the list below. mu sync.Mutex `state:"nosave"` @@ -196,6 +204,67 @@ type AtomicRefCount struct { weakRefs weakRefList `state:"nosave"` } +// LeakMode configures the leak checker. +type LeakMode uint32 + +const ( + // uninitializedLeakChecking indicates that the leak checker has not yet been initialized. + uninitializedLeakChecking LeakMode = iota + + // NoLeakChecking indicates that no effort should be made to check for + // leaks. + NoLeakChecking + + // LeaksLogWarning indicates that a warning should be logged when leaks + // are found. + LeaksLogWarning +) + +// leakMode stores the current mode for the reference leak checker. +// +// Values must be one of the LeakMode values. +// +// leakMode must be accessed atomically. +var leakMode uint32 + +// SetLeakMode configures the reference leak checker. +func SetLeakMode(mode LeakMode) { + atomic.StoreUint32(&leakMode, uint32(mode)) +} + +func (r *AtomicRefCount) finalize() { + var note string + switch LeakMode(atomic.LoadUint32(&leakMode)) { + case NoLeakChecking: + return + case uninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n) + } +} + +// EnableLeakCheck checks for reference leaks when the AtomicRefCount gets +// garbage collected. +// +// This function adds a finalizer to the AtomicRefCount, so the AtomicRefCount +// must be at the beginning of its parent. +// +// name is a friendly name that will be listed as the owner of the +// AtomicRefCount in logs. It should be the name of the parent type, including +// package. +func (r *AtomicRefCount) EnableLeakCheck(name string) { + if name == "" { + panic("invalid name") + } + if LeakMode(atomic.LoadUint32(&leakMode)) == NoLeakChecking { + return + } + r.name = name + runtime.SetFinalizer(r, (*AtomicRefCount).finalize) +} + // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *AtomicRefCount) ReadRefs() int64 { diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 28651e58b..fbca06761 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -229,11 +229,13 @@ func newDirent(inode *Inode, name string) *Dirent { if inode != nil { inode.MountSource.IncDirentRefs() } - return &Dirent{ + d := Dirent{ Inode: inode, name: name, children: make(map[string]*refs.WeakRef), } + d.EnableLeakCheck("fs.Dirent") + return &d } // NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8e1f5674d..bb8117f89 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -130,14 +130,15 @@ type File struct { // to false respectively. func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File { dirent.IncRef() - f := &File{ + f := File{ UniqueID: uniqueid.GlobalFromContext(ctx), Dirent: dirent, FileOperations: fops, flags: flags, } f.mu.Init() - return f + f.EnableLeakCheck("fs.File") + return &f } // DecRef destroys the File when it is no longer referenced. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index c6cbd5631..9820f0b13 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -347,13 +347,14 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt // preventing us from saving a proper inode mapping for the // file. file.IncRef() - id := &overlayMappingIdentity{ + id := overlayMappingIdentity{ id: opts.MappingIdentity, overlayFile: file, } + id.EnableLeakCheck("fs.overlayMappingIdentity") // Swap out the old MappingIdentity for the wrapped one. - opts.MappingIdentity = id + opts.MappingIdentity = &id return nil } diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index b87c4f150..27eeae3d9 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -79,11 +79,12 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han newFile.close(ctx) return nil, err } - h := &handles{ + h := handles{ File: newFile, Host: hostFile, } - return h, nil + h.EnableLeakCheck("gofer.handles") + return &h, nil } type handleReadWriter struct { diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index b91386909..8c17603f8 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -145,16 +145,17 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string defer d.DecRef() // Construct the new file, caching the handles if allowed. - h := &handles{ + h := handles{ File: newFile, Host: hostFile, } + h.EnableLeakCheck("gofer.handles") if iops.fileState.canShareHandles() { iops.fileState.handlesMu.Lock() - iops.fileState.setSharedHandlesLocked(flags, h) + iops.fileState.setSharedHandlesLocked(flags, &h) iops.fileState.handlesMu.Unlock() } - return NewFile(ctx, d, name, flags, iops, h), nil + return NewFile(ctx, d, name, flags, iops, &h), nil } // CreateLink uses Create to create a symlink between oldname and newname. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 9f7660ed1..69d08a627 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -241,7 +241,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF } // Construct the session. - s := &session{ + s := session{ connID: dev, msize: o.msize, version: o.version, @@ -250,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF superBlockFlags: superBlockFlags, mounter: mounter, } + s.EnableLeakCheck("gofer.session") if o.privateunixsocket { s.endpoints = newEndpointMaps() } // Construct the MountSource with the session and superBlockFlags. - m := fs.NewMountSource(ctx, s, filesystem, superBlockFlags) + m := fs.NewMountSource(ctx, &s, filesystem, superBlockFlags) // Given that gofer files can consume host FDs, restrict the number // of files that can be held by the cache. @@ -290,7 +291,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr, false) + sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false) return fs.NewInode(ctx, iops, m, sattr), nil } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 29a79441e..d045e04ff 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -111,5 +111,4 @@ func (s *session) afterLoad() { panic("failed to restore endpoint maps: " + err.Error()) } } - } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 7fedc88bc..44c4ee5f2 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -47,12 +47,12 @@ const maxSendBufferSize = 8 << 20 // // +stateify savable type ConnectedEndpoint struct { - queue *waiter.Queue - path string - // ref keeps track of references to a connectedEndpoint. ref refs.AtomicRefCount + queue *waiter.Queue + path string + // If srfd >= 0, it is the host FD that file was imported from. srfd int `state:"wait"` @@ -133,6 +133,8 @@ func NewConnectedEndpoint(ctx context.Context, file *fd.FD, queue *waiter.Queue, // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() + e.ref.EnableLeakCheck("host.ConnectedEndpoint") + return &e, nil } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index e4aae1135..f4ddfa406 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -86,12 +86,14 @@ type LockCtx struct { // NewInode takes a reference on msrc. func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode { msrc.IncRef() - return &Inode{ + i := Inode{ InodeOperations: iops, StableAttr: sattr, Watches: newWatches(), MountSource: msrc, } + i.EnableLeakCheck("fs.Inode") + return &i } // DecRef drops a reference on the Inode. diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 912495528..7a9692800 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -138,12 +138,14 @@ func NewMountSource(ctx context.Context, mops MountSourceOperations, filesystem if filesystem != nil { fsType = filesystem.Name() } - return &MountSource{ + msrc := MountSource{ MountSourceOperations: mops, Flags: flags, FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), } + msrc.EnableLeakCheck("fs.MountSource") + return &msrc } // DirentRefs returns the current mount direntRefs. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 281364dfc..ce7ffeed2 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -181,12 +181,14 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error d: newRootMount(1, d), } - return &MountNamespace{ + mns := MountNamespace{ userns: creds.UserNamespace, root: d, mounts: mnts, mountID: 2, - }, nil + } + mns.EnableLeakCheck("fs.MountNamespace") + return &mns, nil } // UserNamespace returns the user namespace associated with this mount manager. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 8290f2530..b7cecb2ed 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -38,9 +38,11 @@ type Terminal struct { func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { termios := linux.DefaultSlaveTermios - return &Terminal{ + t := Terminal{ d: d, n: n, ld: newLineDiscipline(termios), } + t.EnableLeakCheck("tty.Terminal") + return &t } diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index 786936a7d..1b84bfe14 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -98,11 +98,13 @@ func (f *FDMap) ID() uint64 { // NewFDMap allocates a new FDMap that may be used by tasks in k. func (k *Kernel) NewFDMap() *FDMap { - return &FDMap{ + f := FDMap{ k: k, files: make(map[kdefs.FD]descriptor), uid: atomic.AddUint64(&k.fdMapUids, 1), } + f.EnableLeakCheck("kernel.FDMap") + return &f } // destroy removes all of the file descriptors from the map. diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 938239aeb..ded27d668 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -51,11 +51,13 @@ type FSContext struct { func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { root.IncRef() cwd.IncRef() - return &FSContext{ + f := FSContext{ root: root, cwd: cwd, umask: umask, } + f.EnableLeakCheck("kernel.FSContext") + return &f } // destroy is the destructor for an FSContext. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 355984140..81fcd8258 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -294,6 +294,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } + s.refs.EnableLeakCheck("kernel.Session") // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -307,6 +308,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") // Tie them and return the result. s.processGroups.PushBack(pg) @@ -378,11 +380,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // We manually adjust the ancestors if the parent is in the same // session. tg.processGroup.session.incRef() - pg := &ProcessGroup{ + pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ } @@ -390,20 +394,20 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // Assign the new process group; adjust children. oldParentPG := tg.parentPG() tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { - childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.incRefWithParent(&pg) childTG.processGroup.decRefWithParent(oldParentPG) }) tg.processGroup.decRefWithParent(oldParentPG) - tg.processGroup = pg + tg.processGroup = &pg // Add the new process group to the session. - pg.session.processGroups.PushBack(pg) + pg.session.processGroups.PushBack(&pg) // Ensure this translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { local := ns.tgids[tg] - ns.pgids[pg] = ProcessGroupID(local) - ns.processGroups[ProcessGroupID(local)] = pg + ns.pgids[&pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = &pg } return nil diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 3e9fe70e2..5bd610f68 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -224,6 +224,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } + shm.EnableLeakCheck("kernel.Shm") // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index c9a942023..1b746d030 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -213,7 +213,9 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { if err != nil { return nil, err } - return &aioMappable{mfp: mfp, fr: fr}, nil + m := aioMappable{mfp: mfp, fr: fr} + m.EnableLeakCheck("mm.aioMappable") + return &m, nil } // DecRef implements refs.RefCounter.DecRef. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 2f8651a0a..ea2d7af74 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -45,7 +45,9 @@ type SpecialMappable struct { // // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { - return &SpecialMappable{mfp: mfp, fr: fr, name: name} + m := SpecialMappable{mfp: mfp, fr: fr, name: name} + m.EnableLeakCheck("mm.SpecialMappable") + return &m } // DecRef implements refs.RefCounter.DecRef. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index e4c416233..73d2df15d 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -143,7 +143,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} + q1.EnableLeakCheck("transport.queue") q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} + q2.EnableLeakCheck("transport.queue") if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -294,12 +296,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} + readQueue.EnableLeakCheck("transport.queue") ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} + writeQueue.EnableLeakCheck("transport.queue") if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index e987519f0..c591f261d 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -41,7 +41,9 @@ var ( // NewConnectionless creates a new unbound dgram endpoint. func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} - ep.receiver = &queueReceiver{readQueue: &queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}} + q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} + q.EnableLeakCheck("transport.queue") + ep.receiver = &queueReceiver{readQueue: &q} return ep } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 6190de0c5..bf7d2cfa2 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -69,10 +69,13 @@ func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) // NewWithDirent creates a new unix socket using an existing dirent. func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File { - return fs.NewFile(ctx, d, flags, &SocketOperations{ + s := SocketOperations{ ep: ep, stype: stype, - }) + } + s.EnableLeakCheck("unix.SocketOperations") + + return fs.NewFile(ctx, d, flags, &s) } // DecRef implements RefCounter.DecRef. diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 7ec15a524..d91f66d95 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/log", "//pkg/memutil", "//pkg/rand", + "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 41027416d..89f7d9f94 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs/host" @@ -1006,3 +1007,8 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host } return ep.tg, ep.tty, nil } + +func init() { + // TODO(gvisor.dev/issue/365): Make this configurable. + refs.SetLeakMode(refs.NoLeakChecking) +} -- cgit v1.2.3 From 4a72c8078ebd3e7b5982f0f7d9fcc4f4d9665ef8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Jul 2019 17:00:08 -0700 Subject: Use new location of python-hello image in tests. PiperOrigin-RevId: 256062988 --- runsc/test/image/image_test.go | 7 +++++-- runsc/test/integration/integration_test.go | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'runsc') diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 14cbd30c4..ddaa2c13b 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -209,11 +209,14 @@ func TestMysql(t *testing.T) { } func TestPythonHello(t *testing.T) { - if err := testutil.Pull("google/python-hello"); err != nil { + // TODO(b/136503277): Once we have more complete python runtime tests, + // we can drop this one. + const img = "gcr.io/gvisor-presubmit/python-hello" + if err := testutil.Pull(img); err != nil { t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("python-hello-test") - if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", img); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 55ebc2f5d..7cef4b9dd 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -86,16 +86,17 @@ func TestLifeCycle(t *testing.T) { } func TestPauseResume(t *testing.T) { + const img = "gcr.io/gvisor-presubmit/python-hello" if !testutil.IsPauseResumeSupported() { t.Log("Pause/resume is not supported, skipping test.") return } - if err := testutil.Pull("google/python-hello"); err != nil { + if err := testutil.Pull(img); err != nil { t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("pause-resume-test") - if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", img); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -149,15 +150,16 @@ func TestPauseResume(t *testing.T) { } func TestCheckpointRestore(t *testing.T) { + const img = "gcr.io/gvisor-presubmit/python-hello" if !testutil.IsPauseResumeSupported() { t.Log("Pause/resume is not supported, skipping test.") return } - if err := testutil.Pull("google/python-hello"); err != nil { + if err := testutil.Pull(img); err != nil { t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("save-restore-test") - if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", img); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() -- cgit v1.2.3 From 753da9604efc74dced3055bb2f5c6bef2d98fe6c Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 2 Jul 2019 19:27:51 -0700 Subject: Remove map from fd_map, change to fd_table. This renames FDMap to FDTable and drops the kernel.FD type, which had an entire package to itself and didn't serve much use (it was freely cast between types, and served as more of an annoyance than providing any protection.) Based on BenchmarkFDLookupAndDecRef-12, we can expect 5-10 ns per lookup operation, and 10-15 ns per concurrent lookup operation of savings. This also fixes two tangential usage issues with the FDMap. Namely, non-atomic use of NewFDFrom and associated calls to Remove (that are both racy and fail to drop the reference on the underlying file.) PiperOrigin-RevId: 256285890 --- pkg/refs/refcounter.go | 7 + pkg/sentry/control/BUILD | 1 - pkg/sentry/control/proc.go | 12 +- pkg/sentry/fs/README.md | 2 +- pkg/sentry/fs/g3doc/inotify.md | 4 +- pkg/sentry/fs/proc/BUILD | 1 - pkg/sentry/fs/proc/fds.go | 38 ++- pkg/sentry/fs/proc/task.go | 4 +- pkg/sentry/kernel/BUILD | 9 +- pkg/sentry/kernel/epoll/BUILD | 1 - pkg/sentry/kernel/epoll/epoll.go | 3 +- pkg/sentry/kernel/fd_map.go | 366 ---------------------------- pkg/sentry/kernel/fd_map_test.go | 136 ----------- pkg/sentry/kernel/fd_table.go | 380 ++++++++++++++++++++++++++++++ pkg/sentry/kernel/fd_table_test.go | 192 +++++++++++++++ pkg/sentry/kernel/fd_table_unsafe.go | 103 ++++++++ pkg/sentry/kernel/kdefs/BUILD | 10 - pkg/sentry/kernel/kdefs/kdefs.go | 20 -- pkg/sentry/kernel/kernel.go | 76 +++--- pkg/sentry/kernel/task.go | 65 ++++- pkg/sentry/kernel/task_clone.go | 40 ++-- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 4 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_start.go | 13 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/socket/BUILD | 1 - pkg/sentry/socket/control/BUILD | 1 - pkg/sentry/socket/control/control.go | 10 +- pkg/sentry/socket/epsocket/BUILD | 1 - pkg/sentry/socket/epsocket/epsocket.go | 8 +- pkg/sentry/socket/hostinet/BUILD | 1 - pkg/sentry/socket/hostinet/socket.go | 9 +- pkg/sentry/socket/netlink/BUILD | 1 - pkg/sentry/socket/netlink/socket.go | 3 +- pkg/sentry/socket/rpcinet/BUILD | 1 - pkg/sentry/socket/rpcinet/socket.go | 8 +- pkg/sentry/socket/socket.go | 3 +- pkg/sentry/socket/unix/BUILD | 1 - pkg/sentry/socket/unix/unix.go | 8 +- pkg/sentry/strace/BUILD | 1 - pkg/sentry/strace/poll.go | 3 +- pkg/sentry/strace/strace.go | 7 +- pkg/sentry/syscalls/BUILD | 1 - pkg/sentry/syscalls/epoll.go | 30 ++- pkg/sentry/syscalls/linux/BUILD | 1 - pkg/sentry/syscalls/linux/sys_aio.go | 9 +- pkg/sentry/syscalls/linux/sys_epoll.go | 7 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 5 +- pkg/sentry/syscalls/linux/sys_file.go | 170 ++++++------- pkg/sentry/syscalls/linux/sys_getdents.go | 9 +- pkg/sentry/syscalls/linux/sys_inotify.go | 15 +- pkg/sentry/syscalls/linux/sys_lseek.go | 5 +- pkg/sentry/syscalls/linux/sys_mmap.go | 5 +- pkg/sentry/syscalls/linux/sys_pipe.go | 23 +- pkg/sentry/syscalls/linux/sys_poll.go | 5 +- pkg/sentry/syscalls/linux/sys_prctl.go | 5 +- pkg/sentry/syscalls/linux/sys_read.go | 21 +- pkg/sentry/syscalls/linux/sys_socket.go | 90 ++++--- pkg/sentry/syscalls/linux/sys_splice.go | 25 +- pkg/sentry/syscalls/linux/sys_stat.go | 17 +- pkg/sentry/syscalls/linux/sys_sync.go | 17 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 13 +- pkg/sentry/syscalls/linux/sys_write.go | 21 +- runsc/boot/BUILD | 1 - runsc/boot/fds.go | 27 +-- runsc/boot/fs.go | 4 +- runsc/boot/loader.go | 22 +- 68 files changed, 1115 insertions(+), 993 deletions(-) delete mode 100644 pkg/sentry/kernel/fd_map.go delete mode 100644 pkg/sentry/kernel/fd_map_test.go create mode 100644 pkg/sentry/kernel/fd_table.go create mode 100644 pkg/sentry/kernel/fd_table_test.go create mode 100644 pkg/sentry/kernel/fd_table_unsafe.go delete mode 100644 pkg/sentry/kernel/kdefs/BUILD delete mode 100644 pkg/sentry/kernel/kdefs/kdefs.go (limited to 'runsc') diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 3dd6f0dd4..6ecbace9e 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -365,6 +365,8 @@ func (r *AtomicRefCount) ReadRefs() int64 { // // The sanity check here is limited to real references, since if they have // dropped beneath zero then the object should have been destroyed. +// +//go:nosplit func (r *AtomicRefCount) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { panic("Incrementing non-positive ref count") @@ -379,6 +381,8 @@ func (r *AtomicRefCount) IncRef() { // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to // distinguish other TryIncRef calls from genuine references held. +// +//go:nosplit func (r *AtomicRefCount) TryIncRef() bool { const speculativeRef = 1 << 32 v := atomic.AddInt64(&r.refCount, speculativeRef) @@ -420,6 +424,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) { // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // +//go:nosplit func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: @@ -455,6 +460,8 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { } // DecRef decrements this object's reference count. +// +//go:nosplit func (r *AtomicRefCount) DecRef() { r.DecRefWithDestructor(nil) } diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 5dccb8e3c..bf802d1b6 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/sentry/fs/host", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/state", diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 66a506584..6ae60c5cb 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -123,9 +122,8 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID // TTYFileOperations that wraps the TTY is also returned. func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { // Import file descriptors. - l := limits.NewLimitSet() - fdm := proc.Kernel.NewFDMap() - defer fdm.DecRef() + fdTable := proc.Kernel.NewFDTable() + defer fdTable.DecRef() // No matter what happens, we should close all files in the FilePayload // before returning. Any files that are imported will be duped. @@ -149,9 +147,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI WorkingDirectory: args.WorkingDirectory, Root: args.Root, Credentials: creds, - FDMap: fdm, + FDTable: fdTable, Umask: 0022, - Limits: l, + Limits: limits.NewLimitSet(), MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), @@ -212,7 +210,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } // Add the file to the FD map. - if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { return nil, 0, nil, err } } diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index c4e8faa3c..db4a1b730 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -69,7 +69,7 @@ Specifically this state is: - An `fs.MountManager` containing mount points. -- A `kernel.FDMap` containing pointers to open files. +- A `kernel.FDTable` containing pointers to open files. Anything else managed by the VFS that can be easily loaded into memory from a filesystem is synced back to those filesystems and is not saved. Examples are diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md index e1630553b..71a577d9d 100644 --- a/pkg/sentry/fs/g3doc/inotify.md +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -9,7 +9,7 @@ For the most part, the sentry implementation of inotify mirrors the Linux architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are backed by a pseudo-filesystem. Events are generated from various places in the sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and -the [process fd table][fd_map]. Watches are stored in inodes and generated +the [process fd table][fd_table]. Watches are stored in inodes and generated events are queued to the inotify instance owning the watches for delivery to the user. @@ -114,7 +114,7 @@ ordering. [dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go [event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go -[fd_map]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_map.go +[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go [inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go [inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go [inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index da41a10ab..70ed854a8 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index ea7aded9a..bee421d76 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF var file *fs.File var fdFlags kernel.FDFlags t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - file, fdFlags = fdm.GetDescriptor(kdefs.FD(n)) + if fdTable := t.FDTable(); fdTable != nil { + file, fdFlags = fdTable.Get(int32(n)) } }) if file == nil { @@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { - var fds kernel.FDs + var fds []int32 t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.GetFDs() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() } }) - fdInts := make([]int, 0, len(fds)) - for _, fd := range fds { - fdInts = append(fdInts, int(fd)) - } - - // Find the fd to start at. - idx := sort.SearchInts(fdInts, int(offset)) - if idx == len(fdInts) { + // Find the appropriate starting point. + idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) }) + if idx == len(fds) { return offset, nil } - fdInts = fdInts[idx:] + fds = fds[idx:] - var fd int - for _, fd = range fdInts { + // Serialize all FDs. + for _, fd := range fds { name := strconv.FormatUint(uint64(fd), 10) - if err := c.DirEmit(name, toDentAttr(fd)); err != nil { + if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil { // Returned offset is the next fd to serialize. return int64(fd), err } } // We serialized them all. Next offset should be higher than last // serialized fd. - return int64(fd + 1), nil + return int64(fds[len(fds)-1] + 1), nil } // fd implements fs.InodeOperations for a file in /proc/TID/fd/. @@ -154,9 +148,9 @@ func (f *fd) Close() error { type fdDir struct { ramfs.Dir - // We hold a reference on the task's fdmap but only keep an indirect - // task pointer to avoid Dirent loading circularity caused by fdmap's - // potential back pointers into the dirent tree. + // We hold a reference on the task's FDTable but only keep an indirect + // task pointer to avoid Dirent loading circularity caused by the + // table's back pointers into the dirent tree. t *kernel.Task } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index b2e36aeee..ef0ca3301 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -580,8 +580,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( var fds int var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.Size() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c172d399e..7b92f1b8d 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -96,7 +96,8 @@ go_library( srcs = [ "abstract_socket_namespace.go", "context.go", - "fd_map.go", + "fd_table.go", + "fd_table_unsafe.go", "fs_context.go", "ipc_namespace.go", "kernel.go", @@ -179,7 +180,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", @@ -214,7 +214,7 @@ go_test( name = "kernel_test", size = "small", srcs = [ - "fd_map_test.go", + "fd_table_test.go", "table_test.go", "task_test.go", "timekeeper_test.go", @@ -223,9 +223,10 @@ go_test( deps = [ "//pkg/abi", "//pkg/sentry/arch", + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "//pkg/sentry/fs/filetest", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", "//pkg/sentry/pgalloc", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index fb99cfc8f..f46c43128 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 33c7dccae..9c0a4e1b4 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -61,7 +60,7 @@ const ( // +stateify savable type FileIdentifier struct { File *fs.File `state:"wait"` - Fd kdefs.FD + Fd int32 } // pollEntry holds all the state associated with an event poll entry, that is, diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go deleted file mode 100644 index 1b84bfe14..000000000 --- a/pkg/sentry/kernel/fd_map.go +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "bytes" - "fmt" - "sort" - "sync" - "sync/atomic" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" -) - -// FDs is an ordering of FD's that can be made stable. -type FDs []kdefs.FD - -func (f FDs) Len() int { - return len(f) -} - -func (f FDs) Swap(i, j int) { - f[i], f[j] = f[j], f[i] -} - -func (f FDs) Less(i, j int) bool { - return f[i] < f[j] -} - -// FDFlags define flags for an individual descriptor. -// -// +stateify savable -type FDFlags struct { - // CloseOnExec indicates the descriptor should be closed on exec. - CloseOnExec bool -} - -// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags -// representation. -func (f FDFlags) ToLinuxFileFlags() (mask uint) { - if f.CloseOnExec { - mask |= linux.O_CLOEXEC - } - return -} - -// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags -// representation. -func (f FDFlags) ToLinuxFDFlags() (mask uint) { - if f.CloseOnExec { - mask |= linux.FD_CLOEXEC - } - return -} - -// descriptor holds the details about a file descriptor, namely a pointer the -// file itself and the descriptor flags. -// -// +stateify savable -type descriptor struct { - file *fs.File - flags FDFlags -} - -// FDMap is used to manage File references and flags. -// -// +stateify savable -type FDMap struct { - refs.AtomicRefCount - k *Kernel - files map[kdefs.FD]descriptor - mu sync.RWMutex `state:"nosave"` - uid uint64 -} - -// ID returns a unique identifier for this FDMap. -func (f *FDMap) ID() uint64 { - return f.uid -} - -// NewFDMap allocates a new FDMap that may be used by tasks in k. -func (k *Kernel) NewFDMap() *FDMap { - f := FDMap{ - k: k, - files: make(map[kdefs.FD]descriptor), - uid: atomic.AddUint64(&k.fdMapUids, 1), - } - f.EnableLeakCheck("kernel.FDMap") - return &f -} - -// destroy removes all of the file descriptors from the map. -func (f *FDMap) destroy() { - f.RemoveIf(func(*fs.File, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDMap) DecRef() { - f.DecRefWithDestructor(f.destroy) -} - -// Size returns the number of file descriptor slots currently allocated. -func (f *FDMap) Size() int { - f.mu.RLock() - defer f.mu.RUnlock() - - return len(f.files) -} - -// String is a stringer for FDMap. -func (f *FDMap) String() string { - f.mu.RLock() - defer f.mu.RUnlock() - - var b bytes.Buffer - for k, v := range f.files { - n, _ := v.file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) - } - return b.String() -} - -// NewFDFrom allocates a new FD guaranteed to be the lowest number available -// greater than or equal to from. This property is important as Unix programs -// tend to count on this allocation order. -func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { - if fd < 0 { - // Don't accept negative FDs. - return 0, syscall.EINVAL - } - - f.mu.Lock() - defer f.mu.Unlock() - - // Finds the lowest fd not in the handles map. - lim := limitSet.Get(limits.NumberOfFiles) - for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { - if _, ok := f.files[i]; !ok { - file.IncRef() - f.files[i] = descriptor{file, flags} - return i, nil - } - } - - return -1, syscall.EMFILE -} - -// NewFDAt sets the file reference for the given FD. If there is an -// active reference for that FD, the ref count for that existing reference -// is decremented. -func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { - if fd < 0 { - // Don't accept negative FDs. - return syscall.EBADF - } - - // In this one case we do not do a defer of the Unlock. The - // reason is that we must have done all the work needed for - // discarding any old open file before we return to the - // caller. In other words, the DecRef(), below, must have - // completed by the time we return to the caller to ensure - // side effects are, in fact, effected. A classic example is - // dup2(fd1, fd2); if fd2 was already open, it must be closed, - // and we don't want to resume the caller until it is; we have - // to block on the DecRef(). Hence we can not just do a 'go - // oldfile.DecRef()', since there would be no guarantee that - // it would be done before we the caller resumed. Since we - // must wait for the DecRef() to finish, and that could take - // time, it's best to first call f.muUnlock beore so we are - // not blocking other uses of this FDMap on the DecRef() call. - f.mu.Lock() - oldDesc, oldExists := f.files[fd] - lim := limitSet.Get(limits.NumberOfFiles).Cur - // if we're closing one then the effective limit is one - // more than the actual limit. - if oldExists && lim != limits.Infinity { - lim++ - } - if lim != limits.Infinity && fd >= kdefs.FD(lim) { - f.mu.Unlock() - return syscall.EMFILE - } - - file.IncRef() - f.files[fd] = descriptor{file, flags} - f.mu.Unlock() - - if oldExists { - oldDesc.file.DecRef() - } - return nil -} - -// SetFlags sets the flags for the given file descriptor, if it is valid. -func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { - f.mu.Lock() - defer f.mu.Unlock() - - desc, ok := f.files[fd] - if !ok { - return - } - - f.files[fd] = descriptor{desc.file, flags} -} - -// GetDescriptor returns a reference to the file and the flags for the FD. It -// bumps its reference count as well. It returns nil if there is no File -// for the FD, i.e. if the FD is invalid. The caller must use DecRef -// when they are done. -func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { - f.mu.RLock() - defer f.mu.RUnlock() - - if desc, ok := f.files[fd]; ok { - desc.file.IncRef() - return desc.file, desc.flags - } - return nil, FDFlags{} -} - -// GetFile returns a reference to the File for the FD and bumps -// its reference count as well. It returns nil if there is no File -// for the FD, i.e. if the FD is invalid. The caller must use DecRef -// when they are done. -func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { - f.mu.RLock() - if desc, ok := f.files[fd]; ok { - desc.file.IncRef() - f.mu.RUnlock() - return desc.file - } - f.mu.RUnlock() - return nil -} - -// fds returns an ordering of FDs. -func (f *FDMap) fds() FDs { - fds := make(FDs, 0, len(f.files)) - for fd := range f.files { - fds = append(fds, fd) - } - sort.Sort(fds) - return fds -} - -// GetFDs returns a list of valid fds. -func (f *FDMap) GetFDs() FDs { - f.mu.RLock() - defer f.mu.RUnlock() - return f.fds() -} - -// GetRefs returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDMap) GetRefs() []*fs.File { - f.mu.RLock() - defer f.mu.RUnlock() - - fds := f.fds() - fs := make([]*fs.File, 0, len(fds)) - for _, fd := range fds { - desc := f.files[fd] - desc.file.IncRef() - fs = append(fs, desc.file) - } - return fs -} - -// Fork returns an independent FDMap pointing to the same descriptors. -func (f *FDMap) Fork() *FDMap { - f.mu.RLock() - defer f.mu.RUnlock() - - clone := f.k.NewFDMap() - - // Grab a extra reference for every file. - for fd, desc := range f.files { - desc.file.IncRef() - clone.files[fd] = desc - } - - // That's it! - return clone -} - -// unlock releases all file locks held by this FDMap's uid. Must only be -// called on a non-nil *fs.File. -func (f *FDMap) unlock(file *fs.File) { - id := lock.UniqueID(f.ID()) - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) -} - -// inotifyFileClose generates the appropriate inotify events for f being closed. -func inotifyFileClose(f *fs.File) { - var ev uint32 - d := f.Dirent - - if fs.IsDir(d.Inode.StableAttr) { - ev |= linux.IN_ISDIR - } - - if f.Flags().Write { - ev |= linux.IN_CLOSE_WRITE - } else { - ev |= linux.IN_CLOSE_NOWRITE - } - - d.InotifyEvent(ev, 0) -} - -// Remove removes an FD from the FDMap, and returns (File, true) if a File -// one was found. Callers are expected to decrement the reference count on -// the File. Otherwise returns (nil, false). -func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { - f.mu.Lock() - desc := f.files[fd] - delete(f.files, fd) - f.mu.Unlock() - if desc.file != nil { - f.unlock(desc.file) - inotifyFileClose(desc.file) - return desc.file, true - } - return nil, false -} - -// RemoveIf removes all FDs where cond is true. -func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { - var removed []*fs.File - f.mu.Lock() - for fd, desc := range f.files { - if desc.file != nil && cond(desc.file, desc.flags) { - delete(f.files, fd) - removed = append(removed, desc.file) - } - } - f.mu.Unlock() - - for _, file := range removed { - f.unlock(file) - inotifyFileClose(file) - file.DecRef() - } -} diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go deleted file mode 100644 index 8571dbe59..000000000 --- a/pkg/sentry/kernel/fd_map_test.go +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/sentry/fs/filetest" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" -) - -const ( - // maxFD is the maximum FD to try to create in the map. - // This number of open files has been seen in the wild. - maxFD = 2 * 1024 -) - -func newTestFDMap() *FDMap { - return &FDMap{ - files: make(map[kdefs.FD]descriptor), - } -} - -// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, -// until there is no room, then makes sure that NewFDAt works -// and also that if we remove one and add one that works too. -func TestFDMapMany(t *testing.T) { - file := filetest.NewTestFile(t) - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) - - f := newTestFDMap() - for i := 0; i < maxFD; i++ { - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) - } - } - - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error") - } - - if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) - } -} - -// TestFDMap does a set of simple tests to make sure simple adds, -// removes, GetRefs, and DecRefs work. The ordering is just weird -// enough that a table-driven approach seemed clumsy. -func TestFDMap(t *testing.T) { - file := filetest.NewTestFile(t) - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true /* privileged */) - - f := newTestFDMap() - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) - } - - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") - } - - largeLimit := limits.Limit{maxFD, maxFD} - limitSet.Set(limits.NumberOfFiles, largeLimit, true /* privileged */) - - if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) - } else if fd != kdefs.FD(1) { - t.Fatalf("Added an FD to a resized map: got %v, want 1", fd) - } - - if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) - } - - if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) - } - - if ref := f.GetFile(1); ref == nil { - t.Fatalf("f.GetFile(1): got nil, wanted %v", file) - } - - if ref := f.GetFile(2); ref != nil { - t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref) - } - - ref, ok := f.Remove(1) - if !ok { - t.Fatalf("f.Remove(1) for an existing FD: failed, want success") - } - ref.DecRef() - - if ref, ok := f.Remove(1); ok { - ref.DecRef() - t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") - } - -} - -func TestDescriptorFlags(t *testing.T) { - file := filetest.NewTestFile(t) - f := newTestFDMap() - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) - - origFlags := FDFlags{CloseOnExec: true} - - if err := f.NewFDAt(2, file, origFlags, limitSet); err != nil { - t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) - } - - newFile, newFlags := f.GetDescriptor(2) - if newFile == nil { - t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) - } - - if newFlags != origFlags { - t.Fatalf("new File flags %+v don't match original %+v", newFlags, origFlags) - } -} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go new file mode 100644 index 000000000..1f3a57dc1 --- /dev/null +++ b/pkg/sentry/kernel/fd_table.go @@ -0,0 +1,380 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "math" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/limits" +) + +// FDFlags define flags for an individual descriptor. +// +// +stateify savable +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + +// descriptor holds the details about a file descriptor, namely a pointer to +// the file itself and the descriptor flags. +// +// Note that this is immutable and can only be changed via operations on the +// descriptorTable. +// +// +stateify savable +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDTable is used to manage File references and flags. +// +// +stateify savable +type FDTable struct { + refs.AtomicRefCount + k *Kernel + + // uid is a unique identifier. + uid uint64 + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // used contains the number of non-nil entries. + used int32 + + // descriptorTable holds descriptors. + descriptorTable `state:".(map[int32]descriptor)"` +} + +func (f *FDTable) saveDescriptorTable() map[int32]descriptor { + m := make(map[int32]descriptor) + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + m[fd] = descriptor{ + file: file, + flags: flags, + } + }) + return m +} + +func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + f.init() // Initialize table. + for fd, d := range m { + f.set(fd, d.file, d.flags) + + // Note that we do _not_ need to acquire a extra table + // reference here. The table reference will already be + // accounted for in the file, so we drop the reference taken by + // set above. + d.file.DecRef() + } +} + +// drop drops the table reference. +func (f *FDTable) drop(file *fs.File) { + // Release locks. + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + + // Send inotify events. + d := file.Dirent + var ev uint32 + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + if file.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + d.InotifyEvent(ev, 0) + + // Drop the table reference. + file.DecRef() +} + +// ID returns a unique identifier for this FDTable. +func (f *FDTable) ID() uint64 { + return f.uid +} + +// NewFDTable allocates a new FDTable that may be used by tasks in k. +func (k *Kernel) NewFDTable() *FDTable { + f := &FDTable{ + k: k, + uid: atomic.AddUint64(&k.fdMapUids, 1), + } + f.init() + return f +} + +// destroy removes all of the file descriptors from the map. +func (f *FDTable) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDTable) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDTable) Size() int { + size := atomic.LoadInt32(&f.used) + return int(size) +} + +// forEach iterates over all non-nil files. +// +// It is the caller's responsibility to acquire an appropriate lock. +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { + fd := int32(0) + for { + file, flags, ok := f.get(fd) + if !ok { + break + } + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + fn(int32(fd), file, flags) + file.DecRef() + } + fd++ + } +} + +// String is a stringer for FDTable. +func (f *FDTable) String() string { + var b bytes.Buffer + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + n, _ := file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + }) + return b.String() +} + +// NewFDs allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.get(i); d == nil { + f.set(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.set(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + return fds, nil +} + +// NewFDAt sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Check the limit for the provided file. + if limitSet := limits.FromContext(ctx); limitSet != nil { + if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { + return syscall.EMFILE + } + } + + // Install the entry. + f.set(fd, file, flags) + return nil +} + +// SetFlags sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.get(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.set(fd, file, flags) + return nil +} + +// Get returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.get(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetFDs returns a list of valid fds. +func (f *FDTable) GetFDs() []int32 { + fds := make([]int32, 0, f.used) + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + fds = append(fds, fd) + }) + return fds +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefs() []*fs.File { + files := make([]*fs.File, 0, f.Size()) + f.forEach(func(_ int32, file *fs.File, flags FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// Fork returns an independent FDTable. +func (f *FDTable) Fork() *FDTable { + clone := f.k.NewFDTable() + + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + // The set function here will acquire an appropriate table + // reference for the clone. We don't need anything else. + clone.set(fd, file, flags) + }) + return clone +} + +// Remove removes an FD from and returns a non-file iff successful. +// +// N.B. Callers are required to use DecRef when they are done. +func (f *FDTable) Remove(fd int32) *fs.File { + if fd < 0 { + return nil + } + + f.mu.Lock() + defer f.mu.Unlock() + + orig, _, _ := f.get(fd) + if orig != nil { + orig.IncRef() // Reference for caller. + f.set(fd, nil, FDFlags{}) // Zap entry. + } + return orig +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { + f.mu.Lock() + defer f.mu.Unlock() + + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + if cond(file, flags) { + f.set(fd, nil, FDFlags{}) // Clear from table. + } + }) +} diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go new file mode 100644 index 000000000..2413788e7 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_test.go @@ -0,0 +1,192 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "sync" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/sentry/limits" +) + +const ( + // maxFD is the maximum FD to try to create in the map. + // + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) { + t.Helper() // Don't show in stacks. + + // Create the limits and context. + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + + // Create a test file.; + file := filetest.NewTestFile(t) + + // Create the table. + fdTable := new(FDTable) + fdTable.init() + + // Run the test. + fn(ctx, fdTable, file, limitSet) +} + +// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there +// is no room, then makes sure that NewFDAt works and also that if we remove +// one and add one that works too. +func TestFDTableMany(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + for i := 0; i < maxFD; i++ { + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error") + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + }) +} + +// TestFDTable does a set of simple tests to make sure simple adds, removes, +// GetRefs, and DecRefs work. The ordering is just weird enough that a +// table-driven approach seemed clumsy. +func TestFDTable(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) { + // Cap the limit at one. + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true) + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + // Remove the previous limit. + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 1 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil { + t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref, _ := fdTable.Get(1); ref == nil { + t.Fatalf("fdTable.Get(1): got nil, wanted %v", file) + } + + if ref, _ := fdTable.Get(2); ref != nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) + } + + ref := fdTable.Remove(1) + if ref == nil { + t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref := fdTable.Remove(1); ref != nil { + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + }) +} + +func TestDescriptorFlags(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil { + t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := fdTable.Get(2) + if newFile == nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %v don't match original %d\n", flags, 0) + } + }) +} + +func BenchmarkFDLookupAndDecRef(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + b.StartTimer() // Benchmark. + for i := 0; i < b.N; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }) +} + +func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + concurrency := runtime.GOMAXPROCS(0) + if concurrency < 4 { + concurrency = 4 + } + each := b.N / concurrency + + b.StartTimer() // Benchmark. + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < each; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }() + } + wg.Wait() + }) +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go new file mode 100644 index 000000000..e009df974 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +type descriptorTable struct { + // slice is a *[]unsafe.Pointer, where each element is actually + // *descriptor object, updated atomically. + // + // Changes to the slice itself requiring holding FDTable.mu. + slice unsafe.Pointer `state:".(map[int32]*descriptor)"` +} + +// init initializes the table. +func (f *FDTable) init() { + var slice []unsafe.Pointer // Empty slice. + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) +} + +// get gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + if fd >= int32(len(slice)) { + return nil, FDFlags{}, false + } + d := (*descriptor)(atomic.LoadPointer(&slice[fd])) + if d == nil { + return nil, FDFlags{}, true + } + return d.file, d.flags, true +} + +// set sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + + // Grow the table as required. + if last := int32(len(slice)); fd >= last { + end := fd + 1 + if end < 2*last { + end = 2 * last + } + slice = append(slice, make([]unsafe.Pointer, end-last)...) + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + } + + // Create the new element. + var d *descriptor + if file != nil { + d = &descriptor{ + file: file, + flags: flags, + } + } + + // Update the single element. + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d))) + + // Acquire a table reference. + if file != nil && (orig == nil || file != orig.file) { + file.IncRef() + } + + // Drop the table reference. + if orig != nil && file != orig.file { + f.drop(orig.file) + } + + // Adjust used. + switch { + case orig == nil && file != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && file == nil: + atomic.AddInt32(&f.used, -1) + } +} diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD deleted file mode 100644 index 5d62f406a..000000000 --- a/pkg/sentry/kernel/kdefs/BUILD +++ /dev/null @@ -1,10 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "kdefs", - srcs = ["kdefs.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs", - visibility = ["//:sandbox"], -) diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go deleted file mode 100644 index 304da2032..000000000 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package kdefs defines common kernel definitions. -// -package kdefs - -// FD is a File Descriptor. -type FD int32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 47dadc43a..38b49cba2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -155,7 +155,7 @@ type Kernel struct { // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` - // fdMapUids is an ever-increasing counter for generating FDMap uids. + // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. fdMapUids uint64 @@ -400,8 +400,8 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(desc descriptor) error { - desc.file.Dirent.Inode.MountSource.FlushDirentRefs() + return k.tasks.forEachFDPaused(func(file *fs.File) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } @@ -410,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error { // task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if t.fds == nil { + if t.fdTable == nil { continue } - for _, desc := range t.fds.files { - if err := f(desc); err != nil { - return err + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if lastErr := f(file); lastErr != nil && err == nil { + err = lastErr } - } + }) } - return nil + return err } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(desc descriptor) error { - if flags := desc.file.Flags(); !flags.Write { + return ts.forEachFDPaused(func(file *fs.File) error { + if flags := file.Flags(); !flags.Write { return nil } - if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { return nil } // Here we need all metadata synced. - syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { - name, _ := desc.file.Dirent.FullName(nil /* root */) + name, _ := file.Dirent.FullName(nil /* root */) // Wrap this error in ErrSaveRejection // so that it will trigger a save // error, rather than a panic. This @@ -483,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() { defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if fdmap := t.fds; fdmap != nil { - for _, desc := range fdmap.files { - if desc.file != nil { - if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() } - } + }) } } } @@ -538,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { } log.Infof("Memory load took [%s].", time.Since(memoryStart)) + log.Infof("Overall load took [%s]", time.Since(loadStart)) + // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening @@ -602,9 +602,9 @@ type CreateProcessArgs struct { // Credentials is the initial credentials. Credentials *auth.Credentials - // FDMap is the initial set of file descriptors. If CreateProcess succeeds, - // it takes a reference on FDMap. - FDMap *FDMap + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable // Umask is the initial umask. Umask uint @@ -789,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, errors.New(se.String()) } - // Take a reference on the FDMap, which will be transferred to + // Take a reference on the FDTable, which will be transferred to // TaskSet.NewTask(). - args.FDMap.IncRef() + args.FDTable.IncRef() // Create the task. config := &TaskConfig{ @@ -799,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, ThreadGroup: tg, TaskContext: tc, FSContext: newFSContext(root, wd, args.Umask), - FDMap: args.FDMap, + FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, @@ -871,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() { } // By precondition, nothing else can be interacting with PIDNamespace.tids - // or FDMap.files, so we can iterate them without synchronization. (We + // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) @@ -882,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() { it.PauseTimer() } } - // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } - } + }) } } k.timekeeper.PauseUpdates() @@ -914,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } - } + }) } } } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2e3a39d3b..e91f82bb3 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -236,15 +236,15 @@ type Task struct { // tc is protected by mu, and is owned by the task goroutine. tc TaskContext - // fsc is the task's filesystem context. + // fsContext is the task's filesystem context. // - // fsc is protected by mu, and is owned by the task goroutine. - fsc *FSContext + // fsContext is protected by mu, and is owned by the task goroutine. + fsContext *FSContext - // fds is the task's file descriptor table. + // fdTable is the task's file descriptor table. // - // fds is protected by mu, and is owned by the task goroutine. - fds *FDMap + // fdTable is protected by mu, and is owned by the task goroutine. + fdTable *FDTable // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when @@ -602,7 +602,7 @@ func (t *Task) Value(key interface{}) interface{} { case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) case fs.CtxRoot: - return t.fsc.RootDirectory() + return t.fsContext.RootDirectory() case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: @@ -668,7 +668,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { realRoot := t.tg.mounts.Root() defer realRoot.DecRef() - root := t.fsc.RootDirectory() + root := t.fsContext.RootDirectory() if root != nil { defer root.DecRef() } @@ -689,16 +689,55 @@ func (t *Task) TaskContext() *TaskContext { // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) FSContext() *FSContext { - return t.fsc + return t.fsContext } -// FDMap returns t's FDMap. FDMap does not take an additional reference on the -// returned FDMap. +// FDTable returns t's FDTable. FDMTable does not take an additional reference +// on the returned FDMap. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. -func (t *Task) FDMap() *FDMap { - return t.fds +func (t *Task) FDTable() *FDTable { + return t.fdTable +} + +// GetFile is a convenience wrapper t.FDTable().GetFile. +// +// Precondition: same as FDTable. +func (t *Task) GetFile(fd int32) *fs.File { + f, _ := t.fdTable.Get(fd) + return f +} + +// NewFDs is a convenience wrapper for t.FDTable().NewFDs. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) { + return t.fdTable.NewFDs(t, fd, files, flags) +} + +// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) { + fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags) + if err != nil { + return 0, err + } + return fds[0], nil +} + +// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { + return t.fdTable.NewFDAt(t, fd, file, flags) } // WithMuLocked executes f with t.mu locked. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index b5cc3860d..0916fd658 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -214,20 +214,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } } - var fsc *FSContext + var fsContext *FSContext if opts.NewFSContext { - fsc = t.fsc.Fork() + fsContext = t.fsContext.Fork() } else { - fsc = t.fsc - fsc.IncRef() + fsContext = t.fsContext + fsContext.IncRef() } - var fds *FDMap + var fdTable *FDTable if opts.NewFiles { - fds = t.fds.Fork() + fdTable = t.fdTable.Fork() } else { - fds = t.fds - fds.IncRef() + fdTable = t.fdTable + fdTable.IncRef() } pidns := t.tg.pidns @@ -251,8 +251,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ThreadGroup: tg, SignalMask: t.SignalMask(), TaskContext: tc, - FSContext: fsc, - FDMap: fds, + FSContext: fsContext, + FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), NetworkNamespaced: t.netns, @@ -485,22 +485,22 @@ func (t *Task) Unshare(opts *SharingOptions) error { // namespace" t.ipcns = NewIPCNamespace(creds.UserNamespace) } - var oldfds *FDMap + var oldFDTable *FDTable if opts.NewFiles { - oldfds = t.fds - t.fds = oldfds.Fork() + oldFDTable = t.fdTable + t.fdTable = oldFDTable.Fork() } - var oldfsc *FSContext + var oldFSContext *FSContext if opts.NewFSContext { - oldfsc = t.fsc - t.fsc = oldfsc.Fork() + oldFSContext = t.fsContext + t.fsContext = oldFSContext.Fork() } t.mu.Unlock() - if oldfds != nil { - oldfds.DecRef() + if oldFDTable != nil { + oldFDTable.DecRef() } - if oldfsc != nil { - oldfsc.DecRef() + if oldFSContext != nil { + oldFSContext.DecRef() } return nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index cd85acaef..17a089b90 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -195,7 +195,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. - t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b97d65185..535f03e50 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -265,8 +265,8 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsc.DecRef() - t.fds.DecRef() + t.fsContext.DecRef() + t.fdTable.DecRef() // If this is the last task to exit from the thread group, release the // thread group's resources. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index cf48663b6..a29e9b9eb 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() { if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } - t.Debugf("FDMap:\n%s", t.fds) + t.Debugf("FDTable:\n%s", t.fdTable) } // debugDumpRegisters logs register state at log level debug. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 72caae537..a88bf3951 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -52,9 +52,10 @@ type TaskConfig struct { // succeeds. FSContext *FSContext - // FDMap is the FDMap of the new task. A reference must be held on FDMap, - // which is transferred to TaskSet.NewTask whether or not it succeeds. - FDMap *FDMap + // FDTable is the FDTableof the new task. A reference must be held on + // FDMap, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FDTable *FDTable // Credentials is the Credentials of the new task. Credentials *auth.Credentials @@ -90,7 +91,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { if err != nil { cfg.TaskContext.release() cfg.FSContext.DecRef() - cfg.FDMap.DecRef() + cfg.FDTable.DecRef() return nil, err } return t, nil @@ -112,8 +113,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { signalMask: cfg.SignalMask, signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, tc: *tc, - fsc: cfg.FSContext, - fds: cfg.FDMap, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), k: cfg.Kernel, ptraceTracees: make(map[*Task]struct{}), diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index edfdac2a7..baa12d9a0 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -54,7 +54,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // openPath opens name for loading. // // openPath returns the fs.Dirent and an *fs.File for name, which is not -// installed in the Task FDMap. The caller takes ownership of both. +// installed in the Task FDTable. The caller takes ownership of both. // // name must be a readable, executable, regular file. func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) { diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 7a24d4806..2b03ea87c 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -14,7 +14,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 39de46c39..81dbd7309 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -17,7 +17,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index b646dc258..4f4a20dfe 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -63,7 +62,7 @@ type RightsFiles []*fs.File func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { files := make(RightsFiles, 0, len(fds)) for _, fd := range fds { - file, _ := t.FDMap().GetDescriptor(kdefs.FD(fd)) + file := t.GetFile(fd) if file == nil { files.Release() return nil, syserror.EBADF @@ -109,7 +108,9 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 files, trunc := rights.Files(t, max) fds := make([]int32, 0, len(files)) for i := 0; i < max && len(files) > 0; i++ { - fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits()) + fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) files[0].DecRef() files = files[1:] if err != nil { @@ -315,8 +316,7 @@ func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { // Parse parses a raw socket control message into portable objects. func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) { var ( - fds linux.ControlMessageRights - + fds linux.ControlMessageRights haveCreds bool creds linux.ControlMessageCredentials ) diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 45bb24a3f..1f014f399 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 2a38e370a..b2b2d98a1 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -40,7 +40,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -537,7 +536,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, wq, terr := s.Endpoint.Accept() if terr != nil { @@ -575,10 +574,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - fdFlags := kernel.FDFlags{ + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + }) t.Kernel().RecordSocket(ns) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 4f670beb4..a951f1bb0 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,7 +26,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c63f3aacf..7f69406b7 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -190,7 +189,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { var peerAddr []byte var peerAddrlen uint32 var peerAddrPtr *byte @@ -236,11 +235,11 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } defer f.DecRef() - fdFlags := kernel.FDFlags{ + kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, - } - kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits()) + }) t.Kernel().RecordSocket(f) + return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index f6b001b63..45ebb2a0e 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/netlink/port", diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index ecc1e2d53..f3d6c1e9b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" @@ -272,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr } // Accept implements socket.Socket.Accept. -func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 96d374383..5061dcbde 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -25,7 +25,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/hostinet", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index cc7b964ea..ccaaddbfc 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" @@ -286,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { payload, se := rpcAccept(t, s.fd, peerRequested) // Check if we need to block. @@ -336,10 +335,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, }) defer file.DecRef() - fdFlags := kernel.FDFlags{ + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, 0, syserr.FromError(err) } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 933120f34..0efa58a58 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -53,7 +52,7 @@ type Socket interface { // Accept implements the accept4(2) linux syscall. // Returns fd, real peer address length and error. Real peer address // length is only set if len(peer) > 0. - Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) + Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) // Bind implements the bind(2) linux syscall. Bind(t *kernel.Task, sockaddr []byte) *syserr.Error diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 8580eb87d..da9977fde 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index bf7d2cfa2..b30871a90 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -194,7 +193,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, err := s.ep.Accept() if err != nil { @@ -229,10 +228,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - fdFlags := kernel.FDFlags{ + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + }) if e != nil { return 0, nil, 0, syserr.FromError(e) } diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index d77c7a433..445d25010 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/netlink", diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index 57cf6b139..5187594a7 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -50,7 +49,7 @@ func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string { if post { revents = PollEventSet.Parse(uint64(pfd.REvents)) } - return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, kdefs.FD(pfd.FD)), PollEventSet.Parse(uint64(pfd.Events)), revents) + return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, pfd.FD), PollEventSet.Parse(uint64(pfd.Events)), revents) } func pollFDs(t *kernel.Task, addr usermem.Addr, nfds uint, post bool) string { diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 86e9c5690..311389547 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -31,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -133,7 +132,7 @@ func path(t *kernel.Task, addr usermem.Addr) string { return fmt.Sprintf("%#x %s", addr, path) } -func fd(t *kernel.Task, fd kdefs.FD) string { +func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { defer root.DecRef() @@ -151,7 +150,7 @@ func fd(t *kernel.Task, fd kdefs.FD) string { return fmt.Sprintf("AT_FDCWD %s", name) } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) @@ -375,7 +374,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo } switch i.format[arg] { case FD: - output = append(output, fd(t, kdefs.FD(args[arg].Int()))) + output = append(output, fd(t, args[arg].Int())) case WriteBuffer: output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize)) case WriteIOVec: diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 18fddee76..79d972202 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/kernel", "//pkg/sentry/kernel/epoll", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index c710ec9e3..11850dd0f 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -20,20 +20,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/waiter" ) // CreateEpoll implements the epoll_create(2) linux syscall. -func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { +func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) defer file.DecRef() - flags := kernel.FDFlags{ + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, - } - fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits()) + }) if err != nil { return 0, err } @@ -42,16 +40,16 @@ func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { } // AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD. -func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { +func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { return syscall.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syscall.EBADF } @@ -68,16 +66,16 @@ func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags } // UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD. -func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { +func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { return syscall.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syscall.EBADF } @@ -94,16 +92,16 @@ func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFl } // RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL. -func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { +func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { return syscall.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syscall.EBADF } @@ -120,9 +118,9 @@ func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { } // WaitEpoll implements the epoll_wait(2) linux syscall. -func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) { +func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(fd) + epollfile := t.GetFile(fd) if epollfile == nil { return nil, syscall.EBADF } diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 6e5be0158..33a40b9c6 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -71,7 +71,6 @@ go_library( "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/eventfd", "//pkg/sentry/kernel/fasync", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 7081d1a45..f56411bfe 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -55,7 +54,7 @@ type ioCallback struct { OpCode uint16 ReqPrio int16 - FD uint32 + FD int32 Buf uint64 Bytes uint64 @@ -65,7 +64,7 @@ type ioCallback struct { Flags uint32 // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD uint32 + ResFD int32 } // ioEvent describes an I/O result. @@ -292,7 +291,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { - file := t.FDMap().GetFile(kdefs.FD(cb.FD)) + file := t.GetFile(cb.FD) if file == nil { // File not found. return syserror.EBADF @@ -302,7 +301,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File if cb.Flags&_IOCB_FLAG_RESFD != 0 { - eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD)) + eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. return syserror.EBADF diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 14a61cfa5..a0dd4d4e3 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/syscalls" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -61,9 +60,9 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // EpollCtl implements the epoll_ctl(2) linux syscall. func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := kdefs.FD(args[0].Int()) + epfd := args[0].Int() op := args[1].Int() - fd := kdefs.FD(args[2].Int()) + fd := args[2].Int() eventAddr := args[3].Pointer() // Capture the event state if needed. @@ -132,7 +131,7 @@ func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error { // EpollWait implements the epoll_wait(2) linux syscall. func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := kdefs.FD(args[0].Int()) + epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeout := int(args[3].Int()) diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 7dbe84884..5cfc2c3c8 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -47,10 +47,9 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc }) defer event.DecRef() - fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&EFD_CLOEXEC != 0, - }, - t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2776fdec7..eb6f5648f 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -34,7 +33,7 @@ import ( ) // fileOpAt performs an operation on the second last component in the path. -func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { +func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { // Extract the last component. dir, name := fs.SplitLast(path) if dir == "/" { @@ -60,7 +59,7 @@ func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dire } // fileOpOn performs an operation on the last entry of the path. -func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { +func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { var ( d *fs.Dirent // The file. wd *fs.Dirent // The working directory (if required.) @@ -78,7 +77,7 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func rel = wd } else { // Need to extract the given FD. - f = t.FDMap().GetFile(dirFD) + f = t.GetFile(dirFD) if f == nil { return syserror.EBADF } @@ -131,7 +130,7 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string return path, dirPath, nil } -func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) { +func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -184,8 +183,9 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u defer file.DecRef() // Success. - fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} - newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return err } @@ -201,7 +201,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u return fd, err // Use result in frame. } -func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -285,7 +285,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Mknodat implements the linux syscall mknodat(2). func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() path := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) // We don't need this argument until we support creation of device nodes. @@ -294,7 +294,7 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, mknodAt(t, dirFD, path, mode) } -func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { +func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -428,8 +428,9 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod } // Success. - fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} - newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return err } @@ -463,7 +464,7 @@ func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Openat implements linux syscall openat(2). func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() flags := uint(args[2].Uint()) if flags&linux.O_CREAT != 0 { @@ -502,7 +503,7 @@ func (ac accessContext) Value(key interface{}) interface{} { } } -func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error { +func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error { const rOK = 4 const wOK = 2 const xOK = 1 @@ -557,7 +558,7 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Faccessat implements linux syscall faccessat(2). func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() flags := args[3].Int() @@ -567,10 +568,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Ioctl implements linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() request := int(args[1].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -579,12 +580,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shared flags between file and socket. switch request { case linux.FIONCLEX: - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil @@ -728,9 +729,9 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fchdir implements the linux syscall fchdir(2). func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -752,10 +753,13 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file, ok := t.FDMap().Remove(fd) - if !ok { + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + file := t.FDTable().Remove(fd) + if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() @@ -766,30 +770,30 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Dup implements linux syscall dup(2). func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() - newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, syserror.EMFILE } - return uintptr(newfd), nil, nil + return uintptr(newFD), nil, nil } // Dup2 implements linux syscall dup2(2). func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldfd := kdefs.FD(args[0].Int()) - newfd := kdefs.FD(args[1].Int()) + oldfd := args[0].Int() + newfd := args[1].Int() // If oldfd is a valid file descriptor, and newfd has the same value as oldfd, // then dup2() does nothing, and returns newfd. if oldfd == newfd { - oldFile := t.FDMap().GetFile(oldfd) + oldFile := t.GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } @@ -805,21 +809,21 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Dup3 implements linux syscall dup3(2). func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldfd := kdefs.FD(args[0].Int()) - newfd := kdefs.FD(args[1].Int()) + oldfd := args[0].Int() + newfd := args[1].Int() flags := args[2].Uint() if oldfd == newfd { return 0, nil, syserror.EINVAL } - oldFile := t.FDMap().GetFile(oldfd) + oldFile := t.GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } defer oldFile.DecRef() - err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) + err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) if err != nil { return 0, nil, err } @@ -862,10 +866,10 @@ func fSetOwn(t *kernel.Task, file *fs.File, who int32) { // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() cmd := args[1].Int() - file, flags := t.FDMap().GetDescriptor(fd) + file, flags := t.FDTable().Get(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -873,9 +877,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: - from := kdefs.FD(args[2].Int()) - fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC} - fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) + from := args[2].Int() + fd, err := t.NewFDFrom(from, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) if err != nil { return 0, nil, err } @@ -884,7 +889,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) case linux.F_GETFL: @@ -945,8 +950,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // The lock uid is that of the Task's FDMap. - lockUniqueID := lock.UniqueID(t.FDMap().ID()) + // The lock uid is that of the Task's FDTable. + lockUniqueID := lock.UniqueID(t.FDTable().ID()) // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. @@ -1036,7 +1041,7 @@ const ( // Fadvise64 implements linux syscall fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() length := args[2].Int64() advice := args[3].Int() @@ -1045,7 +1050,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EINVAL } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1071,7 +1076,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } -func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1116,14 +1121,14 @@ func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Mkdirat implements linux syscall mkdirat(2). func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) return 0, nil, mkdirAt(t, dirFD, addr, mode) } -func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { +func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1163,7 +1168,7 @@ func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) } -func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error { +func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error { newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err @@ -1206,7 +1211,7 @@ func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Symlinkat implements linux syscall symlinkat(2). func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() - dirFD := kdefs.FD(args[1].Int()) + dirFD := args[1].Int() newAddr := args[2].Pointer() return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr) @@ -1248,7 +1253,7 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // linkAt creates a hard link to the target specified by oldDirFD and oldAddr, // specified by newDirFD and newAddr. If resolve is true, then the symlinks // will be followed when evaluating the target. -func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error { +func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error { oldPath, _, err := copyInPath(t, oldAddr, allowEmpty) if err != nil { return err @@ -1262,7 +1267,7 @@ func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kd } if allowEmpty && oldPath == "" { - target := t.FDMap().GetFile(oldDirFD) + target := t.GetFile(oldDirFD) if target == nil { return syserror.EBADF } @@ -1324,9 +1329,9 @@ func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Linkat implements linux syscall linkat(2). func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldDirFD := kdefs.FD(args[0].Int()) + oldDirFD := args[0].Int() oldAddr := args[1].Pointer() - newDirFD := kdefs.FD(args[2].Int()) + newDirFD := args[2].Int() newAddr := args[3].Pointer() // man linkat(2): @@ -1351,7 +1356,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } -func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { +func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -1401,7 +1406,7 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Readlinkat implements linux syscall readlinkat(2). func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() bufAddr := args[2].Pointer() size := args[3].SizeT() @@ -1410,7 +1415,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return n, nil, err } -func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { +func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1440,7 +1445,7 @@ func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Unlinkat implements linux syscall unlinkat(2). func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() flags := args[2].Uint() if flags&linux.AT_REMOVEDIR != 0 { @@ -1501,10 +1506,10 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Ftruncate implements linux syscall ftruncate(2). func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() length := args[1].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1619,7 +1624,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { return nil } -func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { +func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { path, _, err := copyInPath(t, addr, allowEmpty) if err != nil { return err @@ -1627,7 +1632,7 @@ func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty if path == "" { // Annoying. What's wrong with fchown? - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syserror.EBADF } @@ -1661,11 +1666,11 @@ func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchown implements linux syscall fchown(2). func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() uid := auth.UID(args[1].Uint()) gid := auth.GID(args[2].Uint()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1676,7 +1681,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchownat implements Linux syscall fchownat(2). func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() uid := auth.UID(args[2].Uint()) gid := auth.GID(args[3].Uint()) @@ -1706,7 +1711,7 @@ func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { return nil } -func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1727,10 +1732,10 @@ func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fchmod implements linux syscall fchmod(2). func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() mode := linux.FileMode(args[1].ModeT()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1741,7 +1746,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchmodat implements linux syscall fchmodat(2). func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) @@ -1757,7 +1762,7 @@ func defaultSetToSystemTimeSpec() fs.TimeSpec { } } -func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { +func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Does the task own the file? if !d.Inode.CheckOwnership(t) { @@ -1790,7 +1795,7 @@ func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, r // Linux returns EINVAL in this case. See utimes.c. return syserror.EINVAL } - f := t.FDMap().GetFile(dirFD) + f := t.GetFile(dirFD) if f == nil { return syserror.EBADF } @@ -1858,7 +1863,7 @@ func timespecIsValid(ts linux.Timespec) bool { // Utimensat implements linux syscall utimensat(2). func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() flags := args[3].Int() @@ -1893,7 +1898,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Futimesat implements linux syscall futimesat(2). func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() @@ -1917,7 +1922,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } -func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error { +func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err @@ -1965,21 +1970,21 @@ func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Renameat implements linux syscall renameat(2). func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldDirFD := kdefs.FD(args[0].Int()) + oldDirFD := args[0].Int() oldPathAddr := args[1].Pointer() - newDirFD := kdefs.FD(args[2].Int()) + newDirFD := args[2].Int() newPathAddr := args[3].Pointer() return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() mode := args[1].Int64() offset := args[2].Int64() length := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -2028,10 +2033,10 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Flock implements linux syscall flock(2). func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() operation := args[1].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF @@ -2138,8 +2143,9 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S defer dirent.DecRef() defer file.DecRef() - fdFlags := kernel.FDFlags{CloseOnExec: cloExec} - newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index dea872672..7a2beda23 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -23,14 +23,13 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) // Getdents implements linux syscall getdents(2) for 64bit systems. func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) @@ -46,7 +45,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Getdents64 implements linux syscall getdents64(2). func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) @@ -62,8 +61,8 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getdents implements the core of getdents(2)/getdents64(2). // f is the syscall implementation dirent serialization function. -func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { - dir := t.FDMap().GetFile(fd) +func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { + dir := t.GetFile(fd) if dir == nil { return 0, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 9cfa660fa..a4f36e01f 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -44,9 +43,9 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) defer n.DecRef() - fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err @@ -63,8 +62,8 @@ func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // fdToInotify resolves an fd to an inotify object. If successful, the file will // have an extra ref and the caller is responsible for releasing the ref. -func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { - file := t.FDMap().GetFile(fd) +func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { + file := t.GetFile(fd) if file == nil { // Invalid fd. return nil, nil, syscall.EBADF @@ -82,7 +81,7 @@ func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { // InotifyAddWatch implements the inotify_add_watch() syscall. func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() mask := args[2].Uint() @@ -114,7 +113,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern } // Copy out to the return frame. - fd = kdefs.FD(ino.AddWatch(dirent, mask)) + fd = ino.AddWatch(dirent, mask) return nil }) @@ -123,7 +122,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // InotifyRmWatch implements the inotify_rm_watch() syscall. func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() wd := args[1].Int() ino, file, err := fdToInotify(t, fd) diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index a3813b818..297e920c4 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -18,17 +18,16 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) // Lseek implements linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() offset := args[1].Int64() whence := args[2].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index d831833bc..58a05b5bb 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -40,7 +39,7 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() flags := args[3].Int() - fd := kdefs.FD(args[4].Int()) + fd := args[4].Int() fixed := flags&linux.MAP_FIXED != 0 private := flags&linux.MAP_PRIVATE != 0 shared := flags&linux.MAP_SHARED != 0 @@ -80,7 +79,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if !anon { // Convert the passed FD to a file reference. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 7c1bea43d..576022dd5 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -38,24 +38,17 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { w.SetFlags(linuxToFlags(flags).Settable()) defer w.DecRef() - rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{ - CloseOnExec: flags&linux.O_CLOEXEC != 0}, - t.ThreadGroup().Limits()) + fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return 0, err } - wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{ - CloseOnExec: flags&linux.O_CLOEXEC != 0}, - t.ThreadGroup().Limits()) - if err != nil { - t.FDMap().Remove(rfd) - return 0, err - } - - if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil { - t.FDMap().Remove(rfd) - t.FDMap().Remove(wfd) + if _, err := t.CopyOut(addr, fds); err != nil { + // The files are not closed in this case, the exact semantics + // of this error case are not well defined, but they could have + // already been observed by user space. return 0, syscall.EFAULT } return 0, nil diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index ef6211218..7a13beac2 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -64,7 +63,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan return } - file := t.FDMap().GetFile(kdefs.FD(pfd.FD)) + file := t.GetFile(pfd.FD) if file == nil { pfd.REvents = linux.POLLNVAL return @@ -265,7 +264,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // immediately to ensure we don't leak. Note, another thread // might be about to close fd. This is racy, but that's // OK. Linux is racy in the same way. - file := t.FDMap().GetFile(kdefs.FD(fd)) + file := t.GetFile(fd) if file == nil { return 0, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 9d70881fd..5329023e6 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/mm" ) @@ -122,9 +121,9 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch args[1].Int() { case linux.PR_SET_MM_EXE_FILE: - fd := kdefs.FD(args[2].Int()) + fd := args[2].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index a1965f490..b2474e60d 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -39,11 +38,11 @@ const ( // they can do large reads all at once. Bug for bug. Same for other read // calls below. func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -75,12 +74,12 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Pread64 implements linux syscall pread64(2). func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -122,11 +121,11 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Readv implements linux syscall readv(2). func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -152,12 +151,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Preadv implements linux syscall preadv(2). func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -201,13 +200,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 5th argument. - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() flags := int(args[5].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index ccdb079bb..d1165a57a 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -200,9 +199,9 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal }) defer s.DecRef() - fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } @@ -225,9 +224,6 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy fileFlags := fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, } - fdFlags := kernel.FDFlags{ - CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, - } // Create the socket pair. s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) @@ -240,20 +236,16 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy defer s2.DecRef() // Create the FDs for the sockets. - fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits()) - if err != nil { - return 0, nil, err - } - fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits()) + fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) if err != nil { - t.FDMap().Remove(fd1) return 0, nil, err } // Copy the file descriptors out. - if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil { - t.FDMap().Remove(fd1) - t.FDMap().Remove(fd2) + if _, err := t.CopyOut(socks, fds); err != nil { + // Note that we don't close files here; see pipe(2) also. return 0, nil, err } @@ -262,12 +254,12 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Connect implements the linux syscall connect(2). func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -291,14 +283,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // accept is the implementation of the accept syscall. It is called by accept // and accept4 syscall handlers. -func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { +func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, syscall.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, syscall.EBADF } @@ -331,7 +323,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr // Accept4 implements the linux syscall accept4(2). func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() flags := int(args[3].Int()) @@ -342,7 +334,7 @@ func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Accept implements the linux syscall accept(2). func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() @@ -352,12 +344,12 @@ func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Bind implements the linux syscall bind(2). func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -380,11 +372,11 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Listen implements the linux syscall listen(2). func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() backlog := args[1].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -409,11 +401,11 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Shutdown implements the linux syscall shutdown(2). func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() how := args[1].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -437,14 +429,14 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // GetSockOpt implements the linux syscall getsockopt(2). func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLenAddr := args[4].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -521,14 +513,14 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac // // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLen := args[4].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -561,12 +553,12 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // GetSockName implements the linux syscall getsockname(2). func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -589,12 +581,12 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // GetPeerName implements the linux syscall getpeername(2). func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -617,7 +609,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // RecvMsg implements the linux syscall recvmsg(2). func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() @@ -627,7 +619,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -663,7 +655,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // RecvMMsg implements the linux syscall recvmmsg(2). func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() @@ -680,7 +672,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -842,7 +834,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // recvFrom is the implementation of the recvfrom syscall. It is called by // recvfrom and recv syscall handlers. -func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { +func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { if int(bufLen) < 0 { return 0, syscall.EINVAL } @@ -853,7 +845,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, syscall.EBADF } @@ -903,7 +895,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f // RecvFrom implements the linux syscall recvfrom(2). func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() @@ -916,7 +908,7 @@ func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // SendMsg implements the linux syscall sendmsg(2). func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() @@ -926,7 +918,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -953,7 +945,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // SendMMsg implements the linux syscall sendmmsg(2). func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() @@ -964,7 +956,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -1079,14 +1071,14 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // sendTo is the implementation of the sendto syscall. It is called by sendto // and send syscall handlers. -func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { +func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { return 0, syscall.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, syscall.EBADF } @@ -1135,7 +1127,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla // SendTo implements the linux syscall sendto(2). func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index b6517313f..a7c98efcb 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -81,8 +80,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB // Sendfile implements linux system call sendfile(2). func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - outFD := kdefs.FD(args[0].Int()) - inFD := kdefs.FD(args[1].Int()) + outFD := args[0].Int() + inFD := args[1].Int() offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) @@ -92,13 +91,13 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } @@ -163,9 +162,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Splice implements splice(2). func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - inFD := kdefs.FD(args[0].Int()) + inFD := args[0].Int() inOffset := args[1].Pointer() - outFD := kdefs.FD(args[2].Int()) + outFD := args[2].Int() outOffset := args[3].Pointer() count := int64(args[4].SizeT()) flags := args[5].Int() @@ -182,13 +181,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } @@ -251,8 +250,8 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Tee imlements tee(2). func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - inFD := kdefs.FD(args[0].Int()) - outFD := kdefs.FD(args[1].Int()) + inFD := args[0].Int() + outFD := args[1].Int() count := int64(args[2].SizeT()) flags := args[3].Int() @@ -265,13 +264,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9a5657254..5556bc276 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,7 +41,7 @@ func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Fstatat implements linux syscall newfstatat, i.e. fstatat(2). func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() statAddr := args[2].Pointer() flags := args[3].Int() @@ -54,7 +53,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if path == "" { // Annoying. What's wrong with fstat? - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -93,10 +92,10 @@ func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fstat implements linux syscall fstat(2). func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() statAddr := args[1].Pointer() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -178,7 +177,7 @@ func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs // Statx implements linux syscall statx(2). func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() mask := args[3].Uint() @@ -197,7 +196,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if path == "" { - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -285,10 +284,10 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fstatfs implements linux syscall fstatfs(2). func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() statfsAddr := args[1].Pointer() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 37225735f..3e55235bd 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +31,9 @@ func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Syncfs implements linux system call syncfs(2). func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -47,9 +46,9 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fsync implements linux syscall fsync(2). func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -63,9 +62,9 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // At the moment, it just calls Fsync, which is a big hammer, but correct. func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -79,6 +78,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { var err error + fd := args[0].Int() offset := args[1].Int64() nbytes := args[2].Int64() uflags := args[3].Uint() @@ -97,8 +97,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel nbytes = fs.FileMaxOffset } - fd := kdefs.FD(args[0].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index ea6d44315..1ce5ce4c3 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -49,9 +48,9 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) - fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } @@ -61,7 +60,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // TimerfdSettime implements Linux syscall timerfd_settime(2). func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() flags := args[1].Int() newValAddr := args[2].Pointer() oldValAddr := args[3].Pointer() @@ -70,7 +69,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne return 0, nil, syserror.EINVAL } - f := t.FDMap().GetFile(fd) + f := t.GetFile(fd) if f == nil { return 0, nil, syserror.EBADF } @@ -101,10 +100,10 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // TimerfdGettime implements Linux syscall timerfd_gettime(2). func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() curValAddr := args[1].Pointer() - f := t.FDMap().GetFile(fd) + f := t.GetFile(fd) if f == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 3a5bf9ac4..5278c96a6 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -39,11 +38,11 @@ const ( // Write implements linux syscall write(2). func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -75,12 +74,12 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Pwrite64 implements linux syscall pwrite64(2). func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -122,11 +121,11 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Writev implements linux syscall writev(2). func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -152,12 +151,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Pwritev implements linux syscall pwritev(2). func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -202,7 +201,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 5th argument. - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() @@ -212,7 +211,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EACCES } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index d91f66d95..16cd6540f 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -52,7 +52,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", "//pkg/sentry/pgalloc", diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 59e1b46ec..e5de1f3d7 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -21,32 +21,23 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" ) -// createFDMap creates an FD map that contains stdin, stdout, and stderr. If -// console is true, then ioctl calls will be passed through to the host FD. +// createFDTable creates an FD table that contains stdin, stdout, and stderr. +// If console is true, then ioctl calls will be passed through to the host FD. // Upon success, createFDMap dups then closes stdioFDs. -func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { if len(stdioFDs) != 3 { return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } k := kernel.KernelFromContext(ctx) - fdm := k.NewFDMap() - defer fdm.DecRef() + fdTable := k.NewFDTable() + defer fdTable.DecRef() mounter := fs.FileOwnerFromContext(ctx) - // Maps sandbox FD to host FD. - fdMap := map[int]int{ - 0: stdioFDs[0], - 1: stdioFDs[1], - 2: stdioFDs[2], - } - var ttyFile *fs.File - for appFD, hostFD := range fdMap { + for appFD, hostFD := range stdioFDs { var appFile *fs.File if console && appFD < 3 { @@ -80,11 +71,11 @@ func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs } // Add the file to the FD map. - if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { return nil, err } } - fdm.IncRef() - return fdm, nil + fdTable.IncRef() + return fdTable, nil } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 9da0c7067..f9a6f2d3c 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -25,8 +25,10 @@ import ( // Include filesystem types that OCI spec might mount. _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + "gvisor.dev/gvisor/pkg/sentry/fs/gofer" _ "gvisor.dev/gvisor/pkg/sentry/fs/host" _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" @@ -36,8 +38,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/gofer" - "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 89f7d9f94..7e27d1f49 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -517,13 +517,13 @@ func (l *Loader) run() error { // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. ctx := l.rootProcArgs.NewContext(l.k) - fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs) + fdTable, err := createFDTable(ctx, l.console, l.stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on FDMap if successful. We won't need // ours either way. - l.rootProcArgs.FDMap = fdm + l.rootProcArgs.FDTable = fdTable // cid for root container can be empty. Only subcontainers need it to set // the mount location. @@ -562,13 +562,13 @@ func (l *Loader) run() error { return fmt.Errorf("creating init process: %v", err) } - // CreateProcess takes a reference on FDMap if successful. - l.rootProcArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDTable if successful. + l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() if l.console { - ttyFile := l.rootProcArgs.FDMap.GetFile(0) + ttyFile, _ := l.rootProcArgs.FDTable.Get(0) defer ttyFile.DecRef() ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) @@ -648,13 +648,13 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Create the FD map, which will set stdin, stdout, and stderr. ctx := procArgs.NewContext(l.k) - fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs) + fdTable, err := createFDTable(ctx, false, stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } - // CreateProcess takes a reference on FDMap if successful. We won't need ours - // either way. - procArgs.FDMap = fdm + // CreateProcess takes a reference on fdTable if successful. We won't + // need ours either way. + procArgs.FDTable = fdTable // Can't take ownership away from os.File. dup them to get a new FDs. var goferFDs []int @@ -683,8 +683,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file } l.k.StartProcess(tg) - // CreateProcess takes a reference on FDMap if successful. - procArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDTable if successful. + procArgs.FDTable.DecRef() l.processes[eid].tg = tg return nil -- cgit v1.2.3 From 67f2cefce02816307805699c3462d6fd7ce61b69 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 3 Jul 2019 22:50:26 -0700 Subject: Avoid importing platforms from many source files PiperOrigin-RevId: 256494243 --- pkg/sentry/platform/BUILD | 1 + pkg/sentry/platform/kvm/BUILD | 2 ++ pkg/sentry/platform/kvm/filters.go | 33 ++++++++++++++++++++++++++++++ pkg/sentry/platform/kvm/kvm.go | 14 +++++++++++++ pkg/sentry/platform/platform.go | 28 ++++++++++++++++++++++++++ pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/filters.go | 33 ++++++++++++++++++++++++++++++ pkg/sentry/platform/ptrace/ptrace.go | 15 ++++++++++++++ runsc/BUILD | 2 ++ runsc/boot/BUILD | 3 +-- runsc/boot/config.go | 38 ++--------------------------------- runsc/boot/filter/BUILD | 2 -- runsc/boot/filter/config.go | 23 --------------------- runsc/boot/filter/filter.go | 13 +----------- runsc/boot/loader.go | 20 ++++++------------ runsc/boot/loader_test.go | 1 + runsc/boot/platforms/BUILD | 16 +++++++++++++++ runsc/boot/platforms/platforms.go | 30 +++++++++++++++++++++++++++ runsc/cmd/BUILD | 1 + runsc/cmd/boot.go | 3 ++- runsc/container/BUILD | 1 + runsc/container/container_test.go | 3 ++- runsc/main.go | 7 ++++--- runsc/sandbox/BUILD | 3 ++- runsc/sandbox/sandbox.go | 23 +++++++++------------ runsc/test/testutil/testutil.go | 1 + 26 files changed, 209 insertions(+), 108 deletions(-) create mode 100644 pkg/sentry/platform/kvm/filters.go create mode 100644 pkg/sentry/platform/ptrace/filters.go create mode 100644 runsc/boot/platforms/BUILD create mode 100644 runsc/boot/platforms/platforms.go (limited to 'runsc') diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 0b9962b2b..9aa6ec507 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/abi/linux", "//pkg/atomicbitops", "//pkg/log", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 9ccf77fdf..ad8b95744 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -14,6 +14,7 @@ go_library( "bluepill_fault.go", "bluepill_unsafe.go", "context.go", + "filters.go", "kvm.go", "kvm_amd64.go", "kvm_amd64_unsafe.go", @@ -33,6 +34,7 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/procid", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters.go new file mode 100644 index 000000000..7d949f1dd --- /dev/null +++ b/pkg/sentry/platform/kvm/filters.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the KVM platform. +func (*KVM) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_IOCTL: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_RT_SIGSUSPEND: {}, + syscall.SYS_RT_SIGTIMEDWAIT: {}, + 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. + } +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index b49d7f3c4..ee4cd2f4d 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -141,3 +141,17 @@ func (k *KVM) NewContext() platform.Context { machine: k.machine, } } + +type constructor struct{} + +func (*constructor) New(f *os.File) (platform.Platform, error) { + return New(f) +} + +func (*constructor) OpenDevice() (*os.File, error) { + return OpenDevice() +} + +func init() { + platform.Register("kvm", &constructor{}) +} diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index eccbe2336..ec22dbf87 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -19,8 +19,10 @@ package platform import ( "fmt" + "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -93,6 +95,9 @@ type Platform interface { // Platforms for which this does not hold may panic if PreemptAllCPUs is // called. PreemptAllCPUs() error + + // SyscallFilters returns syscalls made exclusively by this platform. + SyscallFilters() seccomp.SyscallRules } // NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and @@ -347,3 +352,26 @@ type File interface { func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } + +// Constructor represents a platform type. +type Constructor interface { + New(deviceFile *os.File) (Platform, error) + OpenDevice() (*os.File, error) +} + +// platforms contains all available platform types. +var platforms = map[string]Constructor{} + +// Register registers a new platform type. +func Register(name string, platform Constructor) { + platforms[name] = platform +} + +// Lookup looks up the platform constructor by name. +func Lookup(name string) (Constructor, error) { + p, ok := platforms[name] + if !ok { + return nil, fmt.Errorf("unknown platform: %v", name) + } + return p, nil +} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 6a1343f47..1b6c54e96 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "ptrace", srcs = [ + "filters.go", "ptrace.go", "ptrace_unsafe.go", "stub_amd64.s", diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go new file mode 100644 index 000000000..1e07cfd0d --- /dev/null +++ b/pkg/sentry/platform/ptrace/filters.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the ptrace platform. +func (*PTrace) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + unix.SYS_GETCPU: {}, + unix.SYS_SCHED_SETAFFINITY: {}, + syscall.SYS_PTRACE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_WAIT4: {}, + } +} diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index ee7e0640c..6fd30ed25 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -45,6 +45,7 @@ package ptrace import ( + "os" "sync" "gvisor.dev/gvisor/pkg/abi/linux" @@ -236,3 +237,17 @@ func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan s func (*PTrace) NewContext() platform.Context { return &context{} } + +type constructor struct{} + +func (*constructor) New(*os.File) (platform.Platform, error) { + return New() +} + +func (*constructor) OpenDevice() (*os.File, error) { + return nil, nil +} + +func init() { + platform.Register("ptrace", &constructor{}) +} diff --git a/runsc/BUILD b/runsc/BUILD index 0e4cf8e09..6b8c92706 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -16,6 +16,7 @@ go_binary( x_defs = {"main.version": "{VERSION}"}, deps = [ "//pkg/log", + "//pkg/sentry/platform", "//runsc/boot", "//runsc/cmd", "//runsc/specutils", @@ -47,6 +48,7 @@ go_binary( x_defs = {"main.version": "{VERSION}"}, deps = [ "//pkg/log", + "//pkg/sentry/platform", "//runsc/boot", "//runsc/cmd", "//runsc/specutils", diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 16cd6540f..5025401dd 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -56,8 +56,6 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/platform/kvm", - "//pkg/sentry/platform/ptrace", "//pkg/sentry/sighandling", "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/hostinet", @@ -86,6 +84,7 @@ go_library( "//pkg/tcpip/transport/udp", "//pkg/urpc", "//runsc/boot/filter", + "//runsc/boot/platforms", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 6d276f207..6f1eb9a41 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -22,40 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" ) -// PlatformType tells which platform to use. -type PlatformType int - -const ( - // PlatformPtrace runs the sandbox with the ptrace platform. - PlatformPtrace PlatformType = iota - - // PlatformKVM runs the sandbox with the KVM platform. - PlatformKVM -) - -// MakePlatformType converts type from string. -func MakePlatformType(s string) (PlatformType, error) { - switch s { - case "ptrace": - return PlatformPtrace, nil - case "kvm": - return PlatformKVM, nil - default: - return 0, fmt.Errorf("invalid platform type %q", s) - } -} - -func (p PlatformType) String() string { - switch p { - case PlatformPtrace: - return "ptrace" - case PlatformKVM: - return "kvm" - default: - return fmt.Sprintf("unknown(%d)", p) - } -} - // FileAccessType tells how the filesystem is accessed. type FileAccessType int @@ -187,7 +153,7 @@ type Config struct { LogPackets bool // Platform is the platform to run on. - Platform PlatformType + Platform string // Strace indicates that strace should be enabled. Strace bool @@ -247,7 +213,7 @@ func (c *Config) ToFlags() []string { "--overlay=" + strconv.FormatBool(c.Overlay), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), - "--platform=" + c.Platform.String(), + "--platform=" + c.Platform, "--strace=" + strconv.FormatBool(c.Strace), "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 07898f3de..f5509b6b7 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -20,8 +20,6 @@ go_library( "//pkg/log", "//pkg/seccomp", "//pkg/sentry/platform", - "//pkg/sentry/platform/kvm", - "//pkg/sentry/platform/ptrace", "//pkg/tcpip/link/fdbased", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index e4ccb40d9..0ee5b8bbd 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -437,29 +437,6 @@ func hostInetFilters() seccomp.SyscallRules { } } -// ptraceFilters returns syscalls made exclusively by the ptrace platform. -func ptraceFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - unix.SYS_GETCPU: {}, - unix.SYS_SCHED_SETAFFINITY: {}, - syscall.SYS_PTRACE: {}, - syscall.SYS_TGKILL: {}, - syscall.SYS_WAIT4: {}, - } -} - -// kvmFilters returns syscalls made exclusively by the KVM platform. -func kvmFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_IOCTL: {}, - syscall.SYS_MMAP: {}, - syscall.SYS_RT_SIGSUSPEND: {}, - syscall.SYS_RT_SIGTIMEDWAIT: {}, - 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. - } -} - func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT: []seccomp.Rule{ diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 468481f29..e80c171b3 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -18,13 +18,9 @@ package filter import ( - "fmt" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm" - "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" ) // Options are seccomp filter related options. @@ -53,14 +49,7 @@ func Install(opt Options) error { s.Merge(profileFilters()) } - switch p := opt.Platform.(type) { - case *ptrace.PTrace: - s.Merge(ptraceFilters()) - case *kvm.KVM: - s.Merge(kvmFilters()) - default: - return fmt.Errorf("unknown platform type %T", p) - } + s.Merge(opt.Platform.SyscallFilters()) return seccomp.Install(s) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 7e27d1f49..38e426ee7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -42,8 +42,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm" - "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" "gvisor.dev/gvisor/pkg/sentry/sighandling" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/time" @@ -59,6 +57,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" + _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. @@ -416,19 +415,12 @@ func (l *Loader) Destroy() { } func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { - switch conf.Platform { - case PlatformPtrace: - log.Infof("Platform: ptrace") - return ptrace.New() - case PlatformKVM: - log.Infof("Platform: kvm") - if deviceFile == nil { - return nil, fmt.Errorf("kvm device file must be provided") - } - return kvm.New(deviceFile) - default: - return nil, fmt.Errorf("invalid platform %v", conf.Platform) + p, err := platform.Lookup(conf.Platform) + if err != nil { + panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) } + log.Infof("Platform: %s", conf.Platform) + return p.New(deviceFile) } func createMemoryFile() (*pgalloc.MemoryFile, error) { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index eca592e5b..ff713660d 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -47,6 +47,7 @@ func testConfig() *Config { RootDir: "unused_root_dir", Network: NetworkNone, DisableSeccomp: true, + Platform: "ptrace", } } diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD new file mode 100644 index 000000000..03391cdca --- /dev/null +++ b/runsc/boot/platforms/BUILD @@ -0,0 +1,16 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "platforms", + srcs = ["platforms.go"], + importpath = "gvisor.dev/gvisor/runsc/boot/platforms", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + ], +) diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go new file mode 100644 index 000000000..056b46ad5 --- /dev/null +++ b/runsc/boot/platforms/platforms.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package platforms imports all available platform packages. +package platforms + +import ( + // Import platforms that runsc might use. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +const ( + // Ptrace runs the sandbox with the ptrace platform. + Ptrace = "ptrace" + + // KVM runs the sandbox with the KVM platform. + KVM = "kvm" +) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2c8b84252..5223b9972 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -46,6 +46,7 @@ go_library( "//pkg/unet", "//pkg/urpc", "//runsc/boot", + "//runsc/boot/platforms", "//runsc/console", "//runsc/container", "//runsc/fsgofer", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 272eb14d3..b40fded5b 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" ) @@ -172,7 +173,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if caps == nil { caps = &specs.LinuxCapabilities{} } - if conf.Platform == boot.PlatformPtrace { + if conf.Platform == platforms.Ptrace { // Ptrace platform requires extra capabilities. const c = "CAP_SYS_PTRACE" caps.Bounding = append(caps.Bounding, c) diff --git a/runsc/container/BUILD b/runsc/container/BUILD index ebe77165e..e246c38ae 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -53,6 +53,7 @@ go_test( "//pkg/unet", "//pkg/urpc", "//runsc/boot", + "//runsc/boot/platforms", "//runsc/specutils", "//runsc/test/testutil", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index b09b80715..baba09fe1 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/test/testutil" ) @@ -256,7 +257,7 @@ func configs(opts ...configOption) []*boot.Config { if testutil.RaceEnabled { continue } - c.Platform = boot.PlatformKVM + c.Platform = platforms.KVM case nonExclusiveFS: c.FileAccess = boot.FileAccessShared default: diff --git a/runsc/main.go b/runsc/main.go index 135061cd3..bc83c57a2 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -30,6 +30,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cmd" "gvisor.dev/gvisor/runsc/specutils" @@ -61,7 +62,7 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") gso = flag.Bool("gso", true, "enable generic segmenation offload") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") @@ -139,8 +140,8 @@ func main() { } cmd.ErrorLogger = errorLogger - platformType, err := boot.MakePlatformType(*platform) - if err != nil { + platformType := *platformName + if _, err := platform.Lookup(platformType); err != nil { cmd.Fatalf("%v", err) } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index f32da45c1..7fdceaab6 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -18,9 +18,10 @@ go_library( "//pkg/control/server", "//pkg/log", "//pkg/sentry/control", - "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform", "//pkg/urpc", "//runsc/boot", + "//runsc/boot/platforms", "//runsc/cgroup", "//runsc/console", "//runsc/specutils", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6bebf0737..4a11f617d 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -32,9 +32,10 @@ import ( "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/specutils" @@ -491,7 +492,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF {Type: specs.UTSNamespace}, } - if conf.Platform == boot.PlatformPtrace { + if conf.Platform == platforms.Ptrace { // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. log.Infof("Sandbox will be started in the current PID namespace") @@ -1046,19 +1047,15 @@ func (s *Sandbox) waitForStopped() error { // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. -func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { - var ( - f *os.File - err error - ) - switch p { - case boot.PlatformKVM: - f, err = kvm.OpenDevice() - default: - return nil, nil +func deviceFileForPlatform(name string) (*os.File, error) { + p, err := platform.Lookup(name) + if err != nil { + return nil, err } + + f, err := p.OpenDevice() if err != nil { return nil, fmt.Errorf("opening device file for platform %q: %v", p, err) } - return f, err + return f, nil } diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index ecab6871d..a98675bfc 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -132,6 +132,7 @@ func TestConfig() *boot.Config { LogPackets: true, Network: boot.NetworkNone, Strace: true, + Platform: "ptrace", FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, NumNetworkChannels: 1, -- cgit v1.2.3 From 659bebab8e83ec9b5f6fef26ca27048af526ee40 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 8 Jul 2019 12:55:37 -0700 Subject: Don't try to execute a file that is not regular. PiperOrigin-RevId: 257037608 --- pkg/sentry/fs/mounts.go | 5 ++++ runsc/container/container_test.go | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'runsc') diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index ce7ffeed2..693ffc760 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -663,6 +663,11 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s } defer d.DecRef() + // Check that it is a regular file. + if !IsRegular(d.Inode.StableAttr) { + continue + } + // Check whether we can read and execute the found file. if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index baba09fe1..c1d6ca7b8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -17,6 +17,7 @@ package container import ( "bytes" "fmt" + "io" "io/ioutil" "os" "path" @@ -409,6 +410,46 @@ func TestLifecycle(t *testing.T) { // Test the we can execute the application with different path formats. func TestExePath(t *testing.T) { + // Create two directories that will be prepended to PATH. + firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first") + if err != nil { + t.Fatal(err) + } + secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second") + if err != nil { + t.Fatal(err) + } + + // Create two minimal executables in the second path, two of which + // will be masked by files in first path. + for _, p := range []string{"unmasked", "masked1", "masked2"} { + path := filepath.Join(secondPath, p) + f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777) + if err != nil { + t.Fatal(err) + } + defer f.Close() + if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil { + t.Fatal(err) + } + } + + // Create a non-executable file in the first path which masks a healthy + // executable in the second. + nonExecutable := filepath.Join(firstPath, "masked1") + f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666) + if err != nil { + t.Fatal(err) + } + f2.Close() + + // Create a non-regular file in the first path which masks a healthy + // executable in the second. + nonRegular := filepath.Join(firstPath, "masked2") + if err := os.Mkdir(nonRegular, 0777); err != nil { + t.Fatal(err) + } + for _, conf := range configs(overlay) { t.Logf("Running test with conf: %+v", conf) for _, test := range []struct { @@ -421,8 +462,24 @@ func TestExePath(t *testing.T) { {path: "thisfiledoesntexit", success: false}, {path: "bin/thisfiledoesntexit", success: false}, {path: "/bin/thisfiledoesntexit", success: false}, + + {path: "unmasked", success: true}, + {path: filepath.Join(firstPath, "unmasked"), success: false}, + {path: filepath.Join(secondPath, "unmasked"), success: true}, + + {path: "masked1", success: true}, + {path: filepath.Join(firstPath, "masked1"), success: false}, + {path: filepath.Join(secondPath, "masked1"), success: true}, + + {path: "masked2", success: true}, + {path: filepath.Join(firstPath, "masked2"), success: false}, + {path: filepath.Join(secondPath, "masked2"), success: true}, } { spec := testutil.NewSpecWithArgs(test.path) + spec.Process.Env = []string{ + fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")), + } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("exec: %s, error setting up container: %v", test.path, err) -- cgit v1.2.3