diff options
Diffstat (limited to 'pkg/sentry')
67 files changed, 1469 insertions, 1054 deletions
diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD index 14a8bf9cd..71c59287c 100644 --- a/pkg/sentry/devices/tundev/BUILD +++ b/pkg/sentry/devices/tundev/BUILD @@ -17,7 +17,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip/link/tun", - "//pkg/tcpip/network/arp", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index ff5d49fbd..d8f4e1d35 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -16,8 +16,6 @@ package tundev import ( - "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -28,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" - "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -91,16 +88,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg return 0, err } flags := usermem.ByteOrder.Uint16(req.Data[:]) - created, err := fd.device.SetIff(stack.Stack, req.Name(), flags) - if err == nil && created { - // Always start with an ARP address for interfaces so they can handle ARP - // packets. - nicID := fd.device.NICID() - if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID)) - } - } - return 0, err + return 0, fd.device.SetIff(stack.Stack, req.Name(), flags) case linux.TUNGETIFF: var req linux.IFReq diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 6b7b451b8..9379a4d7b 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/syserror", "//pkg/tcpip/link/tun", - "//pkg/tcpip/network/arp", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index 19ffdec47..5227ef652 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -15,8 +15,6 @@ package dev import ( - "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -27,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" - "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -112,16 +109,7 @@ func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io user return 0, err } flags := usermem.ByteOrder.Uint16(req.Data[:]) - created, err := n.device.SetIff(stack.Stack, req.Name(), flags) - if err == nil && created { - // Always start with an ARP address for interfaces so they can handle ARP - // packets. - nicID := n.device.NICID() - if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID)) - } - } - return 0, err + return 0, n.device.SetIff(stack.Stack, req.Name(), flags) case linux.TUNGETIFF: var req linux.IFReq diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 8fe626e1c..e6171dd1d 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -57,16 +57,16 @@ type execArgInode struct { var _ fs.InodeOperations = (*execArgInode)(nil) // newExecArgFile creates a file containing the exec args of the given type. -func newExecArgInode(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { +func newExecArgInode(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { if arg != cmdlineExecArg && arg != environExecArg { panic(fmt.Sprintf("unknown exec arg type %v", arg)) } f := &execArgInode{ - SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), arg: arg, t: t, } - return newProcInode(t, f, msrc, fs.SpecialFile, t) + return newProcInode(ctx, f, msrc, fs.SpecialFile, t) } // GetFile implements fs.InodeOperations.GetFile. diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 45523adf8..e90da225a 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -95,13 +95,13 @@ var _ fs.InodeOperations = (*fd)(nil) // newFd returns a new fd based on an existing file. // // This inherits one reference to the file. -func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { +func newFd(ctx context.Context, t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { fd := &fd{ // RootOwner overridden by taskOwnedInodeOps.UnstableAttrs(). - Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), file: f, } - return newProcInode(t, fd, msrc, fs.Symlink, t) + return newProcInode(ctx, fd, msrc, fs.Symlink, t) } // GetFile returns the fs.File backing this fd. The dirent and flags @@ -153,12 +153,12 @@ type fdDir struct { var _ fs.InodeOperations = (*fdDir)(nil) // newFdDir creates a new fdDir. -func newFdDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newFdDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { f := &fdDir{ - Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}), + Dir: *ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}), t: t, } - return newProcInode(t, f, msrc, fs.SpecialDirectory, t) + return newProcInode(ctx, f, msrc, fs.SpecialDirectory, t) } // Check implements InodeOperations.Check. @@ -183,7 +183,7 @@ func (f *fdDir) Check(ctx context.Context, inode *fs.Inode, req fs.PermMask) boo // Lookup loads an Inode in /proc/TID/fd into a Dirent. func (f *fdDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { n, err := walkDescriptors(f.t, p, func(file *fs.File, _ kernel.FDFlags) *fs.Inode { - return newFd(f.t, file, dir.MountSource) + return newFd(ctx, f.t, file, dir.MountSource) }) if err != nil { return nil, err @@ -237,12 +237,12 @@ type fdInfoDir struct { } // newFdInfoDir creates a new fdInfoDir. -func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newFdInfoDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { fdid := &fdInfoDir{ - Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)), + Dir: *ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0500)), t: t, } - return newProcInode(t, fdid, msrc, fs.SpecialDirectory, t) + return newProcInode(ctx, fdid, msrc, fs.SpecialDirectory, t) } // Lookup loads an fd in /proc/TID/fdinfo into a Dirent. diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 83a43aa26..03127f816 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -41,7 +41,7 @@ import ( // LINT.IfChange // newNetDir creates a new proc net entry. -func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newNetDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { k := t.Kernel() var contents map[string]*fs.Inode @@ -49,39 +49,39 @@ func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. contents = map[string]*fs.Inode{ - "dev": seqfile.NewSeqFileInode(t, &netDev{s: s}, msrc), - "snmp": seqfile.NewSeqFileInode(t, &netSnmp{s: s}, msrc), + "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), + "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), // The following files are simple stubs until they are // implemented in netstack, if the file contains a // header the stub is just the header otherwise it is // an empty file. - "arp": newStaticProcInode(t, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), + "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), - "netlink": newStaticProcInode(t, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), - "netstat": newStaticProcInode(t, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), - "packet": newStaticProcInode(t, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), - "protocols": newStaticProcInode(t, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), + "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), + "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), + "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), + "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), // Linux sets psched values to: nsec per usec, psched // tick in ns, 1000000, high res timer ticks per sec // (ClockGetres returns 1ns resolution). - "psched": newStaticProcInode(t, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), - "ptype": newStaticProcInode(t, msrc, []byte("Type Device Function\n")), - "route": seqfile.NewSeqFileInode(t, &netRoute{s: s}, msrc), - "tcp": seqfile.NewSeqFileInode(t, &netTCP{k: k}, msrc), - "udp": seqfile.NewSeqFileInode(t, &netUDP{k: k}, msrc), - "unix": seqfile.NewSeqFileInode(t, &netUnix{k: k}, msrc), + "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), + "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function\n")), + "route": seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc), + "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), + "udp": seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc), + "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), } if s.SupportsIPv6() { - contents["if_inet6"] = seqfile.NewSeqFileInode(t, &ifinet6{s: s}, msrc) - contents["ipv6_route"] = newStaticProcInode(t, msrc, []byte("")) - contents["tcp6"] = seqfile.NewSeqFileInode(t, &netTCP6{k: k}, msrc) - contents["udp6"] = newStaticProcInode(t, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) + contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc) + contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte("")) + contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc) + contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) } } - d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(t, d, msrc, fs.SpecialDirectory, t) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(ctx, d, msrc, fs.SpecialDirectory, t) } // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 77e0e1d26..2f2a9f920 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -179,7 +179,7 @@ func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dire } // Wrap it in a taskDir. - td := p.newTaskDir(otherTask, dir.MountSource, true) + td := p.newTaskDir(ctx, otherTask, dir.MountSource, true) return fs.NewDirent(ctx, td, name), nil } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 450044c9c..f43d6c221 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -79,45 +79,45 @@ type taskDir struct { var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. -func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { +func (p *proc) newTaskDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "cwd": newCwd(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - "io": newIO(t, msrc, isThreadGroup), - "maps": newMaps(t, msrc), - "mem": newMem(t, msrc), - "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "net": newNetDir(t, msrc), - "ns": newNamespaceDir(t, msrc), - "oom_score": newOOMScore(t, msrc), - "oom_score_adj": newOOMScoreAdj(t, msrc), - "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), - "statm": newStatm(t, msrc), - "status": newStatus(t, msrc, p.pidns), - "uid_map": newUIDMap(t, msrc), + "auxv": newAuxvec(ctx, t, msrc), + "cmdline": newExecArgInode(ctx, t, msrc, cmdlineExecArg), + "comm": newComm(ctx, t, msrc), + "cwd": newCwd(ctx, t, msrc), + "environ": newExecArgInode(ctx, t, msrc, environExecArg), + "exe": newExe(ctx, t, msrc), + "fd": newFdDir(ctx, t, msrc), + "fdinfo": newFdInfoDir(ctx, t, msrc), + "gid_map": newGIDMap(ctx, t, msrc), + "io": newIO(ctx, t, msrc, isThreadGroup), + "maps": newMaps(ctx, t, msrc), + "mem": newMem(ctx, t, msrc), + "mountinfo": seqfile.NewSeqFileInode(ctx, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(ctx, &mountsFile{t: t}, msrc), + "net": newNetDir(ctx, t, msrc), + "ns": newNamespaceDir(ctx, t, msrc), + "oom_score": newOOMScore(ctx, msrc), + "oom_score_adj": newOOMScoreAdj(ctx, t, msrc), + "smaps": newSmaps(ctx, t, msrc), + "stat": newTaskStat(ctx, t, msrc, isThreadGroup, p.pidns), + "statm": newStatm(ctx, t, msrc), + "status": newStatus(ctx, t, msrc, p.pidns), + "uid_map": newUIDMap(ctx, t, msrc), } if isThreadGroup { - contents["task"] = p.newSubtasks(t, msrc) + contents["task"] = p.newSubtasks(ctx, t, msrc) } if len(p.cgroupControllers) > 0 { - contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) + contents["cgroup"] = newCGroupInode(ctx, msrc, p.cgroupControllers) } // N.B. taskOwnedInodeOps enforces dumpability-based ownership. d := &taskDir{ - Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + Dir: *ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), t: t, } - return newProcInode(t, d, msrc, fs.SpecialDirectory, t) + return newProcInode(ctx, d, msrc, fs.SpecialDirectory, t) } // subtasks represents a /proc/TID/task directory. @@ -132,13 +132,13 @@ type subtasks struct { var _ fs.InodeOperations = (*subtasks)(nil) -func (p *proc) newSubtasks(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func (p *proc) newSubtasks(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { s := &subtasks{ - Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)), + Dir: *ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)), t: t, p: p, } - return newProcInode(t, s, msrc, fs.SpecialDirectory, t) + return newProcInode(ctx, s, msrc, fs.SpecialDirectory, t) } // UnstableAttr returns unstable attributes of the subtasks. @@ -243,7 +243,7 @@ func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dir return nil, syserror.ENOENT } - td := s.p.newTaskDir(task, dir.MountSource, false) + td := s.p.newTaskDir(ctx, task, dir.MountSource, false) return fs.NewDirent(ctx, td, p), nil } @@ -256,12 +256,12 @@ type exe struct { t *kernel.Task } -func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newExe(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { exeSymlink := &exe{ - Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), t: t, } - return newProcInode(t, exeSymlink, msrc, fs.Symlink, t) + return newProcInode(ctx, exeSymlink, msrc, fs.Symlink, t) } func (e *exe) executable() (file fsbridge.File, err error) { @@ -311,12 +311,12 @@ type cwd struct { t *kernel.Task } -func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newCwd(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { cwdSymlink := &cwd{ - Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), t: t, } - return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t) + return newProcInode(ctx, cwdSymlink, msrc, fs.Symlink, t) } // Readlink implements fs.InodeOperations. @@ -355,17 +355,17 @@ type namespaceSymlink struct { t *kernel.Task } -func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { +func newNamespaceSymlink(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { // TODO(rahat): Namespace symlinks should contain the namespace name and the // inode number for the namespace instance, so for example user:[123456]. We // currently fake the inode number by sticking the symlink inode in its // place. target := fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno()) n := &namespaceSymlink{ - Symlink: *ramfs.NewSymlink(t, fs.RootOwner, target), + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, target), t: t, } - return newProcInode(t, n, msrc, fs.Symlink, t) + return newProcInode(ctx, n, msrc, fs.Symlink, t) } // Readlink reads the symlink value. @@ -390,14 +390,14 @@ func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Di return fs.NewDirent(ctx, newProcInode(ctx, iops, inode.MountSource, fs.RegularFile, nil), n.Symlink.Target), nil } -func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newNamespaceDir(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { contents := map[string]*fs.Inode{ - "net": newNamespaceSymlink(t, msrc, "net"), - "pid": newNamespaceSymlink(t, msrc, "pid"), - "user": newNamespaceSymlink(t, msrc, "user"), + "net": newNamespaceSymlink(ctx, t, msrc, "net"), + "pid": newNamespaceSymlink(ctx, t, msrc, "pid"), + "user": newNamespaceSymlink(ctx, t, msrc, "user"), } - d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0511)) - return newProcInode(t, d, msrc, fs.SpecialDirectory, t) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0511)) + return newProcInode(ctx, d, msrc, fs.SpecialDirectory, t) } // memData implements fs.Inode for /proc/[pid]/mem. @@ -428,12 +428,12 @@ type memDataFile struct { t *kernel.Task } -func newMem(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newMem(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { inode := &memData{ - SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0400), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0400), linux.PROC_SUPER_MAGIC), t: t, } - return newProcInode(t, inode, msrc, fs.SpecialFile, t) + return newProcInode(ctx, inode, msrc, fs.SpecialFile, t) } // Truncate implements fs.InodeOperations.Truncate. @@ -489,8 +489,8 @@ type mapsData struct { t *kernel.Task } -func newMaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) +func newMaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &mapsData{t}), msrc, fs.SpecialFile, t) } func (md *mapsData) mm() *mm.MemoryManager { @@ -529,8 +529,8 @@ type smapsData struct { t *kernel.Task } -func newSmaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) +func newSmaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &smapsData{t}), msrc, fs.SpecialFile, t) } func (sd *smapsData) mm() *mm.MemoryManager { @@ -575,8 +575,8 @@ type taskStatData struct { pidns *kernel.PIDNamespace } -func newTaskStat(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) +func newTaskStat(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode { + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -660,8 +660,8 @@ type statmData struct { t *kernel.Task } -func newStatm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &statmData{t}), msrc, fs.SpecialFile, t) +func newStatm(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &statmData{t}), msrc, fs.SpecialFile, t) } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. @@ -697,8 +697,8 @@ type statusData struct { pidns *kernel.PIDNamespace } -func newStatus(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { - return newProcInode(t, seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) +func newStatus(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &statusData{t, pidns}), msrc, fs.SpecialFile, t) } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. @@ -768,11 +768,11 @@ type ioData struct { ioUsage } -func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { +func newIO(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { if isThreadGroup { - return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) } - return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t) + return newProcInode(ctx, seqfile.NewSeqFile(ctx, &ioData{t}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -816,12 +816,12 @@ type comm struct { } // newComm returns a new comm file. -func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newComm(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { c := &comm{ - SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), t: t, } - return newProcInode(t, c, msrc, fs.SpecialFile, t) + return newProcInode(ctx, c, msrc, fs.SpecialFile, t) } // Check implements fs.InodeOperations.Check. @@ -888,12 +888,12 @@ type auxvec struct { } // newAuxvec returns a new auxvec file. -func newAuxvec(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newAuxvec(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { a := &auxvec{ - SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), t: t, } - return newProcInode(t, a, msrc, fs.SpecialFile, t) + return newProcInode(ctx, a, msrc, fs.SpecialFile, t) } // GetFile implements fs.InodeOperations.GetFile. @@ -949,8 +949,8 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc // newOOMScore returns a oom_score file. It is a stub that always returns 0. // TODO(gvisor.dev/issue/1967) -func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newStaticProcInode(t, msrc, []byte("0\n")) +func newOOMScore(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return newStaticProcInode(ctx, msrc, []byte("0\n")) } // oomScoreAdj is a file containing the oom_score adjustment for a task. @@ -979,12 +979,12 @@ type oomScoreAdjFile struct { } // newOOMScoreAdj returns a oom_score_adj file. -func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { +func newOOMScoreAdj(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { i := &oomScoreAdj{ - SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), t: t, } - return newProcInode(t, i, msrc, fs.SpecialFile, t) + return newProcInode(ctx, i, msrc, fs.SpecialFile, t) } // Truncate implements fs.InodeOperations.Truncate. Truncate is called when diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 8d9517b95..2bc9485d8 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -58,18 +58,18 @@ type idMapInodeOperations struct { var _ fs.InodeOperations = (*idMapInodeOperations)(nil) // newUIDMap returns a new uid_map file. -func newUIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newIDMap(t, msrc, false /* gids */) +func newUIDMap(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newIDMap(ctx, t, msrc, false /* gids */) } // newGIDMap returns a new gid_map file. -func newGIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newIDMap(t, msrc, true /* gids */) +func newGIDMap(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newIDMap(ctx, t, msrc, true /* gids */) } -func newIDMap(t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { - return newProcInode(t, &idMapInodeOperations{ - InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), +func newIDMap(ctx context.Context, t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { + return newProcInode(ctx, &idMapInodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), t: t, gids: gids, }, msrc, fs.SpecialFile, t) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index 8ccda1264..34d25a61e 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -193,11 +192,12 @@ func (conn *connection) loadInitializedChan(closed bool) { } } -// newFUSEConnection creates a FUSE connection to fd. -func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) { - // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to - // mount a FUSE filesystem. - fuseFD := fd.Impl().(*DeviceFD) +// newFUSEConnection creates a FUSE connection to fuseFD. +func newFUSEConnection(_ context.Context, fuseFD *DeviceFD, opts *filesystemOptions) (*connection, error) { + // Mark the device as ready so it can be used. + // FIXME(gvisor.dev/issue/4813): fuseFD's fields are accessed without + // synchronization and without checking if fuseFD has already been used to + // mount another filesystem. // Create the writeBuf for the header to be stored in. hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) diff --git a/pkg/sentry/fsimpl/fuse/connection_control.go b/pkg/sentry/fsimpl/fuse/connection_control.go index bfde78559..1b3459c1d 100644 --- a/pkg/sentry/fsimpl/fuse/connection_control.go +++ b/pkg/sentry/fsimpl/fuse/connection_control.go @@ -198,7 +198,6 @@ func (conn *connection) Abort(ctx context.Context) { if !conn.connected { conn.asyncMu.Unlock() conn.mu.Unlock() - conn.fd.mu.Unlock() return } diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 1b86a4b4c..1bbe6fdb7 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -94,7 +94,8 @@ type DeviceFD struct { // unprocessed in-flight requests. fullQueueCh chan struct{} `state:".(int)"` - // fs is the FUSE filesystem that this FD is being used for. + // fs is the FUSE filesystem that this FD is being used for. A reference is + // held on fs. fs *filesystem } @@ -135,12 +136,6 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R return 0, syserror.EPERM } - // Return ENODEV if the filesystem is umounted. - if fd.fs.umounted { - // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs. - return 0, syserror.ENODEV - } - // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this @@ -368,7 +363,7 @@ func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - if fd.fs.umounted { + if fd.fs == nil || fd.fs.umounted { ready |= waiter.EventErr return ready & mask } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index cd0eb56e5..23e827f90 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -127,7 +127,13 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) return nil, nil, syserror.EINVAL } - fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor)) + fuseFDGeneric := kernelTask.GetFileVFS2(int32(deviceDescriptor)) + defer fuseFDGeneric.DecRef(ctx) + fuseFD, ok := fuseFDGeneric.Impl().(*DeviceFD) + if !ok { + log.Warningf("%s.GetFilesystem: device FD is %T, not a FUSE device", fsType.Name, fuseFDGeneric) + return nil, nil, syserror.EINVAL + } // Parse and set all the other supported FUSE mount options. // TODO(gVisor.dev/issue/3229): Expand the supported mount options. @@ -189,18 +195,17 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Create a new FUSE filesystem. - fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + fs, err := newFUSEFilesystem(ctx, vfsObj, &fsType, fuseFD, devMinor, &fsopts) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) return nil, nil, err } - fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - // Send a FUSE_INIT request to the FUSE daemon server before returning. // This call is not blocking. if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) + fs.VFSFilesystem().DecRef(ctx) // returned by newFUSEFilesystem return nil, nil, err } @@ -211,20 +216,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // newFUSEFilesystem creates a new FUSE filesystem. -func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { - conn, err := newFUSEConnection(ctx, device, opts) +func newFUSEFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, fsType *FilesystemType, fuseFD *DeviceFD, devMinor uint32, opts *filesystemOptions) (*filesystem, error) { + conn, err := newFUSEConnection(ctx, fuseFD, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL } - fuseFD := device.Impl().(*DeviceFD) fs := &filesystem{ devMinor: devMinor, opts: opts, conn: conn, } + fs.VFSFilesystem().Init(vfsObj, fsType, fs) + + // FIXME(gvisor.dev/issue/4813): Doesn't conn or fs need to hold a + // reference on fuseFD, since conn uses fuseFD for communication with the + // server? Wouldn't doing so create a circular reference? + fs.VFSFilesystem().IncRef() // for fuseFD.fs + // FIXME(gvisor.dev/issue/4813): fuseFD.fs is accessed without + // synchronization. fuseFD.fs = fs + return fs, nil } diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go index b2f4276b8..2c0cc0f4e 100644 --- a/pkg/sentry/fsimpl/fuse/utils_test.go +++ b/pkg/sentry/fsimpl/fuse/utils_test.go @@ -52,28 +52,21 @@ func setup(t *testing.T) *testutil.System { // newTestConnection creates a fuse connection that the sentry can communicate with // and the FD for the server to communicate with. func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { - vfsObj := &vfs.VirtualFilesystem{} fuseDev := &DeviceFD{} - if err := vfsObj.Init(system.Ctx); err != nil { - return nil, nil, err - } - - vd := vfsObj.NewAnonVirtualDentry("genCountFD") + vd := system.VFS.NewAnonVirtualDentry("fuse") defer vd.DecRef(system.Ctx) - if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { + if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, nil, err } fsopts := filesystemOptions{ maxActiveRequests: maxActiveRequests, } - fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) + fs, err := newFUSEFilesystem(system.Ctx, system.VFS, &FilesystemType{}, fuseDev, 0, &fsopts) if err != nil { return nil, nil, err } - fs.VFSFilesystem().Init(vfsObj, nil, fs) - return fs.conn, &fuseDev.vfsfd, nil } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 7ab298019..2294c490e 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -114,6 +114,51 @@ func putDentrySlice(ds *[]*dentry) { dentrySlicePool.Put(ds) } +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *dsp with fs.renameMu locked +// for writing. +// +// dsp is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { + fs.renameMu.RUnlock() + if *dsp == nil { + return + } + ds := **dsp + // Only go through calling dentry.checkCachingLocked() (which requires + // re-locking renameMu) if we actually have any dentries with zero refs. + checkAny := false + for i := range ds { + if atomic.LoadInt64(&ds[i].refs) == 0 { + checkAny = true + break + } + } + if checkAny { + fs.renameMu.Lock() + for _, d := range ds { + d.checkCachingLocked(ctx) + } + fs.renameMu.Unlock() + } + putDentrySlice(*dsp) +} + +func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkCachingLocked(ctx) + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + // stepLocked resolves rp.Component() to an existing file, starting from the // given directory. // @@ -651,41 +696,6 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return nil } -// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls -// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for -// writing. -// -// ds is a pointer-to-pointer since defer evaluates its arguments immediately, -// but dentry slices are allocated lazily, and it's much easier to say "defer -// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { -// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { - fs.renameMu.RUnlock() - if *ds == nil { - return - } - if len(**ds) != 0 { - fs.renameMu.Lock() - for _, d := range **ds { - d.checkCachingLocked(ctx) - } - fs.renameMu.Unlock() - } - putDentrySlice(*ds) -} - -func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { - if *ds == nil { - fs.renameMu.Unlock() - return - } - for _, d := range **ds { - d.checkCachingLocked(ctx) - } - fs.renameMu.Unlock() - putDentrySlice(*ds) -} - // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 53bcc9986..75a836899 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1352,6 +1352,11 @@ func (d *dentry) checkCachingLocked(ctx context.Context) { } if refs > 0 { if d.cached { + // This isn't strictly necessary (fs.cachedDentries is permitted to + // contain dentries with non-zero refs, which are skipped by + // fs.evictCachedDentryLocked() upon reaching the end of the LRU), + // but since we are already holding fs.renameMu for writing we may + // as well. d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index bc07d72c0..d55bdc97f 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -78,26 +78,36 @@ func putDentrySlice(ds *[]*dentry) { } // renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls -// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// dentry.checkDropLocked on all dentries in *dsp with fs.renameMu locked for // writing. // -// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// dsp is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() - if *ds == nil { + if *dsp == nil { return } - if len(**ds) != 0 { + ds := **dsp + // Only go through calling dentry.checkDropLocked() (which requires + // re-locking renameMu) if we actually have any dentries with zero refs. + checkAny := false + for i := range ds { + if atomic.LoadInt64(&ds[i].refs) == 0 { + checkAny = true + break + } + } + if checkAny { fs.renameMu.Lock() - for _, d := range **ds { + for _, d := range ds { d.checkDropLocked(ctx) } fs.renameMu.Unlock() } - putDentrySlice(*ds) + putDentrySlice(*dsp) } func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index e001d5032..c53cc0122 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -50,7 +50,7 @@ type subtasksInode struct { var _ kernfs.Inode = (*subtasksInode)(nil) -func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { +func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { subInode := &subtasksInode{ fs: fs, task: task, @@ -58,7 +58,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. - subInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + subInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) subInode.InitRefs() @@ -80,7 +80,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, syserror.ENOENT } - return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) + return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers) } // IterDirents implements kernfs.inodeDirectory.IterDirents. diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index dc46a09bc..fea138f93 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -47,50 +47,50 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) { +func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) { if task.ExitState() == kernel.TaskExitDead { return nil, syserror.ESRCH } contents := map[string]kernfs.Inode{ - "auxv": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}), - "cmdline": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": fs.newComm(task, fs.NextIno(), 0444), - "cwd": fs.newCwdSymlink(task, fs.NextIno()), - "environ": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - "exe": fs.newExeSymlink(task, fs.NextIno()), - "fd": fs.newFDDirInode(task), - "fdinfo": fs.newFDInfoDirInode(task), - "gid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}), - "mem": fs.newMemInode(task, fs.NextIno(), 0400), - "mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}), - "net": fs.newTaskNetDir(task), - "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{ - "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), - "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), - "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), + "auxv": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": fs.newComm(ctx, task, fs.NextIno(), 0444), + "cwd": fs.newCwdSymlink(ctx, task, fs.NextIno()), + "environ": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": fs.newExeSymlink(ctx, task, fs.NextIno()), + "fd": fs.newFDDirInode(ctx, task), + "fdinfo": fs.newFDInfoDirInode(ctx, task), + "gid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}), + "mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400), + "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{task: task}), + "net": fs.newTaskNetDir(ctx, task), + "ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{ + "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "net"), + "pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "pid"), + "user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "user"), }), - "oom_score": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}), - "stat": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}), - "status": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) + contents["task"] = fs.newSubtasks(ctx, task, pidns, cgroupControllers) } if len(cgroupControllers) > 0 { - contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) + contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) } taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. - taskInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) taskInode.InitRefs() inode := &taskOwnedInode{Inode: taskInode, owner: task} @@ -143,17 +143,17 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode { +func (fs *filesystem) newTaskOwnedInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. - inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) + inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) return &taskOwnedInode{Inode: inode, owner: task} } -func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode { +func (fs *filesystem) newTaskOwnedDir(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero} - dir := kernfs.NewStaticDir(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts) + dir := kernfs.NewStaticDir(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts) return &taskOwnedInode{Inode: dir, owner: task} } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 3ec4471f5..02bf74dbc 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -119,7 +119,7 @@ type fdDirInode struct { var _ kernfs.Inode = (*fdDirInode)(nil) -func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode { +func (fs *filesystem) newFDDirInode(ctx context.Context, task *kernel.Task) kernfs.Inode { inode := &fdDirInode{ fdDir: fdDir{ fs: fs, @@ -127,7 +127,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode { produceSymlink: true, }, } - inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.InitRefs() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) return inode @@ -148,7 +148,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } - return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil + return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil } // Open implements kernfs.Inode.Open. @@ -204,12 +204,12 @@ type fdSymlink struct { var _ kernfs.Inode = (*fdSymlink)(nil) -func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode { +func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ task: task, fd: fd, } - inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -257,14 +257,14 @@ type fdInfoDirInode struct { var _ kernfs.Inode = (*fdInfoDirInode)(nil) -func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode { +func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task) kernfs.Inode { inode := &fdInfoDirInode{ fdDir: fdDir{ fs: fs, task: task, }, } - inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.InitRefs() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) return inode @@ -284,7 +284,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, task: i.task, fd: fd, } - return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil + return i.fs.newTaskOwnedInode(ctx, i.task, i.fs.NextIno(), 0444, data), nil } // IterDirents implements Inode.IterDirents. diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index ba71d0fde..a3780b222 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -248,9 +248,9 @@ type commInode struct { task *kernel.Task } -func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { +func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { inode := &commInode{task: task} - inode.DynamicBytesFile.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) + inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) return inode } @@ -383,10 +383,10 @@ type memInode struct { locks vfs.FileLocks } -func (fs *filesystem) newMemInode(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { +func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. inode := &memInode{task: task} - inode.init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) + inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) return &taskOwnedInode{Inode: inode, owner: task} } @@ -812,9 +812,9 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode { +func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { inode := &exeSymlink{task: task} - inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -888,9 +888,9 @@ type cwdSymlink struct { var _ kernfs.Inode = (*cwdSymlink)(nil) -func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode { +func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { inode := &cwdSymlink{task: task} - inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -999,7 +999,7 @@ type namespaceSymlink struct { task *kernel.Task } -func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode { +func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -1007,7 +1007,7 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. - inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) + inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} return taskInode diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 5a9ee111f..5cf8a071a 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -37,7 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode { +func (fs *filesystem) newTaskNetDir(ctx context.Context, task *kernel.Task) kernfs.Inode { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) @@ -57,37 +57,37 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. contents = map[string]kernfs.Inode{ - "dev": fs.newInode(task, root, 0444, &netDevData{stack: stack}), - "snmp": fs.newInode(task, root, 0444, &netSnmpData{stack: stack}), + "dev": fs.newInode(ctx, root, 0444, &netDevData{stack: stack}), + "snmp": fs.newInode(ctx, root, 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. - "arp": fs.newInode(task, root, 0444, newStaticFile(arp)), - "netlink": fs.newInode(task, root, 0444, newStaticFile(netlink)), - "netstat": fs.newInode(task, root, 0444, &netStatData{}), - "packet": fs.newInode(task, root, 0444, newStaticFile(packet)), - "protocols": fs.newInode(task, root, 0444, newStaticFile(protocols)), + "arp": fs.newInode(ctx, root, 0444, newStaticFile(arp)), + "netlink": fs.newInode(ctx, root, 0444, newStaticFile(netlink)), + "netstat": fs.newInode(ctx, root, 0444, &netStatData{}), + "packet": fs.newInode(ctx, root, 0444, newStaticFile(packet)), + "protocols": fs.newInode(ctx, root, 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": fs.newInode(task, root, 0444, newStaticFile(psched)), - "ptype": fs.newInode(task, root, 0444, newStaticFile(ptype)), - "route": fs.newInode(task, root, 0444, &netRouteData{stack: stack}), - "tcp": fs.newInode(task, root, 0444, &netTCPData{kernel: k}), - "udp": fs.newInode(task, root, 0444, &netUDPData{kernel: k}), - "unix": fs.newInode(task, root, 0444, &netUnixData{kernel: k}), + "psched": fs.newInode(ctx, root, 0444, newStaticFile(psched)), + "ptype": fs.newInode(ctx, root, 0444, newStaticFile(ptype)), + "route": fs.newInode(ctx, root, 0444, &netRouteData{stack: stack}), + "tcp": fs.newInode(ctx, root, 0444, &netTCPData{kernel: k}), + "udp": fs.newInode(ctx, root, 0444, &netUDPData{kernel: k}), + "unix": fs.newInode(ctx, root, 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = fs.newInode(task, root, 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = fs.newInode(task, root, 0444, newStaticFile("")) - contents["tcp6"] = fs.newInode(task, root, 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = fs.newInode(task, root, 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newInode(ctx, root, 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newInode(ctx, root, 0444, newStaticFile("")) + contents["tcp6"] = fs.newInode(ctx, root, 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newInode(ctx, root, 0444, newStaticFile(upd6)) } } - return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents) + return fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 151d1f10d..fdc580610 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -118,7 +118,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err return nil, syserror.ENOENT } - return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) + return i.fs.newTaskInode(ctx, task, i.pidns, true, i.cgroupControllers) } // IterDirents implements kernfs.inodeDirectory.IterDirents. diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 738c0c9cc..205ad8192 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -136,7 +136,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns config := &kernel.TaskConfig{ Kernel: k, ThreadGroup: tc, - TaskContext: &kernel.TaskContext{Name: name, MemoryManager: m}, + TaskImage: &kernel.TaskImage{Name: name, MemoryManager: m}, Credentials: auth.CredentialsFromContext(ctx), NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e39cd305b..61138a7a4 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -381,6 +381,8 @@ afterTrailingSymlink: creds := rp.Credentials() child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)) parentDir.insertChildLocked(child, name) + child.IncRef() + defer child.DecRef(ctx) unlock() fd, err := child.open(ctx, rp, &opts, true) if err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 98680fde9..f8e0cffb0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -565,7 +565,7 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // -// Preconditions: inode.mu must be held. +// Preconditions: rw.file.inode.mu must be held. func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { // Hold dataMu so we can modify size. rw.file.dataMu.Lock() @@ -657,7 +657,7 @@ exitLoop: // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.off > rw.file.size { - rw.file.size = rw.off + atomic.StoreUint64(&rw.file.size, rw.off) } return done, retErr diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 85a3dfe20..0c9c639d3 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -478,9 +478,9 @@ func (i *inode) statTo(stat *linux.Statx) { stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino - stat.Atime = linux.NsecToStatxTimestamp(i.atime) - stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) - stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) + stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.atime)) + stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.ctime)) + stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.mtime)) stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = i.fs.devMinor switch impl := i.impl.(type) { @@ -631,7 +631,8 @@ func (i *inode) direntType() uint8 { } func (i *inode) isDir() bool { - return linux.FileMode(i.mode).FileType() == linux.S_IFDIR + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + return mode.FileType() == linux.S_IFDIR } func (i *inode) touchAtime(mnt *vfs.Mount) { diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 4e8d63d51..59fcff498 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -16,6 +16,7 @@ package verity import ( "bytes" + "encoding/json" "fmt" "io" "strconv" @@ -31,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -105,8 +107,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // Dentries which may have a reference count of zero, and which therefore // should be dropped once traversal is complete, are appended to ds. // -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR @@ -156,15 +160,19 @@ afterSymlink: return child, nil } -// verifyChild verifies the hash of child against the already verified hash of -// the parent to ensure the child is expected. verifyChild triggers a sentry -// panic if unexpected modifications to the file system are detected. In +// verifyChildLocked verifies the hash of child against the already verified +// hash of the parent to ensure the child is expected. verifyChild triggers a +// sentry panic if unexpected modifications to the file system are detected. In // noCrashOnVerificationFailure mode it returns a syserror instead. -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// // TODO(b/166474175): Investigate all possible errors returned in this // function, and make sure we differentiate all errors that indicate unexpected // modifications to the file system from the ones that are not harmful. -func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { +func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { vfsObj := fs.vfsfs.VirtualFilesystem() // Get the path to the child dentry. This is only used to provide path @@ -266,35 +274,44 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // contain the hash of the children in the parent Merkle tree when // Verify returns with success. var buf bytes.Buffer - if _, err := merkletree.Verify(&merkletree.VerifyParams{ - Out: &buf, - File: &fdReader, - Tree: &fdReader, - Size: int64(parentSize), - Name: parent.name, - Mode: uint32(parentStat.Mode), - UID: parentStat.UID, - GID: parentStat.GID, + parent.hashMu.RLock() + _, err = merkletree.Verify(&merkletree.VerifyParams{ + Out: &buf, + File: &fdReader, + Tree: &fdReader, + Size: int64(parentSize), + Name: parent.name, + Mode: uint32(parentStat.Mode), + UID: parentStat.UID, + GID: parentStat.GID, + Children: parent.childrenNames, //TODO(b/156980949): Support passing other hash algorithms. HashAlgorithms: fs.alg.toLinuxHashAlg(), ReadOffset: int64(offset), ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())), Expected: parent.hash, DataAndTreeInSameFile: true, - }); err != nil && err != io.EOF { + }) + parent.hashMu.RUnlock() + if err != nil && err != io.EOF { return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err)) } // Cache child hash when it's verified the first time. + child.hashMu.Lock() if len(child.hash) == 0 { child.hash = buf.Bytes() } + child.hashMu.Unlock() return child, nil } -// verifyStat verifies the stat against the verified hash. The mode/uid/gid of -// the file is cached after verified. -func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error { +// verifyStatAndChildrenLocked verifies the stat and children names against the +// verified hash. The mode/uid/gid and childrenNames of the file is cached +// after verified. +// +// Preconditions: d.dirMu must be locked. +func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error { vfsObj := fs.vfsfs.VirtualFilesystem() // Get the path to the child dentry. This is only used to provide path @@ -337,20 +354,65 @@ func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Stat return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) } + if d.isDir() && len(d.childrenNames) == 0 { + childrenOffString, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: childrenOffsetXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenOffsetXattr, childPath, err)) + } + if err != nil { + return err + } + childrenOffset, err := strconv.Atoi(childrenOffString) + if err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err)) + } + + childrenSizeString, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: childrenSizeXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenSizeXattr, childPath, err)) + } + if err != nil { + return err + } + childrenSize, err := strconv.Atoi(childrenSizeString) + if err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err)) + } + + childrenNames := make([]byte, childrenSize) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(childrenOffset), vfs.ReadOptions{}); err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to read children map for %s: %v", childPath, err)) + } + + if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames of %s: %v", childPath, err)) + } + } + fdReader := vfs.FileReadWriteSeeker{ FD: fd, Ctx: ctx, } var buf bytes.Buffer + d.hashMu.RLock() params := &merkletree.VerifyParams{ - Out: &buf, - Tree: &fdReader, - Size: int64(size), - Name: d.name, - Mode: uint32(stat.Mode), - UID: stat.UID, - GID: stat.GID, + Out: &buf, + Tree: &fdReader, + Size: int64(size), + Name: d.name, + Mode: uint32(stat.Mode), + UID: stat.UID, + GID: stat.GID, + Children: d.childrenNames, //TODO(b/156980949): Support passing other hash algorithms. HashAlgorithms: fs.alg.toLinuxHashAlg(), ReadOffset: 0, @@ -359,6 +421,7 @@ func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Stat Expected: d.hash, DataAndTreeInSameFile: false, } + d.hashMu.RUnlock() if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR { params.DataAndTreeInSameFile = true } @@ -373,7 +436,9 @@ func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Stat return nil } -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if child, ok := parent.children[name]; ok { // If verity is enabled on child, we should check again whether @@ -422,7 +487,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s // be cached before enabled. if fs.allowRuntimeEnable { if parent.verityEnabled() { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { return nil, err } } @@ -438,7 +503,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s if err != nil { return nil, err } - if err := fs.verifyStat(ctx, child, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { return nil, err } } @@ -458,96 +523,64 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s return child, nil } -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { vfsObj := fs.vfsfs.VirtualFilesystem() - childVD, childErr := parent.getLowerAt(ctx, vfsObj, name) - // We will handle ENOENT separately, as it may indicate unexpected - // modifications to the file system, and may cause a sentry panic. - if childErr != nil && childErr != syserror.ENOENT { - return nil, childErr + if parent.verityEnabled() { + if _, ok := parent.childrenNames[name]; !ok { + return nil, syserror.ENOENT + } } - // The dentry needs to be cleaned up if any error occurs. IncRef will be - // called if a verity child dentry is successfully created. - if childErr == nil { - defer childVD.DecRef(ctx) + parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD) + if err != nil { + return nil, err } - childMerkleVD, childMerkleErr := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) - // We will handle ENOENT separately, as it may indicate unexpected - // modifications to the file system, and may cause a sentry panic. - if childMerkleErr != nil && childMerkleErr != syserror.ENOENT { - return nil, childMerkleErr + childVD, err := parent.getLowerAt(ctx, vfsObj, name) + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("file %s expected but not found", parentPath+"/"+name)) + } + if err != nil { + return nil, err } // The dentry needs to be cleaned up if any error occurs. IncRef will be // called if a verity child dentry is successfully created. - if childMerkleErr == nil { - defer childMerkleVD.DecRef(ctx) - } + defer childVD.DecRef(ctx) - // Get the path to the parent dentry. This is only used to provide path - // information in failure case. - parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD) - if err != nil { + childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + if err == syserror.ENOENT { + if !fs.allowRuntimeEnable { + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath+"/"+name)) + } + childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + Path: fspath.Parse(merklePrefix + name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, + }) + if err != nil { + return nil, err + } + childMerkleFD.DecRef(ctx) + childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + if err != nil { + return nil, err + } + } + if err != nil && err != syserror.ENOENT { return nil, err } - // TODO(b/166474175): Investigate all possible errors of childErr and - // childMerkleErr, and make sure we differentiate all errors that - // indicate unexpected modifications to the file system from the ones - // that are not harmful. - if childErr == syserror.ENOENT && childMerkleErr == nil { - // Failed to get child file/directory dentry. However the - // corresponding Merkle tree is found. This indicates an - // unexpected modification to the file system that - // removed/renamed the child. - return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name)) - } else if childErr == nil && childMerkleErr == syserror.ENOENT { - // If in allowRuntimeEnable mode, and the Merkle tree file is - // not created yet, we create an empty Merkle tree file, so that - // if the file is enabled through ioctl, we have the Merkle tree - // file open and ready to use. - // This may cause empty and unused Merkle tree files in - // allowRuntimeEnable mode, if they are never enabled. This - // does not affect verification, as we rely on cached hash to - // decide whether to perform verification, not the existence of - // the Merkle tree file. Also, those Merkle tree files are - // always hidden and cannot be accessed by verity fs users. - if fs.allowRuntimeEnable { - childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ - Root: parent.lowerVD, - Start: parent.lowerVD, - Path: fspath.Parse(merklePrefix + name), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT, - Mode: 0644, - }) - if err != nil { - return nil, err - } - childMerkleFD.DecRef(ctx) - childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name) - if err != nil { - return nil, err - } - } else { - // If runtime enable is not allowed. This indicates an - // unexpected modification to the file system that - // removed/renamed the Merkle tree file. - return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name)) - } - } else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT { - // Both the child and the corresponding Merkle tree are missing. - // This could be an unexpected modification or due to incorrect - // parameter. - // TODO(b/167752508): Investigate possible ways to differentiate - // cases that both files are deleted from cases that they never - // exist in the file system. - return nil, alertIntegrityViolation(fmt.Sprintf("Failed to find file %s", parentPath+"/"+name)) - } + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + defer childMerkleVD.DecRef(ctx) mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ @@ -577,18 +610,19 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, child.mode = uint32(stat.Mode) child.uid = stat.UID child.gid = stat.GID + child.childrenNames = make(map[string]struct{}) // Verify child hash. This should always be performed unless in // allowRuntimeEnable mode and the parent directory hasn't been enabled // yet. if parent.verityEnabled() { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { child.destroyLocked(ctx) return nil, err } } if child.verityEnabled() { - if err := fs.verifyStat(ctx, child, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { child.destroyLocked(ctx) return nil, err } @@ -602,7 +636,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -943,11 +979,13 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } + d.dirMu.Lock() if d.verityEnabled() { - if err := fs.verifyStat(ctx, d, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return linux.Statx{}, err } } + d.dirMu.Unlock() return stat, nil } diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index faa862c55..46346e54d 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -19,9 +19,22 @@ // The verity file system is read-only, except for one case: when // allowRuntimeEnable is true, additional Merkle files can be generated using // the FS_IOC_ENABLE_VERITY ioctl. +// +// Lock order: +// +// filesystem.renameMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. package verity import ( + "encoding/json" "fmt" "math" "strconv" @@ -60,6 +73,15 @@ const ( // file size. For a directory, this is the size of all its children's hashes. merkleSizeXattr = "user.merkle.size" + // childrenOffsetXattr is the extended attribute name specifying the + // names of the offset of the serialized children names in the Merkle + // tree file. + childrenOffsetXattr = "user.merkle.childrenOffset" + + // childrenSizeXattr is the extended attribute name specifying the size + // of the serialized children names. + childrenSizeXattr = "user.merkle.childrenSize" + // sizeOfStringInt32 is the size for a 32 bit integer stored as string in // extended attributes. The maximum value of a 32 bit integer has 10 digits. sizeOfStringInt32 = 10 @@ -299,14 +321,77 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt d.uid = stat.UID d.gid = stat.GID d.hash = make([]byte, len(iopts.RootHash)) + d.childrenNames = make(map[string]struct{}) if !fs.allowRuntimeEnable { - if err := fs.verifyStat(ctx, d, stat); err != nil { + // Get children names from the underlying file system. + offString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: childrenOffsetXattr, + Size: sizeOfStringInt32, + }) + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenOffsetXattr, err)) + } + if err != nil { + return nil, nil, err + } + + off, err := strconv.Atoi(offString) + if err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err)) + } + + sizeString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: childrenSizeXattr, + Size: sizeOfStringInt32, + }) + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenSizeXattr, err)) + } + if err != nil { + return nil, nil, err + } + size, err := strconv.Atoi(sizeString) + if err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err)) + } + + lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err == syserror.ENOENT { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to open root Merkle file: %v", err)) + } + if err != nil { + return nil, nil, err + } + + childrenNames := make([]byte, size) + if _, err := lowerMerkleFD.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(off), vfs.ReadOptions{}); err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to read root children map: %v", err)) + } + + if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil { + return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err)) + } + + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return nil, nil, err } } + d.hashMu.Lock() copy(d.hash, iopts.RootHash) + d.hashMu.Unlock() d.vfsd.Init(d) fs.rootDentry = d @@ -331,7 +416,8 @@ type dentry struct { fs *filesystem // mode, uid, gid and size are the file mode, owner, group, and size of - // the file in the underlying file system. + // the file in the underlying file system. They are set when a dentry + // is initialized, and never modified. mode uint32 uid uint32 gid uint32 @@ -352,15 +438,24 @@ type dentry struct { dirMu sync.Mutex `state:"nosave"` children map[string]*dentry - // lowerVD is the VirtualDentry in the underlying file system. + // childrenNames stores the name of all children of the dentry. This is + // used by verity to check whether a child is expected. This is only + // populated by enableVerity. childrenNames is also protected by dirMu. + childrenNames map[string]struct{} + + // lowerVD is the VirtualDentry in the underlying file system. It is + // never modified after initialized. lowerVD vfs.VirtualDentry // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree - // in the underlying file system. + // in the underlying file system. It is never modified after + // initialized. lowerMerkleVD vfs.VirtualDentry - // hash is the calculated hash for the current file or directory. - hash []byte + // hash is the calculated hash for the current file or directory. hash + // is protected by hashMu. + hashMu sync.RWMutex `state:"nosave"` + hash []byte } // newDentry creates a new dentry representing the given verity file. The @@ -443,7 +538,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) { // destroyLocked destroys the dentry. // -// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +// Preconditions: +// * d.fs.renameMu must be locked for writing. +// * d.refs == 0. func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: @@ -523,6 +620,8 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) // mode, it returns true if the target has been enabled with // ioctl(FS_IOC_ENABLE_VERITY). func (d *dentry) verityEnabled() bool { + d.hashMu.RLock() + defer d.hashMu.RUnlock() return !d.fs.allowRuntimeEnable || len(d.hash) != 0 } @@ -602,11 +701,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu if err != nil { return linux.Statx{}, err } + fd.d.dirMu.Lock() if fd.d.verityEnabled() { - if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil { + if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil { return linux.Statx{}, err } } + fd.d.dirMu.Unlock() return stat, nil } @@ -642,13 +743,15 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) return offset, nil } -// generateMerkle generates a Merkle tree file for fd. If fd points to a file -// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash -// of the generated Merkle tree and the data size is returned. If fd points to -// a regular file, the data is the content of the file. If fd points to a -// directory, the data is all hahes of its children, written to the Merkle tree -// file. -func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) { +// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a +// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The +// hash of the generated Merkle tree and the data size is returned. If fd +// points to a regular file, the data is the content of the file. If fd points +// to a directory, the data is all hahes of its children, written to the Merkle +// tree file. +// +// Preconditions: fd.d.fs.verityMu must be locked. +func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) { fdReader := vfs.FileReadWriteSeeker{ FD: fd.lowerFD, Ctx: ctx, @@ -664,6 +767,7 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, params := &merkletree.GenerateParams{ TreeReader: &merkleReader, TreeWriter: &merkleWriter, + Children: fd.d.childrenNames, //TODO(b/156980949): Support passing other hash algorithms. HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), } @@ -716,9 +820,48 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, return hash, uint64(params.Size), err } +// recordChildrenLocked writes the names of fd's children into the +// corresponding Merkle tree file, and saves the offset/size of the map into +// xattrs. +// +// Preconditions: +// * fd.d.fs.verityMu must be locked. +// * fd.d.isDir() == true. +func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error { + // Record the children names in the Merkle tree file. + childrenNames, err := json.Marshal(fd.d.childrenNames) + if err != nil { + return err + } + + stat, err := fd.merkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return err + } + + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: childrenOffsetXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return err + } + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: childrenSizeXattr, + Value: strconv.Itoa(len(childrenNames)), + }); err != nil { + return err + } + + if _, err = fd.merkleWriter.Write(ctx, usermem.BytesIOSequence(childrenNames), vfs.WriteOptions{}); err != nil { + return err + } + + return nil +} + // enableVerity enables verity features on fd by generating a Merkle tree file // and stores its hash in its parent directory's Merkle tree. -func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) { +func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { if !fd.d.fs.allowRuntimeEnable { return 0, syserror.EPERM } @@ -734,7 +877,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (ui return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") } - hash, dataSize, err := fd.generateMerkle(ctx) + hash, dataSize, err := fd.generateMerkleLocked(ctx) if err != nil { return 0, err } @@ -761,6 +904,9 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (ui }); err != nil { return 0, err } + + // Add the current child's name to parent's childrenNames. + fd.d.parent.childrenNames[fd.d.name] = struct{}{} } // Record the size of the data being hashed for fd. @@ -770,18 +916,29 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (ui }); err != nil { return 0, err } - fd.d.hash = append(fd.d.hash, hash...) + + if fd.d.isDir() { + if err := fd.recordChildrenLocked(ctx); err != nil { + return 0, err + } + } + fd.d.hashMu.Lock() + fd.d.hash = hash + fd.d.hashMu.Unlock() return 0, nil } // measureVerity returns the hash of fd, saved in verityDigest. -func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) { +func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest usermem.Addr) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { return 0, syserror.EINVAL } var metadata linux.DigestMetadata + fd.d.hashMu.RLock() + defer fd.d.hashMu.RUnlock() + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that // verity is not enabled for the file. If allowRuntimeEnable is false, // this is an integrity violation because all files should have verity @@ -815,14 +972,16 @@ func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, ve return 0, err } -func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) { +func (fd *fileDescription) verityFlags(ctx context.Context, flags usermem.Addr) (uintptr, error) { f := int32(0) + fd.d.hashMu.RLock() // All enabled files should store a hash. This flag is not settable via // FS_IOC_SETFLAGS. if len(fd.d.hash) != 0 { f |= linux.FS_VERITY_FL } + fd.d.hashMu.RUnlock() t := kernel.TaskFromContext(ctx) if t == nil { @@ -836,11 +995,11 @@ func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flag func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Uint(); cmd { case linux.FS_IOC_ENABLE_VERITY: - return fd.enableVerity(ctx, uio) + return fd.enableVerity(ctx) case linux.FS_IOC_MEASURE_VERITY: - return fd.measureVerity(ctx, uio, args[2].Pointer()) + return fd.measureVerity(ctx, args[2].Pointer()) case linux.FS_IOC_GETFLAGS: - return fd.verityFlags(ctx, uio, args[2].Pointer()) + return fd.verityFlags(ctx, args[2].Pointer()) default: // TODO(b/169682228): Investigate which ioctl commands should // be allowed. @@ -901,15 +1060,17 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Ctx: ctx, } + fd.d.hashMu.RLock() n, err := merkletree.Verify(&merkletree.VerifyParams{ - Out: dst.Writer(ctx), - File: &dataReader, - Tree: &merkleReader, - Size: int64(size), - Name: fd.d.name, - Mode: fd.d.mode, - UID: fd.d.uid, - GID: fd.d.gid, + Out: dst.Writer(ctx), + File: &dataReader, + Tree: &merkleReader, + Size: int64(size), + Name: fd.d.name, + Mode: fd.d.mode, + UID: fd.d.uid, + GID: fd.d.gid, + Children: fd.d.childrenNames, //TODO(b/156980949): Support passing other hash algorithms. HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), ReadOffset: offset, @@ -917,6 +1078,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Expected: fd.d.hash, DataAndTreeInSameFile: false, }) + fd.d.hashMu.RUnlock() if err != nil { return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err)) } diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index b2da9dd96..7196e74eb 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -18,6 +18,7 @@ import ( "fmt" "io" "math/rand" + "strconv" "testing" "time" @@ -307,6 +308,56 @@ func TestReopenUnmodifiedFileSucceeds(t *testing.T) { } } +// TestOpenNonexistentFile ensures that opening a nonexistent file does not +// trigger verification failure, even if the parent directory is verified. +func TestOpenNonexistentFile(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t, SHA256) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirms a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Enable verity on the parent directory. + parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Ensure open an unexpected file in the parent directory fails with + // ENOENT rather than verification failure. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename + "abc"), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != syserror.ENOENT { + t.Errorf("OpenAt unexpected error: %v", err) + } +} + // TestPReadModifiedFileFails ensures that read from a modified verity file // fails. func TestPReadModifiedFileFails(t *testing.T) { @@ -439,10 +490,10 @@ func TestModifiedMerkleFails(t *testing.T) { // Flip a random bit in the Merkle tree file. stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{}) if err != nil { - t.Fatalf("stat: %v", err) + t.Errorf("lowerMerkleFD.Stat: %v", err) } - merkleSize := int(stat.Size) - if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil { + + if err := corruptRandomBit(ctx, lowerMerkleFD, int(stat.Size)); err != nil { t.Fatalf("corruptRandomBit: %v", err) } @@ -510,11 +561,17 @@ func TestModifiedParentMerkleFails(t *testing.T) { // This parent directory contains only one child, so any random // modification in the parent Merkle tree should cause verification // failure when opening the child file. - stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{}) + sizeString, err := parentLowerMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: childrenOffsetXattr, + Size: sizeOfStringInt32, + }) + if err != nil { + t.Fatalf("parentLowerMerkleFD.GetXattr: %v", err) + } + parentMerkleSize, err := strconv.Atoi(sizeString) if err != nil { - t.Fatalf("stat: %v", err) + t.Fatalf("Failed convert size to int: %v", err) } - parentMerkleSize := int(stat.Size) if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil { t.Fatalf("corruptRandomBit: %v", err) } @@ -602,124 +659,165 @@ func TestModifiedStatFails(t *testing.T) { } } -// TestOpenDeletedOrRenamedFileFails ensures that opening a deleted/renamed -// verity enabled file or the corresponding Merkle tree file fails with the -// verify error. +// TestOpenDeletedFileFails ensures that opening a deleted verity enabled file +// and/or the corresponding Merkle tree file fails with the verity error. func TestOpenDeletedFileFails(t *testing.T) { testCases := []struct { - // Tests removing files is remove is true. Otherwise tests - // renaming files. - remove bool - // The original file is removed/renamed if changeFile is true. + // The original file is removed if changeFile is true. changeFile bool - // The Merkle tree file is removed/renamed if changeMerkleFile - // is true. + // The Merkle tree file is removed if changeMerkleFile is true. changeMerkleFile bool }{ { - remove: true, changeFile: true, changeMerkleFile: false, }, { - remove: true, changeFile: false, changeMerkleFile: true, }, { - remove: false, changeFile: true, - changeMerkleFile: false, + changeMerkleFile: true, }, + } + for _, tc := range testCases { + t.Run(fmt.Sprintf("changeFile:%t, changeMerkleFile:%t", tc.changeFile, tc.changeMerkleFile), func(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t, SHA256) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD + if tc.changeFile { + if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(filename), + }); err != nil { + t.Fatalf("UnlinkAt: %v", err) + } + } + if tc.changeMerkleFile { + if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(merklePrefix + filename), + }); err != nil { + t.Fatalf("UnlinkAt: %v", err) + } + } + + // Ensure reopening the verity enabled file fails. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != syserror.EIO { + t.Errorf("got OpenAt error: %v, expected EIO", err) + } + }) + } +} + +// TestOpenRenamedFileFails ensures that opening a renamed verity enabled file +// and/or the corresponding Merkle tree file fails with the verity error. +func TestOpenRenamedFileFails(t *testing.T) { + testCases := []struct { + // The original file is renamed if changeFile is true. + changeFile bool + // The Merkle tree file is renamed if changeMerkleFile is true. + changeMerkleFile bool + }{ { - remove: false, changeFile: true, changeMerkleFile: false, }, + { + changeFile: false, + changeMerkleFile: true, + }, + { + changeFile: true, + changeMerkleFile: true, + }, } for _, tc := range testCases { - t.Run(fmt.Sprintf("remove:%t", tc.remove), func(t *testing.T) { - for _, alg := range hashAlgs { - vfsObj, root, ctx, err := newVerityRoot(t, alg) - if err != nil { - t.Fatalf("newVerityRoot: %v", err) - } - - filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) - if err != nil { - t.Fatalf("newFileFD: %v", err) - } + t.Run(fmt.Sprintf("changeFile:%t, changeMerkleFile:%t", tc.changeFile, tc.changeMerkleFile), func(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t, SHA256) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } - // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } - rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD - if tc.remove { - if tc.changeFile { - if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(filename), - }); err != nil { - t.Fatalf("UnlinkAt: %v", err) - } - } - if tc.changeMerkleFile { - if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(merklePrefix + filename), - }); err != nil { - t.Fatalf("UnlinkAt: %v", err) - } - } - } else { - newFilename := "renamed-test-file" - if tc.changeFile { - if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(filename), - }, &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(newFilename), - }, &vfs.RenameOptions{}); err != nil { - t.Fatalf("RenameAt: %v", err) - } - } - if tc.changeMerkleFile { - if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(merklePrefix + filename), - }, &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(merklePrefix + newFilename), - }, &vfs.RenameOptions{}); err != nil { - t.Fatalf("UnlinkAt: %v", err) - } - } - } + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } - // Ensure reopening the verity enabled file fails. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, + rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD + newFilename := "renamed-test-file" + if tc.changeFile { + if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular, - }); err != syserror.EIO { - t.Errorf("got OpenAt error: %v, expected EIO", err) + }, &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(newFilename), + }, &vfs.RenameOptions{}); err != nil { + t.Fatalf("RenameAt: %v", err) } } + if tc.changeMerkleFile { + if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(merklePrefix + filename), + }, &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(merklePrefix + newFilename), + }, &vfs.RenameOptions{}); err != nil { + t.Fatalf("UnlinkAt: %v", err) + } + } + + // Ensure reopening the verity enabled file fails. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != syserror.EIO { + t.Errorf("got OpenAt error: %v, expected EIO", err) + } }) } } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 90dd4a047..0ee60569c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -184,6 +184,7 @@ go_library( "task_exit.go", "task_futex.go", "task_identity.go", + "task_image.go", "task_list.go", "task_log.go", "task_net.go", @@ -224,6 +225,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fspath", + "//pkg/goid", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go index 0ac78c0b8..ec36d1a49 100644 --- a/pkg/sentry/kernel/aio.go +++ b/pkg/sentry/kernel/aio.go @@ -15,10 +15,7 @@ package kernel import ( - "time" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // AIOCallback is an function that does asynchronous I/O on behalf of a task. @@ -26,7 +23,7 @@ type AIOCallback func(context.Context) // QueueAIO queues an AIOCallback which will be run asynchronously. func (t *Task) QueueAIO(cb AIOCallback) { - ctx := taskAsyncContext{t: t} + ctx := t.AsyncContext() wg := &t.TaskSet().aioGoroutines wg.Add(1) go func() { @@ -34,48 +31,3 @@ func (t *Task) QueueAIO(cb AIOCallback) { wg.Done() }() } - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. -func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index bb94769c4..a8596410f 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -15,8 +15,6 @@ package kernel import ( - "time" - "gvisor.dev/gvisor/pkg/context" ) @@ -98,18 +96,3 @@ func TaskFromContext(ctx context.Context) *Task { } return nil } - -// Deadline implements context.Context.Deadline. -func (*Task) Deadline() (time.Time, bool) { - return time.Time{}, false -} - -// Done implements context.Context.Done. -func (*Task) Done() <-chan struct{} { - return nil -} - -// Err implements context.Context.Err. -func (*Task) Err() error { - return nil -} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 9b2be44d4..2cdcdfc1f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -632,7 +632,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { defer k.tasks.mu.RUnlock() for t := range k.tasks.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if mm := t.tc.MemoryManager; mm != nil { + if mm := t.image.MemoryManager; mm != nil { if _, ok := invalidated[mm]; !ok { if err := mm.InvalidateUnsavable(ctx); err != nil { return err @@ -642,7 +642,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } // I really wish we just had a sync.Map of all MMs... if r, ok := t.runState.(*runSyscallAfterExecStop); ok { - if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil { return err } } @@ -1017,7 +1017,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, Features: k.featureSet, } - tc, se := k.LoadTaskImage(ctx, loadArgs) + image, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } @@ -1030,7 +1030,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, config := &TaskConfig{ Kernel: k, ThreadGroup: tg, - TaskContext: tc, + TaskImage: image, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, @@ -1046,7 +1046,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, err } - t.traceExecEvent(tc) // Simulate exec for tracing. + t.traceExecEvent(image) // Simulate exec for tracing. // Success. cu.Release() @@ -1359,6 +1359,13 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { // not have meaningful trace data. Rebuilding here ensures that we can do so // after tracing has been enabled. func (k *Kernel) RebuildTraceContexts() { + // We need to pause all task goroutines because Task.rebuildTraceContext() + // replaces Task.traceContext and Task.traceTask, which are + // task-goroutine-exclusive (i.e. the task goroutine assumes that it can + // access them without synchronization) for performance. + k.Pause() + defer k.Unpause() + k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 387edfa91..60917e7d3 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -106,7 +106,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { data := linux.SeccompData{ Nr: sysno, - Arch: t.tc.st.AuditNumber, + Arch: t.image.st.AuditNumber, InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 90f890495..0b17a562e 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -30,18 +30,18 @@ type syscallTableInfo struct { } // saveSt saves the SyscallTable. -func (tc *TaskContext) saveSt() syscallTableInfo { +func (image *TaskImage) saveSt() syscallTableInfo { return syscallTableInfo{ - OS: tc.st.OS, - Arch: tc.st.Arch, + OS: image.st.OS, + Arch: image.st.Arch, } } // loadSt loads the SyscallTable. -func (tc *TaskContext) loadSt(sti syscallTableInfo) { +func (image *TaskImage) loadSt(sti syscallTableInfo) { st, ok := LookupSyscallTable(sti.OS, sti.Arch) if !ok { panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) } - tc.st = st // Save the table reference. + image.st = st // Save the table reference. } diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index a83ce219c..3fee7aa68 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -75,6 +75,12 @@ func (s *syslog) Log() []byte { "Checking naughty and nice process list...", // Check it up to twice. "Granting licence to kill(2)...", // British spelling for British movie. "Letting the watchdogs out...", + "Conjuring /dev/null black hole...", + "Adversarially training Redcode AI...", + "Singleplexing /dev/ptmx...", + "Recruiting cron-ies...", + "Verifying that no non-zero bytes made their way into /dev/zero...", + "Accelerating teletypewriter to 9600 baud...", } selectMessage := func() string { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 037971393..c0ab53c94 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -29,11 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -63,6 +58,12 @@ import ( type Task struct { taskNode + // goid is the task goroutine's ID. goid is owned by the task goroutine, + // but since it's used to detect cases where non-task goroutines + // incorrectly access state owned by, or exclusive to, the task goroutine, + // goid is always accessed using atomic memory operations. + goid int64 `state:"nosave"` + // runState is what the task goroutine is executing if it is not stopped. // If runState is nil, the task goroutine should exit or has exited. // runState is exclusive to the task goroutine. @@ -83,7 +84,7 @@ type Task struct { // taskWork is exclusive to the task goroutine. taskWork []TaskWorker - // haveSyscallReturn is true if tc.Arch().Return() represents a value + // haveSyscallReturn is true if image.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // // haveSyscallReturn is exclusive to the task goroutine. @@ -257,10 +258,10 @@ type Task struct { // mu protects some of the following fields. mu sync.Mutex `state:"nosave"` - // tc holds task data provided by the ELF loader. + // image holds task data provided by the ELF loader. // - // tc is protected by mu, and is owned by the task goroutine. - tc TaskContext + // image is protected by mu, and is owned by the task goroutine. + image TaskImage // fsContext is the task's filesystem context. // @@ -274,7 +275,7 @@ type Task struct { // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when - // this TaskContext is released. + // this TaskImage is released. // // vforkParent is protected by the TaskSet mutex. vforkParent *Task @@ -641,64 +642,6 @@ func (t *Task) Kernel() *Kernel { return t.k } -// Value implements context.Context.Value. -// -// Preconditions: The caller must be running on the task goroutine (as implied -// by the requirements of context.Context). -func (t *Task) Value(key interface{}) interface{} { - switch key { - case CtxCanTrace: - return t.CanTrace - case CtxKernel: - return t.k - case CtxPIDNamespace: - return t.tg.pidns - case CtxUTSNamespace: - return t.utsns - case CtxIPCNamespace: - ipcns := t.IPCNamespace() - ipcns.IncRef() - return ipcns - case CtxTask: - return t - case auth.CtxCredentials: - return t.Credentials() - case context.CtxThreadGroupID: - return int32(t.ThreadGroup().ID()) - case fs.CtxRoot: - return t.fsContext.RootDirectory() - case vfs.CtxRoot: - return t.fsContext.RootDirectoryVFS2() - case vfs.CtxMountNamespace: - t.mountNamespaceVFS2.IncRef() - return t.mountNamespaceVFS2 - case fs.CtxDirentCacheLimiter: - return t.k.DirentCacheLimiter - case inet.CtxStack: - return t.NetworkContext() - case ktime.CtxRealtimeClock: - return t.k.RealtimeClock() - case limits.CtxLimits: - return t.tg.limits - case pgalloc.CtxMemoryFile: - return t.k.mf - case pgalloc.CtxMemoryFileProvider: - return t.k - case platform.CtxPlatform: - return t.k - case uniqueid.CtxGlobalUniqueID: - return t.k.UniqueID() - case uniqueid.CtxGlobalUniqueIDProvider: - return t.k - case uniqueid.CtxInotifyCookie: - return t.k.GenerateInotifyCookie() - case unimpl.CtxEvents: - return t.k - default: - return nil - } -} - // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. @@ -751,12 +694,12 @@ func (t *Task) IsChrooted() bool { return root != realRoot } -// TaskContext returns t's TaskContext. +// TaskImage returns t's TaskImage. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. -func (t *Task) TaskContext() *TaskContext { - return &t.tc +func (t *Task) TaskImage() *TaskImage { + return &t.image } // FSContext returns t's FSContext. FSContext does not take an additional diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index 5f3e60fe8..e574997f7 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -136,14 +136,14 @@ func (tg *ThreadGroup) IOUsage() *usage.IO { func (t *Task) Name() string { t.mu.Lock() defer t.mu.Unlock() - return t.tc.Name + return t.image.Name } // SetName changes t's name. func (t *Task) SetName(name string) { t.mu.Lock() defer t.mu.Unlock() - t.tc.Name = name + t.image.Name = name t.Debugf("Set thread name to %q", name) } diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 4a4a69ee2..9419f2e95 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -20,6 +20,7 @@ import ( "time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,6 +33,8 @@ import ( // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout // expired, and syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { if !haveTimeout { return timeout, t.block(C, nil) @@ -112,7 +115,14 @@ func (t *Task) Block(C <-chan struct{}) error { // block blocks a task on one of many events. // N.B. defer is too expensive to be used here. +// +// Preconditions: The caller must be running on the task goroutine. func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { + // This function is very hot; skip this check outside of +race builds. + if sync.RaceEnabled { + t.assertTaskGoroutine() + } + // Fast path if the request is already done. select { case <-C: @@ -156,33 +166,39 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { } } -// SleepStart implements amutex.Sleeper.SleepStart. +// SleepStart implements context.ChannelSleeper.SleepStart. func (t *Task) SleepStart() <-chan struct{} { + t.assertTaskGoroutine() t.Deactivate() t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) return t.interruptChan } -// SleepFinish implements amutex.Sleeper.SleepFinish. +// SleepFinish implements context.ChannelSleeper.SleepFinish. func (t *Task) SleepFinish(success bool) { if !success { - // The interrupted notification is consumed only at the top-level - // (Run). Therefore we attempt to reset the pending notification. - // This will also elide our next entry back into the task, so we - // will process signals, state changes, etc. + // Our caller received from t.interruptChan; we need to re-send to it + // to ensure that t.interrupted() is still true. t.interruptSelf() } t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) t.Activate() } -// Interrupted implements amutex.Sleeper.Interrupted +// Interrupted implements context.ChannelSleeper.Interrupted. func (t *Task) Interrupted() bool { - return len(t.interruptChan) != 0 + if t.interrupted() { + return true + } + // Indicate that t's task goroutine is still responsive (i.e. reset the + // watchdog timer). + t.accountTaskGoroutineRunning() + return false } // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. func (t *Task) UninterruptibleSleepStart(deactivate bool) { + t.assertTaskGoroutine() if deactivate { t.Deactivate() } @@ -198,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) { } // interrupted returns true if interrupt or interruptSelf has been called at -// least once since the last call to interrupted. +// least once since the last call to unsetInterrupted. func (t *Task) interrupted() bool { + return len(t.interruptChan) != 0 +} + +// unsetInterrupted causes interrupted to return false until the next call to +// interrupt or interruptSelf. +func (t *Task) unsetInterrupted() { select { case <-t.interruptChan: - return true default: - return false } } @@ -220,9 +240,7 @@ func (t *Task) interrupt() { func (t *Task) interruptSelf() { select { case t.interruptChan <- struct{}{}: - t.Debugf("Interrupt queued") default: - t.Debugf("Dropping duplicate interrupt") } // platform.Context.Interrupt() is unnecessary since a task goroutine // calling interruptSelf() cannot also be blocked in diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 527344162..f305e69c0 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -115,7 +115,7 @@ type CloneOptions struct { ParentTID usermem.Addr // If Vfork is true, place the parent in vforkStop until the cloned task - // releases its TaskContext. + // releases its TaskImage. Vfork bool // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for @@ -226,20 +226,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) } - tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) + image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } cu.Add(func() { - tc.release() + image.release() }) // clone() returns 0 in the child. - tc.Arch.SetReturn(0) + image.Arch.SetReturn(0) if opts.Stack != 0 { - tc.Arch.SetStack(uintptr(opts.Stack)) + image.Arch.SetStack(uintptr(opts.Stack)) } if opts.SetTLS { - if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + if !image.Arch.SetTLS(uintptr(opts.TLS)) { return 0, nil, syserror.EPERM } } @@ -288,7 +288,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { Kernel: t.k, ThreadGroup: tg, SignalMask: t.SignalMask(), - TaskContext: tc, + TaskImage: image, FSContext: fsContext, FDTable: fdTable, Credentials: creds, diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index d1136461a..70b0699dc 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,159 +15,175 @@ package kernel import ( - "fmt" + "time" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel/futex" - "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" ) -var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) - -// Auxmap contains miscellaneous data for the task. -type Auxmap map[string]interface{} - -// TaskContext is the subset of a task's data that is provided by the loader. -// -// +stateify savable -type TaskContext struct { - // Name is the thread name set by the prctl(PR_SET_NAME) system call. - Name string - - // Arch is the architecture-specific context (registers, etc.) - Arch arch.Context - - // MemoryManager is the task's address space. - MemoryManager *mm.MemoryManager +// Deadline implements context.Context.Deadline. +func (t *Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} - // fu implements futexes in the address space. - fu *futex.Manager +// Done implements context.Context.Done. +func (t *Task) Done() <-chan struct{} { + return nil +} - // st is the task's syscall table. - st *SyscallTable `state:".(syscallTableInfo)"` +// Err implements context.Context.Err. +func (t *Task) Err() error { + return nil } -// release releases all resources held by the TaskContext. release is called by -// the task when it execs into a new TaskContext or exits. -func (tc *TaskContext) release() { - // Nil out pointers so that if the task is saved after release, it doesn't - // follow the pointers to possibly now-invalid objects. - if tc.MemoryManager != nil { - tc.MemoryManager.DecUsers(context.Background()) - tc.MemoryManager = nil +// Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Value(key interface{}) interface{} { + // This function is very hot; skip this check outside of +race builds. + if sync.RaceEnabled { + t.assertTaskGoroutine() } - tc.fu = nil + return t.contextValue(key, true /* isTaskGoroutine */) } -// Fork returns a duplicate of tc. The copied TaskContext always has an -// independent arch.Context. If shareAddressSpace is true, the copied -// TaskContext shares an address space with the original; otherwise, the copied -// TaskContext has an independent address space that is initially a duplicate -// of the original's. -func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { - newTC := &TaskContext{ - Name: tc.Name, - Arch: tc.Arch.Fork(), - st: tc.st, - } - if shareAddressSpace { - newTC.MemoryManager = tc.MemoryManager - if newTC.MemoryManager != nil { - if !newTC.MemoryManager.IncUsers() { - // Shouldn't be possible since tc.MemoryManager should be a - // counted user. - panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) - } +func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + return t.utsns + case CtxIPCNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + ipcns := t.ipcns + ipcns.IncRef() + return ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds.Load() + case context.CtxThreadGroupID: + return int32(t.tg.ID()) + case fs.CtxRoot: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() + } + return t.fsContext.RootDirectory() + case vfs.CtxRoot: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() } - newTC.fu = tc.fu - } else { - newMM, err := tc.MemoryManager.Fork(ctx) - if err != nil { - return nil, err + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + if !isTaskGoroutine { + t.mu.Lock() + defer t.mu.Unlock() } - newTC.MemoryManager = newMM - newTC.fu = k.futexes.Fork() + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter + case inet.CtxStack: + return t.NetworkContext() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k + default: + return nil } - return newTC, nil } -// Arch returns t's arch.Context. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Arch() arch.Context { - return t.tc.Arch +// taskAsyncContext implements context.Context for a goroutine that performs +// work on behalf of a Task, but is not the task goroutine. +type taskAsyncContext struct { + context.NoopSleeper + + t *Task } -// MemoryManager returns t's MemoryManager. MemoryManager does not take an -// additional reference on the returned MM. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) MemoryManager() *mm.MemoryManager { - return t.tc.MemoryManager +// AsyncContext returns a context.Context representing t. The returned +// context.Context is intended for use by goroutines other than t's task +// goroutine; for example, signal delivery to t will not interrupt goroutines +// that are blocking using the returned context.Context. +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} } -// SyscallTable returns t's syscall table. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) SyscallTable() *SyscallTable { - return t.tc.st +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) } -// Stack returns the userspace stack. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Stack() *arch.Stack { - return &arch.Stack{ - Arch: t.Arch(), - IO: t.MemoryManager(), - Bottom: usermem.Addr(t.Arch().Stack()), - } +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) } -// LoadTaskImage loads a specified file into a new TaskContext. -// -// args.MemoryManager does not need to be set by the caller. -func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { - // If File is not nil, we should load that instead of resolving Filename. - if args.File != nil { - args.Filename = args.File.PathnameWithDeleted(ctx) - } +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} - // Prepare a new user address space to load into. - m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - defer m.DecUsers(ctx) - args.MemoryManager = m +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return time.Time{}, false +} - os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) - if err != nil { - return nil, err - } +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return nil +} - // Lookup our new syscall table. - st, ok := LookupSyscallTable(os, ac.Arch()) - if !ok { - // No syscall table found. This means that the ELF binary does not match - // the architecture. - return nil, errNoSyscalls - } +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return nil +} - if !m.IncUsers() { - panic("Failed to increment users count on new MM") - } - return &TaskContext{ - Name: name, - Arch: ac, - MemoryManager: m, - fu: k.futexes.Fork(), - st: st, - }, nil +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.contextValue(key, false /* isTaskGoroutine */) } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 412d471d3..d9897e802 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -83,11 +83,12 @@ type execStop struct{} func (*execStop) Killable() bool { return true } // Execve implements the execve(2) syscall by killing all other tasks in its -// thread group and switching to newTC. Execve always takes ownership of newTC. +// thread group and switching to newImage. Execve always takes ownership of +// newImage. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. -func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { +func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.tg.signalHandlers.mu.Lock() @@ -96,7 +97,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { if t.tg.exiting || t.tg.execing != nil { // We lost to a racing group-exit, kill, or exec from another thread // and should just exit. - newTC.release() + newImage.release() return nil, syserror.EINTR } @@ -118,7 +119,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { t.beginInternalStopLocked((*execStop)(nil)) } - return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil + return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil } // The runSyscallAfterExecStop state continues execve(2) after all siblings of @@ -126,16 +127,16 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { // // +stateify savable type runSyscallAfterExecStop struct { - tc *TaskContext + image *TaskImage } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { - t.traceExecEvent(r.tc) + t.traceExecEvent(r.image) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { t.tg.pidns.owner.mu.Unlock() - r.tc.release() + r.image.release() return (*runInterrupt)(nil) } // We are the thread group leader now. Save our old thread ID for @@ -214,7 +215,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. // See fs/exec.c:setup_new_exec. - r.tc.MemoryManager.SetDumpability(mm.UserDumpable) + r.image.MemoryManager.SetDumpability(mm.UserDumpable) // Switch to the new process. t.MemoryManager().Deactivate() @@ -222,8 +223,8 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // Update credentials to reflect the execve. This should precede switching // MMs to ensure that dumpability has been reset first, if needed. t.updateCredsForExecLocked() - t.tc.release() - t.tc = *r.tc + t.image.release() + t.image = *r.image t.mu.Unlock() t.unstopVforkParent() t.p.FullStateChanged() diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index ce7b9641d..c5137c282 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -266,7 +266,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() t.mu.Lock() - t.tc.release() + t.image.release() t.mu.Unlock() // Releasing the MM unblocks a blocked CLONE_VFORK parent. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index c80391475..195c7da9b 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -26,7 +26,7 @@ import ( // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Futex() *futex.Manager { - return t.tc.fu + return t.image.fu } // SwapUint32 implements futex.Target.SwapUint32. diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go new file mode 100644 index 000000000..ce5fbd299 --- /dev/null +++ b/pkg/sentry/kernel/task_image.go @@ -0,0 +1,173 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" +) + +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskImage is the subset of a task's data that is provided by the loader. +// +// +stateify savable +type TaskImage struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable `state:".(syscallTableInfo)"` +} + +// release releases all resources held by the TaskImage. release is called by +// the task when it execs into a new TaskImage or exits. +func (image *TaskImage) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if image.MemoryManager != nil { + image.MemoryManager.DecUsers(context.Background()) + image.MemoryManager = nil + } + image.fu = nil +} + +// Fork returns a duplicate of image. The copied TaskImage always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskImage shares an address space with the original; otherwise, the copied +// TaskImage has an independent address space that is initially a duplicate +// of the original's. +func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) { + newImage := &TaskImage{ + Name: image.Name, + Arch: image.Arch.Fork(), + st: image.st, + } + if shareAddressSpace { + newImage.MemoryManager = image.MemoryManager + if newImage.MemoryManager != nil { + if !newImage.MemoryManager.IncUsers() { + // Shouldn't be possible since image.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager")) + } + } + newImage.fu = image.fu + } else { + newMM, err := image.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newImage.MemoryManager = newMM + newImage.fu = k.futexes.Fork() + } + return newImage, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.image.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.image.MemoryManager +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) SyscallTable() *SyscallTable { + return t.image.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Stack() *arch.Stack { + return &arch.Stack{ + Arch: t.Arch(), + IO: t.MemoryManager(), + Bottom: usermem.Addr(t.Arch().Stack()), + } +} + +// LoadTaskImage loads a specified file into a new TaskImage. +// +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. + if args.File != nil { + args.Filename = args.File.PathnameWithDeleted(ctx) + } + + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + defer m.DecUsers(ctx) + args.MemoryManager = m + + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. This means that the ELF binary does not match + // the architecture. + return nil, errNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskImage{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: k.futexes.Fork(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index d23cea802..c70e5e6ce 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -19,6 +19,7 @@ import ( "runtime/trace" "sort" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -215,7 +216,7 @@ func (t *Task) rebuildTraceContext(tid ThreadID) { // arbitrarily large (in general it won't be, especially for cases // where we're collecting a brief profile), so using the TID is a // reasonable compromise in this case. - t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) + t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid)) } // traceCloneEvent is called when a new task is spawned. @@ -237,11 +238,11 @@ func (t *Task) traceExitEvent() { } // traceExecEvent is called when a task calls exec. -func (t *Task) traceExecEvent(tc *TaskContext) { +func (t *Task) traceExecEvent(image *TaskImage) { if !trace.IsEnabled() { return } - file := tc.MemoryManager.Executable() + file := image.MemoryManager.Executable() if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 8dc3fec90..3ccecf4b6 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -16,11 +16,13 @@ package kernel import ( "bytes" + "fmt" "runtime" "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -57,6 +59,8 @@ type taskRunState interface { // make it visible in stack dumps. A goroutine for a given task can be identified // searching for Task.run()'s argument value. func (t *Task) run(threadID uintptr) { + atomic.StoreInt64(&t.goid, goid.Get()) + // Construct t.blockingTimer here. We do this here because we can't // reconstruct t.blockingTimer during restore in Task.afterLoad(), because // kernel.timekeeper.SetClocks() hasn't been called yet. @@ -99,6 +103,9 @@ func (t *Task) run(threadID uintptr) { t.tg.pidns.owner.runningGoroutines.Done() t.p.Release() + // Deferring this store triggers a false positive in the race + // detector (https://github.com/golang/go/issues/42599). + atomic.StoreInt64(&t.goid, 0) // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) return @@ -317,7 +324,7 @@ func (app *runApp) execute(t *Task) taskRunState { // region. We should be able to easily identify // vsyscalls by having a <fault><syscall> pair. if at.Execute { - if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + if sysno, ok := t.image.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) } } @@ -375,6 +382,19 @@ func (app *runApp) execute(t *Task) taskRunState { } } +// assertTaskGoroutine panics if the caller is not running on t's task +// goroutine. +func (t *Task) assertTaskGoroutine() { + if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want { + panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want)) + } +} + +// GoroutineID returns the ID of t's task goroutine. +func (t *Task) GoroutineID() int64 { + return atomic.LoadInt64(&t.goid) +} + // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. func (t *Task) waitGoroutineStoppedOrExited() { t.goroutineStopped.Wait() diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 52c55d13d..9ba5f8d78 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { t.goschedSeq.EndWrite() } +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineRunning() { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.goschedSeq.EndWrite() +} + // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. // Most clients should use t.CPUStats() instead. func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index ebdb83061..42dd3e278 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { return } }) - // We have to re-issue the interrupt consumed by t.interrupted() since - // it might have been for a different reason. - t.interruptSelf() } // Conversely, if the new mask unblocks any signals that were blocked by @@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { - // Interrupts are de-duplicated (if t is interrupted twice before - // t.interrupted() is called, t.interrupted() will only return true once), - // so early exits from this function must re-enter the runInterrupt state - // to check for more interrupt-signaled conditions. + // Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect + // of all previous calls to t.interrupted() regardless of how many such + // calls there have been), so early exits from this function must re-enter + // the runInterrupt state to check for more interrupt-signaled conditions. t.tg.signalHandlers.mu.Lock() @@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { return t.deliverSignal(info, act) } + t.unsetInterrupted() t.tg.signalHandlers.mu.Unlock() return (*runApp)(nil) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8e28230cc..36e1384f1 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -46,10 +46,10 @@ type TaskConfig struct { // SignalMask is the new task's initial signal mask. SignalMask linux.SignalSet - // TaskContext is the TaskContext of the new task. Ownership of the - // TaskContext is transferred to TaskSet.NewTask, whether or not it + // TaskImage is the TaskImage of the new task. Ownership of the + // TaskImage is transferred to TaskSet.NewTask, whether or not it // succeeds. - TaskContext *TaskContext + TaskImage *TaskImage // FSContext is the FSContext of the new task. A reference must be held on // FSContext, which is transferred to TaskSet.NewTask whether or not it @@ -105,7 +105,7 @@ type TaskConfig struct { func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { - cfg.TaskContext.release() + cfg.TaskImage.release() cfg.FSContext.DecRef(ctx) cfg.FDTable.DecRef(ctx) cfg.IPCNamespace.DecRef(ctx) @@ -121,7 +121,7 @@ func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) // of cfg if it succeeds. func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { tg := cfg.ThreadGroup - tc := cfg.TaskContext + image := cfg.TaskImage t := &Task{ taskNode: taskNode{ tg: tg, @@ -132,7 +132,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { interruptChan: make(chan struct{}, 1), signalMask: cfg.SignalMask, signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, + image: *image, fsContext: cfg.FSContext, fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index fd92c3873..3f5be276b 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -263,13 +263,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return usermem.NoAccess, platform.ErrContextInterrupt case ring0.El0SyncUndef: return c.fault(int32(syscall.SIGILL), info) - case ring0.El1SyncUndef: - *info = arch.SignalInfo{ - Signo: int32(syscall.SIGILL), - Code: 1, // ILL_ILLOPC (illegal opcode). - } - info.SetAddr(switchOpts.Registers.Pc) // Include address. - return usermem.AccessType{}, platform.ErrContextSignal default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go index 327d48465..c51df2811 100644 --- a/pkg/sentry/platform/ring0/aarch64.go +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -90,6 +90,7 @@ const ( El0SyncIa El0SyncFpsimdAcc El0SyncSveAcc + El0SyncFpsimdExc El0SyncSys El0SyncSpPc El0SyncUndef diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index f489ad352..5f4b2105a 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -324,6 +324,18 @@ MOVD CPU_TTBR0_KVM(from), RSV_REG; \ MSR RSV_REG, TTBR0_EL1; +TEXT ·EnableVFP(SB),NOSPLIT,$0 + MOVD $FPEN_ENABLE, R0 + WORD $0xd5181040 //MSR R0, CPACR_EL1 + ISB $15 + RET + +TEXT ·DisableVFP(SB),NOSPLIT,$0 + MOVD $0, R0 + WORD $0xd5181040 //MSR R0, CPACR_EL1 + ISB $15 + RET + #define VFP_ENABLE \ MOVD $FPEN_ENABLE, R0; \ WORD $0xd5181040; \ //MSR R0, CPACR_EL1 @@ -370,12 +382,12 @@ MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. -// EXCEPTION_WITH_ERROR is a common exception handler function. -#define EXCEPTION_WITH_ERROR(user, vector) \ +// EXCEPTION_EL0 is a common el0 exception handler function. +#define EXCEPTION_EL0(vector) \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 WORD $0xd538601a; \ //MRS FAR_EL1, R26 MOVD R26, CPU_FAULT_ADDR(RSV_REG); \ - MOVD $user, R3; \ + MOVD $1, R3; \ MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user. MOVD $vector, R3; \ MOVD R3, CPU_VECTOR_CODE(RSV_REG); \ @@ -383,6 +395,12 @@ MOVD R3, CPU_ERROR_CODE(RSV_REG); \ B ·kernelExitToEl1(SB); +// EXCEPTION_EL1 is a common el1 exception handler function. +#define EXCEPTION_EL1(vector) \ + MOVD $vector, R3; \ + MOVD R3, 8(RSP); \ + B ·HaltEl1ExceptionAndResume(SB); + // storeAppASID writes the application's asid value. TEXT ·storeAppASID(SB),NOSPLIT,$0-8 MOVD asid+0(FP), R1 @@ -430,6 +448,16 @@ TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 CALL ·kernelSyscall(SB) // Call the trampoline. B ·kernelExitToEl1(SB) // Resume. +// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. +TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0-8 + WORD $0xd538d092 // MRS TPIDR_EL1, R18 + MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. + MOVD R3, 8(RSP) // First argument (vCPU). + MOVD vector+0(FP), R3 + MOVD R3, 16(RSP) // Second argument (vector). + CALL ·kernelException(SB) // Call the trampoline. + B ·kernelExitToEl1(SB) // Resume. + // Shutdown stops the guest. TEXT ·Shutdown(SB),NOSPLIT,$0 // PSCI EVENT. @@ -592,39 +620,22 @@ TEXT ·El1_sync(SB),NOSPLIT,$0 B el1_invalid el1_da: + EXCEPTION_EL1(El1SyncDa) el1_ia: - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - WORD $0xd538601a //MRS FAR_EL1, R26 - - MOVD R26, CPU_FAULT_ADDR(RSV_REG) - - MOVD $0, CPU_ERROR_TYPE(RSV_REG) - - MOVD $PageFault, R3 - MOVD R3, CPU_VECTOR_CODE(RSV_REG) - - B ·HaltAndResume(SB) - + EXCEPTION_EL1(El1SyncIa) el1_sp_pc: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncSpPc) el1_undef: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncUndef) el1_svc: - MOVD $0, CPU_ERROR_CODE(RSV_REG) - MOVD $0, CPU_ERROR_TYPE(RSV_REG) B ·HaltEl1SvcAndResume(SB) - el1_dbg: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncDbg) el1_fpsimd_acc: VFP_ENABLE B ·kernelExitToEl1(SB) // Resume. - el1_invalid: - B ·Shutdown(SB) + EXCEPTION_EL1(El1SyncInv) // El1_irq is the handler for El1_irq. TEXT ·El1_irq(SB),NOSPLIT,$0 @@ -680,28 +691,21 @@ el0_svc: el0_da: el0_ia: - EXCEPTION_WITH_ERROR(1, PageFault) - + EXCEPTION_EL0(PageFault) el0_fpsimd_acc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncFpsimdAcc) el0_sve_acc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncSveAcc) el0_fpsimd_exc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncFpsimdExc) el0_sp_pc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncSpPc) el0_undef: - EXCEPTION_WITH_ERROR(1, El0SyncUndef) - + EXCEPTION_EL0(El0SyncUndef) el0_dbg: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncDbg) el0_invalid: - B ·Shutdown(SB) + EXCEPTION_EL0(El0SyncInv) TEXT ·El0_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index 6cbbf001f..90a7b8392 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -24,6 +24,10 @@ func HaltAndResume() //go:nosplit func HaltEl1SvcAndResume() +// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. +//go:nosplit +func HaltEl1ExceptionAndResume() + // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { } @@ -61,11 +65,13 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Pstate &= ^uint64(PsrFlagsClear) regs.Pstate |= UserFlagsSet + EnableVFP() LoadFloatingPoint(switchOpts.FloatingPointState) kernelExitToEl0() SaveFloatingPoint(switchOpts.FloatingPointState) + DisableVFP() vector = c.vecCode diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index d91a09de1..0dffd33a3 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -53,6 +53,12 @@ func LoadFloatingPoint(*byte) // SaveFloatingPoint saves floating point state. func SaveFloatingPoint(*byte) +// EnableVFP enables fpsimd. +func EnableVFP() + +// DisableVFP disables fpsimd. +func DisableVFP() + // Init sets function pointers based on architectural features. // // This must be called prior to using ring0. diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index 19c1fca8b..6f4923539 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -35,62 +35,47 @@ TEXT ·CPACREL1(SB),NOSPLIT,$0-8 RET TEXT ·GetFPCR(SB),NOSPLIT,$0-8 - WORD $0xd53b4201 // MRS NZCV, R1 + MOVD FPCR, R1 MOVD R1, ret+0(FP) RET TEXT ·GetFPSR(SB),NOSPLIT,$0-8 - WORD $0xd53b4421 // MRS FPSR, R1 + MOVD FPSR, R1 MOVD R1, ret+0(FP) RET TEXT ·SetFPCR(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R1 - WORD $0xd51b4201 // MSR R1, NZCV + MOVD R1, FPCR RET TEXT ·SetFPSR(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R1 - WORD $0xd51b4421 // MSR R1, FPSR + MOVD R1, FPSR RET TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R0 // Skip aarch64_ctx, fpsr, fpcr. - FMOVD F0, 16*1(R0) - FMOVD F1, 16*2(R0) - FMOVD F2, 16*3(R0) - FMOVD F3, 16*4(R0) - FMOVD F4, 16*5(R0) - FMOVD F5, 16*6(R0) - FMOVD F6, 16*7(R0) - FMOVD F7, 16*8(R0) - FMOVD F8, 16*9(R0) - FMOVD F9, 16*10(R0) - FMOVD F10, 16*11(R0) - FMOVD F11, 16*12(R0) - FMOVD F12, 16*13(R0) - FMOVD F13, 16*14(R0) - FMOVD F14, 16*15(R0) - FMOVD F15, 16*16(R0) - FMOVD F16, 16*17(R0) - FMOVD F17, 16*18(R0) - FMOVD F18, 16*19(R0) - FMOVD F19, 16*20(R0) - FMOVD F20, 16*21(R0) - FMOVD F21, 16*22(R0) - FMOVD F22, 16*23(R0) - FMOVD F23, 16*24(R0) - FMOVD F24, 16*25(R0) - FMOVD F25, 16*26(R0) - FMOVD F26, 16*27(R0) - FMOVD F27, 16*28(R0) - FMOVD F28, 16*29(R0) - FMOVD F29, 16*30(R0) - FMOVD F30, 16*31(R0) - FMOVD F31, 16*32(R0) - ISB $15 + ADD $16, R0, R0 + + WORD $0xad000400 // stp q0, q1, [x0] + WORD $0xad010c02 // stp q2, q3, [x0, #32] + WORD $0xad021404 // stp q4, q5, [x0, #64] + WORD $0xad031c06 // stp q6, q7, [x0, #96] + WORD $0xad042408 // stp q8, q9, [x0, #128] + WORD $0xad052c0a // stp q10, q11, [x0, #160] + WORD $0xad06340c // stp q12, q13, [x0, #192] + WORD $0xad073c0e // stp q14, q15, [x0, #224] + WORD $0xad084410 // stp q16, q17, [x0, #256] + WORD $0xad094c12 // stp q18, q19, [x0, #288] + WORD $0xad0a5414 // stp q20, q21, [x0, #320] + WORD $0xad0b5c16 // stp q22, q23, [x0, #352] + WORD $0xad0c6418 // stp q24, q25, [x0, #384] + WORD $0xad0d6c1a // stp q26, q27, [x0, #416] + WORD $0xad0e741c // stp q28, q29, [x0, #448] + WORD $0xad0f7c1e // stp q30, q31, [x0, #480] RET @@ -98,39 +83,24 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R0 // Skip aarch64_ctx, fpsr, fpcr. - FMOVD 16*1(R0), F0 - FMOVD 16*2(R0), F1 - FMOVD 16*3(R0), F2 - FMOVD 16*4(R0), F3 - FMOVD 16*5(R0), F4 - FMOVD 16*6(R0), F5 - FMOVD 16*7(R0), F6 - FMOVD 16*8(R0), F7 - FMOVD 16*9(R0), F8 - FMOVD 16*10(R0), F9 - FMOVD 16*11(R0), F10 - FMOVD 16*12(R0), F11 - FMOVD 16*13(R0), F12 - FMOVD 16*14(R0), F13 - FMOVD 16*15(R0), F14 - FMOVD 16*16(R0), F15 - FMOVD 16*17(R0), F16 - FMOVD 16*18(R0), F17 - FMOVD 16*19(R0), F18 - FMOVD 16*20(R0), F19 - FMOVD 16*21(R0), F20 - FMOVD 16*22(R0), F21 - FMOVD 16*23(R0), F22 - FMOVD 16*24(R0), F23 - FMOVD 16*25(R0), F24 - FMOVD 16*26(R0), F25 - FMOVD 16*27(R0), F26 - FMOVD 16*28(R0), F27 - FMOVD 16*29(R0), F28 - FMOVD 16*30(R0), F29 - FMOVD 16*31(R0), F30 - FMOVD 16*32(R0), F31 - ISB $15 + ADD $16, R0, R0 + + WORD $0xad400400 // ldp q0, q1, [x0] + WORD $0xad410c02 // ldp q2, q3, [x0, #32] + WORD $0xad421404 // ldp q4, q5, [x0, #64] + WORD $0xad431c06 // ldp q6, q7, [x0, #96] + WORD $0xad442408 // ldp q8, q9, [x0, #128] + WORD $0xad452c0a // ldp q10, q11, [x0, #160] + WORD $0xad46340c // ldp q12, q13, [x0, #192] + WORD $0xad473c0e // ldp q14, q15, [x0, #224] + WORD $0xad484410 // ldp q16, q17, [x0, #256] + WORD $0xad494c12 // ldp q18, q19, [x0, #288] + WORD $0xad4a5414 // ldp q20, q21, [x0, #320] + WORD $0xad4b5c16 // ldp q22, q23, [x0, #352] + WORD $0xad4c6418 // ldp q24, q25, [x0, #384] + WORD $0xad4d6c1a // ldp q26, q27, [x0, #416] + WORD $0xad4e741c // ldp q28, q29, [x0, #448] + WORD $0xad4f7c1e // ldp q30, q31, [x0, #480] RET diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index 53bc3353c..b5652deb9 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -70,6 +70,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa) fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc) fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc) + fmt.Fprintf(w, "#define El0SyncFpsimdExc 0x%02x\n", El0SyncFpsimdExc) fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys) fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc) fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef) diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 3baad098b..057f4d294 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -120,9 +120,6 @@ type socketOpsCommon struct { // fixed buffer but only consume this many bytes. sendBufferSize uint32 - // passcred indicates if this socket wants SCM credentials. - passcred bool - // filter indicates that this socket has a BPF filter "installed". // // TODO(gvisor.dev/issue/1119): We don't actually support filtering, @@ -201,10 +198,7 @@ func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { // Passcred implements transport.Credentialer.Passcred. func (s *socketOpsCommon) Passcred() bool { - s.mu.Lock() - passcred := s.passcred - s.mu.Unlock() - return passcred + return s.ep.SocketOptions().GetPassCred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. @@ -419,9 +413,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] } passcred := usermem.ByteOrder.Uint32(opt) - s.mu.Lock() - s.passcred = passcred != 0 - s.mu.Unlock() + s.ep.SocketOptions().SetPassCred(passcred != 0) return nil case linux.SO_ATTACH_FILTER: diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 86c634715..9c927efa0 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -260,8 +260,13 @@ type commonEndpoint interface { // transport.Endpoint.GetSockOpt. GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) - // LastError implements tcpip.Endpoint.LastError. + // LastError implements tcpip.Endpoint.LastError and + // transport.Endpoint.LastError. LastError() *tcpip.Error + + // SocketOptions implements tcpip.Endpoint.SocketOptions and + // transport.Endpoint.SocketOptions. + SocketOptions() *tcpip.SocketOptions } // LINT.IfChange @@ -1065,13 +1070,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.PasscredOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) + return &v, nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { @@ -1163,13 +1163,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.BroadcastOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast())) + return &v, nil case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { @@ -1916,7 +1911,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.BroadcastOption, v != 0)) + ep.SocketOptions().SetBroadcast(v != 0) + return nil case linux.SO_PASSCRED: if len(optVal) < sizeOfInt32 { @@ -1924,7 +1920,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0)) + ep.SocketOptions().SetPassCred(v != 0) + return nil case linux.SO_KEEPALIVE: if len(optVal) < sizeOfInt32 { @@ -2687,7 +2684,7 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ // Always do at least one fetchReadView, even if the number of bytes to // read is 0. err = s.fetchReadView() - if err != nil { + if err != nil || len(s.readView) == 0 { break } if dst.NumBytes() == 0 { @@ -2710,15 +2707,20 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ } copied += n s.readView.TrimFront(n) - if len(s.readView) == 0 { - atomic.StoreUint32(&s.readViewHasData, 0) - } dst = dst.DropFirst(n) if e != nil { err = syserr.FromError(e) break } + // If we are done reading requested data then stop. + if dst.NumBytes() == 0 { + break + } + } + + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) } // If we managed to copy something, we must deliver it. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index b648273a4..0324dcd93 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -16,8 +16,6 @@ package transport import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -203,8 +201,12 @@ type Endpoint interface { // procfs. State() uint32 - // LastError implements tcpip.Endpoint.LastError. + // LastError clears and returns the last error reported by the endpoint. LastError() *tcpip.Error + + // SocketOptions returns the structure which contains all the socket + // level options. + SocketOptions() *tcpip.SocketOptions } // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket @@ -737,10 +739,6 @@ func (e *connectedEndpoint) CloseUnread() { type baseEndpoint struct { *waiter.Queue - // passcred specifies whether SCM_CREDENTIALS socket control messages are - // enabled on this endpoint. Must be accessed atomically. - passcred int32 - // Mutex protects the below fields. sync.Mutex `state:"nosave"` @@ -757,6 +755,8 @@ type baseEndpoint struct { // linger is used for SO_LINGER socket option. linger tcpip.LingerOption + + ops tcpip.SocketOptions } // EventRegister implements waiter.Waitable.EventRegister. @@ -781,7 +781,7 @@ func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { // Passcred implements Credentialer.Passcred. func (e *baseEndpoint) Passcred() bool { - return atomic.LoadInt32(&e.passcred) != 0 + return e.SocketOptions().GetPassCred() } // ConnectedPasscred implements Credentialer.ConnectedPasscred. @@ -791,14 +791,6 @@ func (e *baseEndpoint) ConnectedPasscred() bool { return e.connected != nil && e.connected.Passcred() } -func (e *baseEndpoint) setPasscred(pc bool) { - if pc { - atomic.StoreInt32(&e.passcred, 1) - } else { - atomic.StoreInt32(&e.passcred, 0) - } -} - // Connected implements ConnectingEndpoint.Connected. func (e *baseEndpoint) Connected() bool { return e.receiver != nil && e.connected != nil @@ -865,9 +857,6 @@ func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { switch opt { - case tcpip.BroadcastOption: - case tcpip.PasscredOption: - e.setPasscred(v) case tcpip.ReuseAddressOption: default: log.Warningf("Unsupported socket option: %d", opt) @@ -890,9 +879,6 @@ func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: return false, nil - case tcpip.PasscredOption: - return e.Passcred(), nil - default: log.Warningf("Unsupported socket option: %d", opt) return false, tcpip.ErrUnknownProtocolOption @@ -980,6 +966,11 @@ func (*baseEndpoint) LastError() *tcpip.Error { return nil } +// SocketOptions implements Endpoint.SocketOptions. +func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions { + return &e.ops +} + // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 309c183a3..88cd234d1 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,6 +90,9 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.FileSize: {}, limits.MemoryLocked: {}, limits.Stack: {}, + // RSS can be set, but it's not enforced because Linux doesn't enforce it + // either: "This limit has effect only in Linux 2.4.x, x < 30" + limits.Rss: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. limits.Core: {}, diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 39ca9ea97..983f8d396 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -159,7 +159,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user defer wd.DecRef(t) } - // Load the new TaskContext. + // Load the new TaskImage. remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), @@ -173,12 +173,12 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user Features: t.Arch().FeatureSet(), } - tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + image, se := t.Kernel().LoadTaskImage(t, loadArgs) if se != nil { return 0, nil, se.ToError() } - ctrl, err := t.Execve(tc) + ctrl, err := t.Execve(image) return 0, ctrl, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index c8ce2aabc..7a409620d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -109,7 +109,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user executable = fsbridge.NewVFSFile(file) } - // Load the new TaskContext. + // Load the new TaskImage. mntns := t.MountNamespaceVFS2() wd := t.FSContext().WorkingDirectoryVFS2() defer wd.DecRef(t) @@ -126,11 +126,11 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user Features: t.Arch().FeatureSet(), } - tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + image, se := t.Kernel().LoadTaskImage(t, loadArgs) if se != nil { return 0, nil, se.ToError() } - ctrl, err := t.Execve(tc) + ctrl, err := t.Execve(image) return 0, ctrl, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 9ce4f280a..8bb763a47 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -343,8 +343,8 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy data. var ( - n int64 - err error + total int64 + err error ) dw := dualWaiter{ inFile: inFile, @@ -357,13 +357,20 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // can block. nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 if outIsPipe { - for n < count { - var spliceN int64 - spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count) + for { + var n int64 + n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total) if offset != -1 { - offset += spliceN + offset += n + } + total += n + if total == count { + break + } + if err == nil && t.Interrupted() { + err = syserror.ErrInterrupted + break } - n += spliceN if err == syserror.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } @@ -374,7 +381,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } else { // Read inFile to buffer, then write the contents to outFile. buf := make([]byte, count) - for n < count { + for { var readN int64 if offset != -1 { readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) @@ -382,7 +389,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } else { readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } - n += readN // Write all of the bytes that we read. This may need // multiple write calls to complete. @@ -398,7 +404,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We didn't complete the write. Only report the bytes that were actually // written, and rewind offsets as needed. notWritten := int64(len(wbuf)) - n -= notWritten + readN -= notWritten if offset == -1 { // We modified the offset of the input file itself during the read // operation. Rewind it. @@ -415,6 +421,16 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc break } } + + total += readN + buf = buf[readN:] + if total == count { + break + } + if err == nil && t.Interrupted() { + err = syserror.ErrInterrupted + break + } if err == syserror.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } @@ -432,7 +448,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } } - if n != 0 { + if total != 0 { inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) @@ -445,7 +461,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) + return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) } // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index bbafb8b7f..1d1062aeb 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -336,11 +336,9 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders))) for t, o := range offenders { tid := w.k.TaskSet().Root.IDOfTask(t) - buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime))) + buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime))) } - buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine") - // Force stack dump only if a new task is detected. w.doAction(w.TaskTimeoutAction, newTaskFound, &buf) } |