diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/abi/linux/futex.go | 18 | ||||
-rw-r--r-- | pkg/abi/linux/linux_abi_autogen_unsafe.go | 112 | ||||
-rw-r--r-- | pkg/sentry/kernel/futex/futex.go | 8 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel_state_autogen.go | 7 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 4 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exec.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exit.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_futex.go | 125 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 4 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_futex.go | 48 |
10 files changed, 316 insertions, 16 deletions
diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index 08bfde3b5..8138088a6 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -60,3 +60,21 @@ const ( FUTEX_WAITERS = 0x80000000 FUTEX_OWNER_DIED = 0x40000000 ) + +// FUTEX_BITSET_MATCH_ANY has all bits set. +const FUTEX_BITSET_MATCH_ANY = 0xffffffff + +// ROBUST_LIST_LIMIT protects against a deliberately circular list. +const ROBUST_LIST_LIMIT = 2048 + +// RobustListHead corresponds to Linux's struct robust_list_head. +// +// +marshal +type RobustListHead struct { + List uint64 + FutexOffset uint64 + ListOpPending uint64 +} + +// SizeOfRobustListHead is the size of a RobustListHead struct. +var SizeOfRobustListHead = (*RobustListHead)(nil).SizeBytes() diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go index 10c804128..6ea96906a 100644 --- a/pkg/abi/linux/linux_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go @@ -23,6 +23,7 @@ var _ marshal.Marshallable = (*InetAddr)(nil) var _ marshal.Marshallable = (*Linger)(nil) var _ marshal.Marshallable = (*NumaPolicy)(nil) var _ marshal.Marshallable = (*RSeqCriticalSection)(nil) +var _ marshal.Marshallable = (*RobustListHead)(nil) var _ marshal.Marshallable = (*SignalSet)(nil) var _ marshal.Marshallable = (*Statfs)(nil) var _ marshal.Marshallable = (*Statx)(nil) @@ -139,7 +140,7 @@ func (s *Statx) Packed() bool { // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Statx) MarshalUnsafe(dst []byte) { - if s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { + if s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(s)) } else { s.MarshalBytes(dst) @@ -214,7 +215,7 @@ func (s *Statx) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (s *Statx) WriteTo(w io.Writer) (int64, error) { - if !s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() { + if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) @@ -380,6 +381,103 @@ func (s *Statfs) WriteTo(w io.Writer) (int64, error) { } // SizeBytes implements marshal.Marshallable.SizeBytes. +func (r *RobustListHead) SizeBytes() int { + return 24 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (r *RobustListHead) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(r.List)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(r.FutexOffset)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(r.ListOpPending)) + dst = dst[8:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (r *RobustListHead) UnmarshalBytes(src []byte) { + r.List = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + r.FutexOffset = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + r.ListOpPending = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (r *RobustListHead) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (r *RobustListHead) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(r)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (r *RobustListHead) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(r), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (r *RobustListHead) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) + hdr.Len = r.SizeBytes() + hdr.Cap = r.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that r + // must live until the use above. + runtime.KeepAlive(r) + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (r *RobustListHead) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return r.CopyOutN(task, addr, r.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (r *RobustListHead) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) + hdr.Len = r.SizeBytes() + hdr.Cap = r.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that r + // must live until the use above. + runtime.KeepAlive(r) + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (r *RobustListHead) WriteTo(w io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) + hdr.Len = r.SizeBytes() + hdr.Cap = r.SizeBytes() + + length, err := w.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that r + // must live until the use above. + runtime.KeepAlive(r) + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (n *NumaPolicy) SizeBytes() int { return 4 @@ -701,7 +799,7 @@ func (i *IPTIP) Packed() bool { // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTIP) MarshalUnsafe(dst []byte) { - if i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() { + if i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(i)) } else { i.MarshalBytes(dst) @@ -710,7 +808,7 @@ func (i *IPTIP) MarshalUnsafe(dst []byte) { // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTIP) UnmarshalUnsafe(src []byte) { - if i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() { + if i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { safecopy.CopyOut(unsafe.Pointer(i), src) } else { i.UnmarshalBytes(src) @@ -720,7 +818,7 @@ func (i *IPTIP) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (i *IPTIP) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() { + if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. @@ -750,7 +848,7 @@ func (i *IPTIP) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { // CopyIn implements marshal.Marshallable.CopyIn. //go:nosplit func (i *IPTIP) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - if !i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() { + if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := task.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := task.CopyInBytes(addr, buf) // escapes: okay. @@ -776,7 +874,7 @@ func (i *IPTIP) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (i *IPTIP) WriteTo(w io.Writer) (int64, error) { - if !i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() { + if !i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 732e66da4..bcc1b29a8 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -717,10 +717,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 } } -// UnlockPI unlock the futex following the Priority-inheritance futex -// rules. The address provided must contain the caller's TID. If there are -// waiters, TID of the next waiter (FIFO) is set to the given address, and the -// waiter woken up. If there are no waiters, 0 is set to the address. +// UnlockPI unlocks the futex following the Priority-inheritance futex rules. +// The address provided must contain the caller's TID. If there are waiters, +// TID of the next waiter (FIFO) is set to the given address, and the waiter +// woken up. If there are no waiters, 0 is set to the address. func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index b267f0556..8ab5e6f8e 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -1059,6 +1059,7 @@ func (x *Task) StateFields() []string { "oldRSeqCPUAddr", "rseqAddr", "rseqSignature", + "robustList", "startTime", } } @@ -1131,7 +1132,8 @@ func (x *Task) StateSave(m state.Sink) { m.Save(56, &x.oldRSeqCPUAddr) m.Save(57, &x.rseqAddr) m.Save(58, &x.rseqSignature) - m.Save(59, &x.startTime) + m.Save(59, &x.robustList) + m.Save(60, &x.startTime) } func (x *Task) StateLoad(m state.Source) { @@ -1192,7 +1194,8 @@ func (x *Task) StateLoad(m state.Source) { m.Load(56, &x.oldRSeqCPUAddr) m.Load(57, &x.rseqAddr) m.Load(58, &x.rseqSignature) - m.Load(59, &x.startTime) + m.Load(59, &x.robustList) + m.Load(60, &x.startTime) m.LoadValue(31, new(*Task), func(y interface{}) { x.loadPtraceTracer(y.(*Task)) }) m.LoadValue(47, new([]bpf.Program), func(y interface{}) { x.loadSyscallFilters(y.([]bpf.Program)) }) m.AfterLoad(x.afterLoad) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index b3d655b6e..c4db05bd8 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -565,6 +565,10 @@ type Task struct { // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` + // robustList is a pointer to the head of the tasks's robust futex + // list. + robustList usermem.Addr + // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). // diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 9b69f3cbe..7803b98d0 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -207,6 +207,9 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { return flags.CloseOnExec }) + // Handle the robust futex list. + t.exitRobustList() + // NOTE(b/30815691): We currently do not implement privileged // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c4ade6e8e..231ac548a 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -253,6 +253,9 @@ func (*runExitMain) execute(t *Task) taskRunState { } } + // Handle the robust futex list. + t.exitRobustList() + // Deactivate the address space and update max RSS before releasing the // task's MM. t.Deactivate() diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index a53e77c9f..4b535c949 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) @@ -52,3 +53,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } + +// GetRobustList sets the robust futex list for the task. +func (t *Task) GetRobustList() usermem.Addr { + t.mu.Lock() + addr := t.robustList + t.mu.Unlock() + return addr +} + +// SetRobustList sets the robust futex list for the task. +func (t *Task) SetRobustList(addr usermem.Addr) { + t.mu.Lock() + t.robustList = addr + t.mu.Unlock() +} + +// exitRobustList walks the robust futex list, marking locks dead and notifying +// wakers. It corresponds to Linux's exit_robust_list(). Following Linux, +// errors are silently ignored. +func (t *Task) exitRobustList() { + t.mu.Lock() + addr := t.robustList + t.robustList = 0 + t.mu.Unlock() + + if addr == 0 { + return + } + + var rl linux.RobustListHead + if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + return + } + + next := rl.List + done := 0 + var pendingLockAddr usermem.Addr + if rl.ListOpPending != 0 { + pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + } + + // Wake up normal elements. + for usermem.Addr(next) != addr { + // We traverse to the next element of the list before we + // actually wake anything. This prevents the race where waking + // this futex causes a modification of the list. + thisLockAddr := usermem.Addr(next + rl.FutexOffset) + + // Try to decode the next element in the list before waking the + // current futex. But don't check the error until after we've + // woken the current futex. Linux does it in this order too + _, nextErr := t.CopyIn(usermem.Addr(next), &next) + + // Wakeup the current futex if it's not pending. + if thisLockAddr != pendingLockAddr { + t.wakeRobustListOne(thisLockAddr) + } + + // If there was an error copying the next futex, we must bail. + if nextErr != nil { + break + } + + // This is a user structure, so it could be a massive list, or + // even contain a loop if they are trying to mess with us. We + // cap traversal to prevent that. + done++ + if done >= linux.ROBUST_LIST_LIMIT { + break + } + } + + // Is there a pending entry to wake? + if pendingLockAddr != 0 { + t.wakeRobustListOne(pendingLockAddr) + } +} + +// wakeRobustListOne wakes a single futex from the robust list. +func (t *Task) wakeRobustListOne(addr usermem.Addr) { + // Bit 0 in address signals PI futex. + pi := addr&1 == 1 + addr = addr &^ 1 + + // Load the futex. + f, err := t.LoadUint32(addr) + if err != nil { + // Can't read this single value? Ignore the problem. + // We can wake the other futexes in the list. + return + } + + tid := uint32(t.ThreadID()) + for { + // Is this held by someone else? + if f&linux.FUTEX_TID_MASK != tid { + return + } + + // This thread is dying and it's holding this futex. We need to + // set the owner died bit and wake up any waiters. + newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED + if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { + return + } else if curF != f { + // Futex changed out from under us. Try again... + f = curF + continue + } + + // Wake waiters if there are any. + if f&linux.FUTEX_WAITERS != 0 { + private := f&linux.FUTEX_PRIVATE_FLAG != 0 + if pi { + t.Futex().UnlockPI(t, addr, tid, private) + return + } + t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) + } + + // Done. + return + } +} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index ea4f9b1a7..80c65164a 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{ 270: syscalls.Supported("pselect", Pselect), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 273: syscalls.Supported("set_robust_list", SetRobustList), + 274: syscalls.Supported("get_robust_list", GetRobustList), 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index b68261f72..f04d78856 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT: // WAIT uses a relative timeout. - mask = ^uint32(0) + mask = linux.FUTEX_BITSET_MATCH_ANY var timeoutDur time.Duration if !forever { timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond @@ -286,3 +286,49 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ENOSYS } } + +// SetRobustList implements linux syscall set_robust_list(2). +func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + head := args[0].Pointer() + length := args[1].SizeT() + + if length != uint(linux.SizeOfRobustListHead) { + return 0, nil, syserror.EINVAL + } + t.SetRobustList(head) + return 0, nil, nil +} + +// GetRobustList implements linux syscall get_robust_list(2). +func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + tid := args[0].Int() + head := args[1].Pointer() + size := args[2].Pointer() + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + + ot := t + if tid != 0 { + if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // Copy out head pointer. + if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil { + return 0, nil, err + } + + // Copy out size, which is a constant. + if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} |