summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/lock/lock.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs/lock/lock.go')
-rw-r--r--pkg/sentry/fs/lock/lock.go461
1 files changed, 461 insertions, 0 deletions
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
new file mode 100644
index 000000000..f2aee4512
--- /dev/null
+++ b/pkg/sentry/fs/lock/lock.go
@@ -0,0 +1,461 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lock is the API for POSIX-style advisory regional file locks and
+// BSD-style full file locks.
+//
+// Callers needing to enforce these types of locks, like sys_fcntl, can call
+// LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are
+// specific to a unique file (unique device/inode pair) and for this reason
+// should not be shared between files.
+//
+// A Lock has a set of holders identified by UniqueID. Normally this is the
+// pid of the thread attempting to acquire the lock.
+//
+// Since these are advisory locks, they do not need to be integrated into
+// Reads/Writes and for this reason there is no way to *check* if a lock is
+// held. One can only attempt to take a lock or unlock an existing lock.
+//
+// A Lock in a set of Locks is typed: it is either a read lock with any number
+// of readers and no writer, or a write lock with no readers.
+//
+// As expected from POSIX, any attempt to acquire a write lock on a file region
+// when there already exits a write lock held by a different uid will fail. Any
+// attempt to acquire a write lock on a file region when there is more than one
+// reader will fail. Any attempt to acquire a read lock on a file region when
+// there is already a writer will fail.
+//
+// In special cases, a read lock may be upgraded to a write lock and a write lock
+// can be downgraded to a read lock. This can only happen if:
+//
+// * read lock upgrade to write lock: There can be only one reader and the reader
+// must be the same as the requested write lock holder.
+//
+// * write lock downgrade to read lock: The writer must be the same as the requested
+// read lock holder.
+//
+// UnlockRegion always succeeds. If LockRegion fails the caller should normally
+// interpret this as "try again later".
+package lock
+
+import (
+ "fmt"
+ "math"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// LockType is a type of regional file lock.
+type LockType int
+
+// UniqueID is a unique identifier of the holder of a regional file lock.
+type UniqueID uint64
+
+const (
+ // ReadLock describes a POSIX regional file lock to be taken
+ // read only. There may be multiple of these locks on a single
+ // file region as long as there is no writer lock on the same
+ // region.
+ ReadLock LockType = iota
+
+ // WriteLock describes a POSIX regional file lock to be taken
+ // write only. There may be only a single holder of this lock
+ // and no read locks.
+ WriteLock
+)
+
+// LockEOF is the maximal possible end of a regional file lock.
+const LockEOF = math.MaxUint64
+
+// Lock is a regional file lock. It consists of either a single writer
+// or a set of readers.
+//
+// A Lock may be upgraded from a read lock to a write lock only if there
+// is a single reader and that reader has the same uid as the write lock.
+//
+// A Lock may be downgraded from a write lock to a read lock only if
+// the write lock's uid is the same as the read lock.
+//
+// +stateify savable
+type Lock struct {
+ // Readers are the set of read lock holders identified by UniqueID.
+ // If len(Readers) > 0 then HasWriter must be false.
+ Readers map[UniqueID]bool
+
+ // HasWriter indicates that this is a write lock held by a single
+ // UniqueID.
+ HasWriter bool
+
+ // Writer is only valid if HasWriter is true. It identifies a
+ // single write lock holder.
+ Writer UniqueID
+}
+
+// Locks is a thread-safe wrapper around a LockSet.
+//
+// +stateify savable
+type Locks struct {
+ // mu protects locks below.
+ mu sync.Mutex `state:"nosave"`
+
+ // locks is the set of region locks currently held on an Inode.
+ locks LockSet
+
+ // blockedQueue is the queue of waiters that are waiting on a lock.
+ blockedQueue waiter.Queue `state:"zerovalue"`
+}
+
+// Blocker is the interface used for blocking locks. Passing a nil Blocker
+// will be treated as non-blocking.
+type Blocker interface {
+ Block(C <-chan struct{}) error
+}
+
+const (
+ // EventMaskAll is the mask we will always use for locks, by using the
+ // same mask all the time we can wake up everyone anytime the lock
+ // changes state.
+ EventMaskAll waiter.EventMask = 0xFFFF
+)
+
+// LockRegion attempts to acquire a typed lock for the uid on a region
+// of a file. Returns true if successful in locking the region. If false
+// is returned, the caller should normally interpret this as "try again later" if
+// accquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode.
+// Blocker is the interface used to provide blocking behavior, passing a nil Blocker
+// will result in non-blocking behavior.
+func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool {
+ for {
+ l.mu.Lock()
+
+ // Blocking locks must run in a loop because we'll be woken up whenever an unlock event
+ // happens for this lock. We will then attempt to take the lock again and if it fails
+ // continue blocking.
+ res := l.locks.lock(uid, t, r)
+ if !res && block != nil {
+ e, ch := waiter.NewChannelEntry(nil)
+ l.blockedQueue.EventRegister(&e, EventMaskAll)
+ l.mu.Unlock()
+ if err := block.Block(ch); err != nil {
+ // We were interrupted, the caller can translate this to EINTR if applicable.
+ l.blockedQueue.EventUnregister(&e)
+ return false
+ }
+ l.blockedQueue.EventUnregister(&e)
+ continue // Try again now that someone has unlocked.
+ }
+
+ l.mu.Unlock()
+ return res
+ }
+}
+
+// UnlockRegion attempts to release a lock for the uid on a region of a file.
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+ l.locks.unlock(uid, r)
+
+ // Now that we've released the lock, we need to wake up any waiters.
+ l.blockedQueue.Notify(EventMaskAll)
+}
+
+// makeLock returns a new typed Lock that has either uid as its only reader
+// or uid as its only writer.
+func makeLock(uid UniqueID, t LockType) Lock {
+ value := Lock{Readers: make(map[UniqueID]bool)}
+ switch t {
+ case ReadLock:
+ value.Readers[uid] = true
+ case WriteLock:
+ value.HasWriter = true
+ value.Writer = uid
+ default:
+ panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
+ }
+ return value
+}
+
+// isHeld returns true if uid is a holder of Lock.
+func (l Lock) isHeld(uid UniqueID) bool {
+ if l.HasWriter && l.Writer == uid {
+ return true
+ }
+ return l.Readers[uid]
+}
+
+// lock sets uid as a holder of a typed lock on Lock.
+//
+// Preconditions: canLock is true for the range containing this Lock.
+func (l *Lock) lock(uid UniqueID, t LockType) {
+ switch t {
+ case ReadLock:
+ // If we are already a reader, then this is a no-op.
+ if l.Readers[uid] {
+ return
+ }
+ // We cannot downgrade a write lock to a read lock unless the
+ // uid is the same.
+ if l.HasWriter {
+ if l.Writer != uid {
+ panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
+ }
+ // Ensure that there is only one reader if upgrading.
+ l.Readers = make(map[UniqueID]bool)
+ // Ensure that there is no longer a writer.
+ l.HasWriter = false
+ }
+ l.Readers[uid] = true
+ return
+ case WriteLock:
+ // If we are already the writer, then this is a no-op.
+ if l.HasWriter && l.Writer == uid {
+ return
+ }
+ // We can only upgrade a read lock to a write lock if there
+ // is only one reader and that reader has the same uid as
+ // the write lock.
+ if readers := len(l.Readers); readers > 0 {
+ if readers != 1 {
+ panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
+ }
+ if !l.Readers[uid] {
+ panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
+ }
+ }
+ // Ensure that there is only a writer.
+ l.Readers = make(map[UniqueID]bool)
+ l.HasWriter = true
+ l.Writer = uid
+ default:
+ panic(fmt.Sprintf("lock: invalid lock type %d", t))
+ }
+}
+
+// lockable returns true if check returns true for every Lock in LockRange.
+// Further, check should return true if Lock meets the callers requirements
+// for locking Lock.
+func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
+ // Get our starting point.
+ seg := l.LowerBoundSegment(r.Start)
+ for seg.Ok() && seg.Start() < r.End {
+ // Note that we don't care about overruning the end of the
+ // last segment because if everything checks out we'll just
+ // split the last segment.
+ if !check(seg.Value()) {
+ return false
+ }
+ // Jump to the next segment, ignoring gaps, for the same
+ // reason we ignored the first gap.
+ seg = seg.NextSegment()
+ }
+ // No conflict, we can get a lock for uid over the entire range.
+ return true
+}
+
+// canLock returns true if uid will be able to take a Lock of type t on the
+// entire range specified by LockRange.
+func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
+ switch t {
+ case ReadLock:
+ return l.lockable(r, func(value Lock) bool {
+ // If there is no writer, there's no problem adding
+ // another reader.
+ if !value.HasWriter {
+ return true
+ }
+ // If there is a writer, then it must be the same uid
+ // in order to downgrade the lock to a read lock.
+ return value.Writer == uid
+ })
+ case WriteLock:
+ return l.lockable(r, func(value Lock) bool {
+ // If there are only readers.
+ if !value.HasWriter {
+ // Then this uid can only take a write lock if
+ // this is a private upgrade, meaning that the
+ // only reader is uid.
+ return len(value.Readers) == 1 && value.Readers[uid]
+ }
+ // If the uid is already a writer on this region, then
+ // adding a write lock would be a no-op.
+ return value.Writer == uid
+ })
+ default:
+ panic(fmt.Sprintf("canLock: invalid lock type %d", t))
+ }
+}
+
+// lock returns true if uid took a lock of type t on the entire range of LockRange.
+//
+// Preconditions: r.Start <= r.End (will panic otherwise).
+func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
+ if r.Start > r.End {
+ panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
+ }
+
+ // Don't attempt to insert anything with a range of 0 and treat this
+ // as a successful no-op.
+ if r.Length() == 0 {
+ return true
+ }
+
+ // Do a first-pass check. We *could* hold onto the segments we
+ // checked if canLock would return true, but traversing the segment
+ // set should be fast and this keeps things simple.
+ if !l.canLock(uid, t, r) {
+ return false
+ }
+ // Get our starting point.
+ seg, gap := l.Find(r.Start)
+ if gap.Ok() {
+ // Fill in the gap and get the next segment to modify.
+ seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment()
+ } else if seg.Start() < r.Start {
+ // Get our first segment to modify.
+ _, seg = l.Split(seg, r.Start)
+ }
+ for seg.Ok() && seg.Start() < r.End {
+ // Split the last one if necessary.
+ if seg.End() > r.End {
+ seg, _ = l.SplitUnchecked(seg, r.End)
+ }
+
+ // Set the lock on the segment. This is guaranteed to
+ // always be safe, given canLock above.
+ value := seg.ValuePtr()
+ value.lock(uid, t)
+
+ // Fill subsequent gaps.
+ gap = seg.NextGap()
+ if gr := gap.Range().Intersect(r); gr.Length() > 0 {
+ seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment()
+ } else {
+ seg = gap.NextSegment()
+ }
+ }
+ return true
+}
+
+// unlock is always successful. If uid has no locks held for the range LockRange,
+// unlock is a no-op.
+//
+// Preconditions: same as lock.
+func (l *LockSet) unlock(uid UniqueID, r LockRange) {
+ if r.Start > r.End {
+ panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
+ }
+
+ // Same as setlock.
+ if r.Length() == 0 {
+ return
+ }
+
+ // Get our starting point.
+ seg := l.LowerBoundSegment(r.Start)
+ for seg.Ok() && seg.Start() < r.End {
+ // If this segment doesn't have a lock from uid then
+ // there is no need to fragment the set with Isolate (below).
+ // In this case just move on to the next segment.
+ if !seg.Value().isHeld(uid) {
+ seg = seg.NextSegment()
+ continue
+ }
+
+ // Ensure that if we need to unlock a sub-segment that
+ // we don't unlock/remove that entire segment.
+ seg = l.Isolate(seg, r)
+
+ value := seg.Value()
+ var remove bool
+ if value.HasWriter && value.Writer == uid {
+ // If we are unlocking a writer, then since there can
+ // only ever be one writer and no readers, then this
+ // lock should always be removed from the set.
+ remove = true
+ } else if value.Readers[uid] {
+ // If uid is the last reader, then just remove the entire
+ // segment.
+ if len(value.Readers) == 1 {
+ remove = true
+ } else {
+ // Otherwise we need to remove this reader without
+ // affecting any other segment's readers. To do
+ // this, we need to make a copy of the Readers map
+ // and not add this uid.
+ newValue := Lock{Readers: make(map[UniqueID]bool)}
+ for k, v := range value.Readers {
+ if k != uid {
+ newValue.Readers[k] = v
+ }
+ }
+ seg.SetValue(newValue)
+ }
+ }
+ if remove {
+ seg = l.Remove(seg).NextSegment()
+ } else {
+ seg = seg.NextSegment()
+ }
+ }
+}
+
+// ComputeRange takes a positive file offset and computes the start of a LockRange
+// using start (relative to offset) and the end of the LockRange using length. The
+// values of start and length may be negative but the resulting LockRange must
+// preserve that LockRange.Start < LockRange.End and LockRange.Start > 0.
+func ComputeRange(start, length, offset int64) (LockRange, error) {
+ offset += start
+ // fcntl(2): "l_start can be a negative number provided the offset
+ // does not lie before the start of the file"
+ if offset < 0 {
+ return LockRange{}, syscall.EINVAL
+ }
+
+ // fcntl(2): Specifying 0 for l_len has the special meaning: lock all
+ // bytes starting at the location specified by l_whence and l_start
+ // through to the end of file, no matter how large the file grows.
+ end := uint64(LockEOF)
+ if length > 0 {
+ // fcntl(2): If l_len is positive, then the range to be locked
+ // covers bytes l_start up to and including l_start+l_len-1.
+ //
+ // Since LockRange.End is exclusive we need not -1 from length..
+ end = uint64(offset + length)
+ } else if length < 0 {
+ // fcntl(2): If l_len is negative, the interval described by
+ // lock covers bytes l_start+l_len up to and including l_start-1.
+ //
+ // Since LockRange.End is exclusive we need not -1 from offset.
+ signedEnd := offset
+ // Add to offset using a negative length (subtract).
+ offset += length
+ if offset < 0 {
+ return LockRange{}, syscall.EINVAL
+ }
+ if signedEnd < offset {
+ return LockRange{}, syscall.EOVERFLOW
+ }
+ // At this point signedEnd cannot be negative,
+ // since we asserted that offset is not negative
+ // and it is not less than offset.
+ end = uint64(signedEnd)
+ }
+ // Offset is guaranteed to be positive at this point.
+ return LockRange{Start: uint64(offset), End: end}, nil
+}