// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package lock is the API for POSIX-style advisory regional file locks and
// BSD-style full file locks.
//
// Callers needing to enforce these types of locks, like sys_fcntl, can call
// LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are
// specific to a unique file (unique device/inode pair) and for this reason
// should not be shared between files.
//
// A Lock has a set of holders identified by UniqueID. Normally this is the
// pid of the thread attempting to acquire the lock.
//
// Since these are advisory locks, they do not need to be integrated into
// Reads/Writes and for this reason there is no way to *check* if a lock is
// held. One can only attempt to take a lock or unlock an existing lock.
//
// A Lock in a set of Locks is typed: it is either a read lock with any number
// of readers and no writer, or a write lock with no readers.
//
// As expected from POSIX, any attempt to acquire a write lock on a file region
// when there already exists a write lock held by a different uid will fail. Any
// attempt to acquire a write lock on a file region when there is more than one
// reader will fail. Any attempt to acquire a read lock on a file region when
// there is already a writer will fail.
//
// In special cases, a read lock may be upgraded to a write lock and a write lock
// can be downgraded to a read lock. This can only happen if:
//
// * read lock upgrade to write lock: There can be only one reader and the reader
// must be the same as the requested write lock holder.
//
// * write lock downgrade to read lock: The writer must be the same as the requested
// read lock holder.
//
// UnlockRegion always succeeds. If LockRegion fails the caller should normally
// interpret this as "try again later".
package lock

import (
	"fmt"
	"math"
	"syscall"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// LockType is a type of regional file lock.
type LockType int

// UniqueID is a unique identifier of the holder of a regional file lock.
type UniqueID interface{}

const (
	// ReadLock describes a POSIX regional file lock to be taken
	// read only. There may be multiple of these locks on a single
	// file region as long as there is no writer lock on the same
	// region.
	ReadLock LockType = iota

	// WriteLock describes a POSIX regional file lock to be taken
	// write only. There may be only a single holder of this lock
	// and no read locks.
	WriteLock
)

// LockEOF is the maximal possible end of a regional file lock.
//
// A BSD-style full file lock can be represented as a regional file lock from
// offset 0 to LockEOF.
const LockEOF = math.MaxUint64
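// A BSD-style whole-file lock, as used by flock(2), is then just a region
// lock over [0, LockEOF). Illustrative sketch, not part of the original API:
func exampleWholeFileLock(l *Locks, uid UniqueID) bool {
	return l.LockRegion(uid, WriteLock, LockRange{Start: 0, End: LockEOF}, nil)
}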
// Lock is a regional file lock. It consists of either a single writer
// or a set of readers.
//
// A Lock may be upgraded from a read lock to a write lock only if there
// is a single reader and that reader has the same uid as the write lock.
//
// A Lock may be downgraded from a write lock to a read lock only if
// the write lock's uid is the same as the read lock.
//
// +stateify savable
type Lock struct {
	// Readers are the set of read lock holders identified by UniqueID.
	// If len(Readers) > 0 then Writer must be nil.
	Readers map[UniqueID]bool

	// Writer holds the writer's unique ID. It is nil if there is no
	// writer.
	Writer UniqueID
}
// Locks is a thread-safe wrapper around a LockSet.
//
// +stateify savable
type Locks struct {
	// mu protects locks below.
	mu sync.Mutex `state:"nosave"`

	// locks is the set of region locks currently held on an Inode.
	locks LockSet

	// blockedQueue is the queue of waiters that are waiting on a lock.
	blockedQueue waiter.Queue `state:"zerovalue"`
}
// Blocker is the interface used for blocking locks. Passing a nil Blocker
// will be treated as non-blocking.
type Blocker interface {
Block(C <-chan struct{}) error
}
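// trivialBlocker is an illustrative sketch, not part of the original API:
// the smallest possible Blocker. It waits for a notification and never
// reports interruption; a real implementation (e.g. a kernel task) would
// also watch for signals and return an error so that LockRegion can give up.
type trivialBlocker struct{}

// Block implements Blocker.Block by waiting on C.
func (trivialBlocker) Block(C <-chan struct{}) error {
	<-C
	return nil
}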
const (
	// EventMaskAll is the mask we will always use for locks. By using the
	// same mask all the time, we can wake up everyone anytime the lock
	// changes state.
	EventMaskAll waiter.EventMask = 0xFFFF
)
// LockRegion attempts to acquire a typed lock for the uid on a region of a
// file. It returns true if it successfully locked the region. If false is
// returned, the caller should normally interpret this as "try again later"
// when acquiring the lock in non-blocking mode, or as "interrupted" when in
// blocking mode. Blocker provides the blocking behavior; passing a nil
// Blocker results in non-blocking behavior.
func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool {
for {
l.mu.Lock()
// Blocking locks must run in a loop because we'll be woken up whenever an unlock event
// happens for this lock. We will then attempt to take the lock again and if it fails
// continue blocking.
res := l.locks.lock(uid, t, r)
if !res && block != nil {
e, ch := waiter.NewChannelEntry(nil)
l.blockedQueue.EventRegister(&e, EventMaskAll)
l.mu.Unlock()
if err := block.Block(ch); err != nil {
				// We were interrupted; the caller can translate this to
				// EINTR if applicable.
l.blockedQueue.EventUnregister(&e)
return false
}
l.blockedQueue.EventUnregister(&e)
continue // Try again now that someone has unlocked.
}
l.mu.Unlock()
return res
}
}
// UnlockRegion attempts to release a lock for the uid on a region of a file.
// This operation always succeeds, even if uid held no lock on the requested
// region in the first place.
func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
l.mu.Lock()
defer l.mu.Unlock()
l.locks.unlock(uid, r)
// Now that we've released the lock, we need to wake up any waiters.
l.blockedQueue.Notify(EventMaskAll)
}
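// exampleUsage is an illustrative sketch, not part of the original API,
// showing how a caller such as sys_fcntl might drive LockRegion and
// UnlockRegion. The uid value 1 is hypothetical.
func exampleUsage(l *Locks) {
	r := LockRange{Start: 0, End: 4096}
	// Non-blocking attempt: a nil Blocker fails immediately on conflict.
	if !l.LockRegion(UniqueID(1), WriteLock, r, nil) {
		// "Try again later"; a syscall layer would surface EAGAIN.
		return
	}
	// ... operate on the file under the advisory lock ...
	// UnlockRegion always succeeds, even over sub-ranges that were
	// never locked.
	l.UnlockRegion(UniqueID(1), r)
}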
// makeLock returns a new typed Lock that has either uid as its only reader
// or uid as its only writer.
func makeLock(uid UniqueID, t LockType) Lock {
value := Lock{Readers: make(map[UniqueID]bool)}
switch t {
case ReadLock:
value.Readers[uid] = true
case WriteLock:
value.Writer = uid
default:
panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
}
return value
}
// isHeld returns true if uid is a holder of Lock.
func (l Lock) isHeld(uid UniqueID) bool {
return l.Writer == uid || l.Readers[uid]
}
// lock sets uid as a holder of a typed lock on Lock.
//
// Preconditions: canLock is true for the range containing this Lock.
func (l *Lock) lock(uid UniqueID, t LockType) {
switch t {
case ReadLock:
// If we are already a reader, then this is a no-op.
if l.Readers[uid] {
return
}
// We cannot downgrade a write lock to a read lock unless the
// uid is the same.
if l.Writer != nil {
if l.Writer != uid {
panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
}
			// Ensure that there is only one reader after downgrading.
l.Readers = make(map[UniqueID]bool)
// Ensure that there is no longer a writer.
l.Writer = nil
}
l.Readers[uid] = true
return
case WriteLock:
// If we are already the writer, then this is a no-op.
if l.Writer == uid {
return
}
// We can only upgrade a read lock to a write lock if there
// is only one reader and that reader has the same uid as
// the write lock.
if readers := len(l.Readers); readers > 0 {
if readers != 1 {
panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
}
if !l.Readers[uid] {
panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
}
}
// Ensure that there is only a writer.
l.Readers = make(map[UniqueID]bool)
l.Writer = uid
default:
panic(fmt.Sprintf("lock: invalid lock type %d", t))
}
}
// lockable returns true if check returns true for every Lock in LockRange.
// Further, check should return true if a Lock meets the caller's requirements
// for locking that Lock.
func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
// Get our starting point.
seg := l.LowerBoundSegment(r.Start)
for seg.Ok() && seg.Start() < r.End {
		// Note that we don't care about overrunning the end of the
		// last segment because if everything checks out we'll just
		// split the last segment.
if !check(seg.Value()) {
return false
}
// Jump to the next segment, ignoring gaps, for the same
// reason we ignored the first gap.
seg = seg.NextSegment()
}
// No conflict, we can get a lock for uid over the entire range.
return true
}
// canLock returns true if uid will be able to take a Lock of type t on the
// entire range specified by LockRange.
func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
switch t {
case ReadLock:
return l.lockable(r, func(value Lock) bool {
// If there is no writer, there's no problem adding another reader.
if value.Writer == nil {
return true
}
// If there is a writer, then it must be the same uid
// in order to downgrade the lock to a read lock.
return value.Writer == uid
})
case WriteLock:
return l.lockable(r, func(value Lock) bool {
// If there are only readers.
if value.Writer == nil {
// Then this uid can only take a write lock if this is a private
// upgrade, meaning that the only reader is uid.
return len(value.Readers) == 1 && value.Readers[uid]
}
// If the uid is already a writer on this region, then
// adding a write lock would be a no-op.
return value.Writer == uid
})
default:
panic(fmt.Sprintf("canLock: invalid lock type %d", t))
}
}
// lock returns true if uid took a lock of type t on the entire range of
// LockRange.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
if r.Start > r.End {
panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
}
// Don't attempt to insert anything with a range of 0 and treat this
// as a successful no-op.
if r.Length() == 0 {
return true
}
// Do a first-pass check. We *could* hold onto the segments we
// checked if canLock would return true, but traversing the segment
// set should be fast and this keeps things simple.
if !l.canLock(uid, t, r) {
return false
}
// Get our starting point.
seg, gap := l.Find(r.Start)
if gap.Ok() {
// Fill in the gap and get the next segment to modify.
seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment()
} else if seg.Start() < r.Start {
// Get our first segment to modify.
_, seg = l.Split(seg, r.Start)
}
for seg.Ok() && seg.Start() < r.End {
// Split the last one if necessary.
if seg.End() > r.End {
seg, _ = l.SplitUnchecked(seg, r.End)
}
// Set the lock on the segment. This is guaranteed to
// always be safe, given canLock above.
value := seg.ValuePtr()
value.lock(uid, t)
// Fill subsequent gaps.
gap = seg.NextGap()
if gr := gap.Range().Intersect(r); gr.Length() > 0 {
seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment()
} else {
seg = gap.NextSegment()
}
}
return true
}
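// exampleUpgrade is an illustrative sketch, not part of the original API,
// exercising the private-upgrade rule above with hypothetical uids.
func exampleUpgrade() {
	var s LockSet
	r := LockRange{Start: 0, End: 10}
	s.lock(UniqueID(1), ReadLock, r)  // true: first reader.
	s.lock(UniqueID(1), WriteLock, r) // true: the sole reader may upgrade.
	s.lock(UniqueID(2), ReadLock, r)  // false: another uid holds a write lock.
}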
// unlock is always successful. If uid has no locks held for the range LockRange,
// unlock is a no-op.
//
// Preconditions: same as lock.
func (l *LockSet) unlock(uid UniqueID, r LockRange) {
if r.Start > r.End {
panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
}
	// As in lock, a zero-length range is a successful no-op.
if r.Length() == 0 {
return
}
// Get our starting point.
seg := l.LowerBoundSegment(r.Start)
for seg.Ok() && seg.Start() < r.End {
// If this segment doesn't have a lock from uid then
// there is no need to fragment the set with Isolate (below).
// In this case just move on to the next segment.
if !seg.Value().isHeld(uid) {
seg = seg.NextSegment()
continue
}
// Ensure that if we need to unlock a sub-segment that
// we don't unlock/remove that entire segment.
seg = l.Isolate(seg, r)
value := seg.Value()
var remove bool
if value.Writer == uid {
// If we are unlocking a writer, then since there can
// only ever be one writer and no readers, then this
// lock should always be removed from the set.
remove = true
} else if value.Readers[uid] {
// If uid is the last reader, then just remove the entire
// segment.
if len(value.Readers) == 1 {
remove = true
} else {
// Otherwise we need to remove this reader without
// affecting any other segment's readers. To do
// this, we need to make a copy of the Readers map
// and not add this uid.
newValue := Lock{Readers: make(map[UniqueID]bool)}
for k, v := range value.Readers {
if k != uid {
newValue.Readers[k] = v
}
}
seg.SetValue(newValue)
}
}
if remove {
seg = l.Remove(seg).NextSegment()
} else {
seg = seg.NextSegment()
}
}
}
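// exampleSplitUnlock is an illustrative sketch, not part of the original
// API: unlocking a sub-range splits the surrounding lock, so after locking
// [0, 100) and unlocking [40, 60), uid 1 still holds read locks on
// [0, 40) and [60, 100).
func exampleSplitUnlock() {
	var s LockSet
	s.lock(UniqueID(1), ReadLock, LockRange{Start: 0, End: 100})
	s.unlock(UniqueID(1), LockRange{Start: 40, End: 60})
}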
// ComputeRange takes a non-negative file offset and computes the start of a
// LockRange using start (relative to offset) and the end of the LockRange
// using length. The values of start and length may be negative, but the
// resulting LockRange must satisfy LockRange.Start < LockRange.End and must
// not begin before the start of the file.
func ComputeRange(start, length, offset int64) (LockRange, error) {
offset += start
// fcntl(2): "l_start can be a negative number provided the offset
// does not lie before the start of the file"
if offset < 0 {
return LockRange{}, syscall.EINVAL
}
// fcntl(2): Specifying 0 for l_len has the special meaning: lock all
// bytes starting at the location specified by l_whence and l_start
// through to the end of file, no matter how large the file grows.
end := uint64(LockEOF)
if length > 0 {
// fcntl(2): If l_len is positive, then the range to be locked
// covers bytes l_start up to and including l_start+l_len-1.
//
		// Since LockRange.End is exclusive, we need not subtract 1 here.
end = uint64(offset + length)
} else if length < 0 {
// fcntl(2): If l_len is negative, the interval described by
// lock covers bytes l_start+l_len up to and including l_start-1.
//
		// Since LockRange.End is exclusive, we need not subtract 1 from offset.
signedEnd := offset
// Add to offset using a negative length (subtract).
offset += length
if offset < 0 {
return LockRange{}, syscall.EINVAL
}
if signedEnd < offset {
return LockRange{}, syscall.EOVERFLOW
}
// At this point signedEnd cannot be negative,
// since we asserted that offset is not negative
// and it is not less than offset.
end = uint64(signedEnd)
}
// Offset is guaranteed to be positive at this point.
return LockRange{Start: uint64(offset), End: end}, nil
}
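// exampleComputeRange is an illustrative sketch, not part of the original
// API, walking through the fcntl(2) arithmetic above. Both calls assume
// l_whence resolved to offset 0 (the start of the file).
func exampleComputeRange() {
	// l_len == 0 locks from l_start through EOF: [100, LockEOF).
	toEOF, _ := ComputeRange(100, 0, 0)
	// A negative l_len locks the bytes before l_start: [50, 100).
	before, _ := ComputeRange(100, -50, 0)
	_, _ = toEOF, before
}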