Diffstat (limited to 'pkg/sentry')
77 files changed, 903 insertions, 742 deletions
diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index af8230a7d..387f713aa 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -34,7 +34,9 @@ func (p *pipeOperations) beforeSave() { } else if p.flags.Write { file, err := p.opener.NonBlockingOpen(context.Background(), fs.PermMask{Write: true}) if err != nil { - panic(fs.ErrSaveRejection{fmt.Errorf("write-only pipe end cannot be re-opened as %v: %v", p, err)}) + panic(&fs.ErrSaveRejection{ + Err: fmt.Errorf("write-only pipe end cannot be re-opened as %#v: %w", p, err), + }) } file.Close() } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index a020da53b..44587bb37 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -144,7 +144,7 @@ type ErrSaveRejection struct { } // Error returns a sensible description of the save rejection error. -func (e ErrSaveRejection) Error() string { +func (e *ErrSaveRejection) Error() string { return "save rejected due to unsupported file system state: " + e.Err.Error() } diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index a3402e343..141e3c27f 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -67,7 +67,9 @@ func (i *inodeFileState) beforeSave() { if i.sattr.Type == fs.RegularFile { uattr, err := i.unstableAttr(&dummyClockContext{context.Background()}) if err != nil { - panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.s.inodeMappings[i.sattr.InodeID], err)}) + panic(&fs.ErrSaveRejection{ + Err: fmt.Errorf("failed to get unstable atttribute of %s: %w", i.s.inodeMappings[i.sattr.InodeID], err), + }) } i.savedUAttr = &uattr } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index a2f3d5918..07b4fb70f 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -257,7 +257,7 @@ func (c *ConnectedEndpoint) Passcred() bool { } // GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. -func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil } diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index ae3331737..4d3b216d8 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -41,6 +41,8 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/sync", "//pkg/waiter", diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 8a5d9c7eb..57686ce07 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -54,6 +54,8 @@ import ( "math" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) @@ -83,6 +85,17 @@ const ( // offset 0 to LockEOF. const LockEOF = math.MaxUint64 +// OwnerInfo describes the owner of a lock. +// +// TODO(gvisor.dev/issue/5264): We may need to add other fields in the future +// (e.g., Linux's file_lock.fl_flags to support open file-descriptor locks). +// +// +stateify savable +type OwnerInfo struct { + // PID is the process ID of the lock owner. + PID int32 +} + // Lock is a regional file lock. It consists of either a single writer // or a set of readers. 
// @@ -92,14 +105,20 @@ const LockEOF = math.MaxUint64 // A Lock may be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. // +// Accesses to Lock are synchronized through the Locks object to which it +// belongs. +// // +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. - // If len(Readers) > 0 then HasWriter must be false. - Readers map[UniqueID]bool + // If len(Readers) > 0 then Writer must be nil. + Readers map[UniqueID]OwnerInfo // Writer holds the writer unique ID. It's nil if there are no writers. Writer UniqueID + + // WriterInfo describes the writer. It is only meaningful if Writer != nil. + WriterInfo OwnerInfo } // Locks is a thread-safe wrapper around a LockSet. @@ -135,14 +154,14 @@ const ( // acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. // Blocker is the interface used to provide blocking behavior, passing a nil Blocker // will result in non-blocking behavior. -func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool { +func (l *Locks) LockRegion(uid UniqueID, ownerPID int32, t LockType, r LockRange, block Blocker) bool { for { l.mu.Lock() // Blocking locks must run in a loop because we'll be woken up whenever an unlock event // happens for this lock. We will then attempt to take the lock again and if it fails // continue blocking. - res := l.locks.lock(uid, t, r) + res := l.locks.lock(uid, ownerPID, t, r) if !res && block != nil { e, ch := waiter.NewChannelEntry(nil) l.blockedQueue.EventRegister(&e, EventMaskAll) @@ -161,6 +180,14 @@ func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) } } +// LockRegionVFS1 is a wrapper around LockRegion for VFS1, which does not implement +// F_GETLK (and does not care about storing PIDs as a result). +// +// TODO(gvisor.dev/issue/1624): Delete. +func (l *Locks) LockRegionVFS1(uid UniqueID, t LockType, r LockRange, block Blocker) bool { + return l.LockRegion(uid, 0 /* ownerPID */, t, r, block) +} + // UnlockRegion attempts to release a lock for the uid on a region of a file. // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. @@ -175,13 +202,14 @@ func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { // makeLock returns a new typed Lock that has either uid as its only reader // or uid as its only writer. -func makeLock(uid UniqueID, t LockType) Lock { - value := Lock{Readers: make(map[UniqueID]bool)} +func makeLock(uid UniqueID, ownerPID int32, t LockType) Lock { + value := Lock{Readers: make(map[UniqueID]OwnerInfo)} switch t { case ReadLock: - value.Readers[uid] = true + value.Readers[uid] = OwnerInfo{PID: ownerPID} case WriteLock: value.Writer = uid + value.WriterInfo = OwnerInfo{PID: ownerPID} default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) } @@ -190,17 +218,20 @@ func makeLock(uid UniqueID, t LockType) Lock { // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { - return l.Writer == uid || l.Readers[uid] + if _, ok := l.Readers[uid]; ok { + return true + } + return l.Writer == uid } // lock sets uid as a holder of a typed lock on Lock. // // Preconditions: canLock is true for the range containing this Lock. -func (l *Lock) lock(uid UniqueID, t LockType) { +func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType) { switch t { case ReadLock: // If we are already a reader, then this is a no-op. 
- if l.Readers[uid] { + if _, ok := l.Readers[uid]; ok { return } // We cannot downgrade a write lock to a read lock unless the @@ -210,11 +241,11 @@ func (l *Lock) lock(uid UniqueID, t LockType) { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. - l.Readers = make(map[UniqueID]bool) + l.Readers = make(map[UniqueID]OwnerInfo) // Ensure that there is no longer a writer. l.Writer = nil } - l.Readers[uid] = true + l.Readers[uid] = OwnerInfo{PID: ownerPID} return case WriteLock: // If we are already the writer, then this is a no-op. @@ -228,13 +259,14 @@ func (l *Lock) lock(uid UniqueID, t LockType) { if readers != 1 { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) } - if !l.Readers[uid] { + if _, ok := l.Readers[uid]; !ok { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) } } // Ensure that there is only a writer. - l.Readers = make(map[UniqueID]bool) + l.Readers = make(map[UniqueID]OwnerInfo) l.Writer = uid + l.WriterInfo = OwnerInfo{PID: ownerPID} default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) } @@ -247,7 +279,7 @@ func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { // Get our starting point. seg := l.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { - // Note that we don't care about overruning the end of the + // Note that we don't care about overrunning the end of the // last segment because if everything checks out we'll just // split the last segment. if !check(seg.Value()) { @@ -281,7 +313,7 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { if value.Writer == nil { // Then this uid can only take a write lock if this is a private // upgrade, meaning that the only reader is uid. - return len(value.Readers) == 1 && value.Readers[uid] + return value.isOnlyReader(uid) } // If the uid is already a writer on this region, then // adding a write lock would be a no-op. @@ -292,11 +324,19 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { } } +func (l *Lock) isOnlyReader(uid UniqueID) bool { + if len(l.Readers) != 1 { + return false + } + _, ok := l.Readers[uid] + return ok +} + // lock returns true if uid took a lock of type t on the entire range of // LockRange. // // Preconditions: r.Start <= r.End (will panic otherwise). -func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { +func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange) bool { if r.Start > r.End { panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) } @@ -317,7 +357,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { seg, gap := l.Find(r.Start) if gap.Ok() { // Fill in the gap and get the next segment to modify. - seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment() + seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t)).NextSegment() } else if seg.Start() < r.Start { // Get our first segment to modify. _, seg = l.Split(seg, r.Start) @@ -331,12 +371,12 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() - value.lock(uid, t) + value.lock(uid, ownerPID, t) // Fill subsequent gaps. 
gap = seg.NextGap() if gr := gap.Range().Intersect(r); gr.Length() > 0 { - seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment() + seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t)).NextSegment() } else { seg = gap.NextSegment() } @@ -380,7 +420,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { // only ever be one writer and no readers, then this // lock should always be removed from the set. remove = true - } else if value.Readers[uid] { + } else if _, ok := value.Readers[uid]; ok { // If uid is the last reader, then just remove the entire // segment. if len(value.Readers) == 1 { @@ -390,7 +430,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { // affecting any other segment's readers. To do // this, we need to make a copy of the Readers map // and not add this uid. - newValue := Lock{Readers: make(map[UniqueID]bool)} + newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range value.Readers { if k != uid { newValue.Readers[k] = v @@ -451,3 +491,72 @@ func ComputeRange(start, length, offset int64) (LockRange, error) { // Offset is guaranteed to be positive at this point. return LockRange{Start: uint64(offset), End: end}, nil } + +// TestRegion checks whether the lock holder identified by uid can hold a lock +// of type t on range r. It returns a Flock struct representing this +// information as the F_GETLK fcntl does. +// +// Note that the PID returned in the flock structure is relative to the root PID +// namespace. It needs to be converted to the caller's PID namespace before +// returning to userspace. +// +// TODO(gvisor.dev/issue/5264): we don't support OFD locks through fcntl, which +// would return a struct with pid = -1. +func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange) linux.Flock { + f := linux.Flock{Type: linux.F_UNLCK} + switch t { + case ReadLock: + l.testRegion(r, func(lock Lock, start, length uint64) bool { + if lock.Writer == nil || lock.Writer == uid { + return true + } + f.Type = linux.F_WRLCK + f.PID = lock.WriterInfo.PID + f.Start = int64(start) + f.Len = int64(length) + return false + }) + case WriteLock: + l.testRegion(r, func(lock Lock, start, length uint64) bool { + if lock.Writer == nil { + for k, v := range lock.Readers { + if k != uid { + // Stop at the first conflict detected. + f.Type = linux.F_RDLCK + f.PID = v.PID + f.Start = int64(start) + f.Len = int64(length) + return false + } + } + return true + } + if lock.Writer == uid { + return true + } + f.Type = linux.F_WRLCK + f.PID = lock.WriterInfo.PID + f.Start = int64(start) + f.Len = int64(length) + return false + }) + default: + panic(fmt.Sprintf("TestRegion: invalid lock type %d", t)) + } + return f +} + +func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) { + l.mu.Lock() + defer l.mu.Unlock() + + seg := l.locks.LowerBoundSegment(r.Start) + for seg.Ok() && seg.Start() < r.End { + lock := seg.Value() + if !check(lock, seg.Start(), seg.End()-seg.Start()) { + // Stop at the first conflict detected. 
+ return + } + seg = seg.NextSegment() + } +} diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 50a16e662..dcc17c0dc 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -40,7 +40,7 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) return Lock{}, false } for k := range val1.Readers { - if !val2.Readers[k] { + if _, ok := val2.Readers[k]; !ok { return Lock{}, false } } @@ -53,11 +53,12 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) { // Copy the segment so that split segments don't contain map references // to other segments. - val0 := Lock{Readers: make(map[UniqueID]bool)} + val0 := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range val.Readers { val0.Readers[k] = v } val0.Writer = val.Writer + val0.WriterInfo = val.WriterInfo return val, val0 } diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index fad90984b..9878c04e1 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -30,12 +30,12 @@ func equals(e0, e1 []entry) bool { } for i := range e0 { for k := range e0[i].Lock.Readers { - if !e1[i].Lock.Readers[k] { + if _, ok := e1[i].Lock.Readers[k]; !ok { return false } } for k := range e1[i].Lock.Readers { - if !e0[i].Lock.Readers[k] { + if _, ok := e0[i].Lock.Readers[k]; !ok { return false } } @@ -90,15 +90,15 @@ func TestCanLock(t *testing.T) { // 0 1024 2048 3072 4096 l := fill([]entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 3: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 3: OwnerInfo{}}}, LockRange: LockRange{2048, 3072}, }, { @@ -220,7 +220,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -266,7 +266,7 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, { @@ -283,7 +283,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -302,7 +302,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -333,7 +333,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -351,7 +351,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: 
OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -366,11 +366,11 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -383,7 +383,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -398,15 +398,15 @@ func TestSetLock(t *testing.T) { // 0 4096 8192 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, 8192}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{8192, LockEOF}, }, }, @@ -419,7 +419,7 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -434,7 +434,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -447,7 +447,7 @@ func TestSetLock(t *testing.T) { // 0 4096 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, }, @@ -467,11 +467,11 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -484,7 +484,7 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -499,15 +499,15 @@ func TestSetLock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -520,15 +520,15 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: 
Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -543,7 +543,7 @@ func TestSetLock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { @@ -551,7 +551,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -564,15 +564,15 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -587,7 +587,7 @@ func TestSetLock(t *testing.T) { // 0 1024 3072 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { @@ -595,7 +595,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{1024, 3072}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -608,11 +608,11 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, }, @@ -634,15 +634,15 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{2048, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -676,7 +676,7 @@ func TestSetLock(t *testing.T) { l := fill(test.before) r := LockRange{Start: test.start, End: test.end} - success := l.lock(test.uid, test.lockType, r) + success := l.lock(test.uid, 0 /* ownerPID */, test.lockType, r) var got []entry for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { got = append(got, entry{ @@ -739,7 +739,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: 
OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -752,7 +752,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -765,7 +765,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -797,7 +797,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -810,7 +810,7 @@ func TestUnlock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -849,7 +849,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -862,7 +862,7 @@ func TestUnlock(t *testing.T) { // 0 4096 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, }, @@ -901,7 +901,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { @@ -909,7 +909,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -922,11 +922,11 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -939,7 +939,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -952,15 +952,15 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -977,7 +977,7 @@ func TestUnlock(t 
*testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -994,7 +994,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 8}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -1011,7 +1011,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -1028,11 +1028,11 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{4096, 8192}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{8192, LockEOF}, }, }, diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index e91fa26a4..b44117f40 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -194,16 +193,6 @@ func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions return mfd.inode.Stat(ctx, fs, opts) } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence) -} - // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { switch cmd { diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go index 70c68cf0a..a0c5b5af5 100644 --- a/pkg/sentry/fsimpl/devpts/replica.go +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -189,13 +188,3 @@ func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOption fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() return rfd.inode.Stat(ctx, fs, opts) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
-func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 0ad79b381..512b70ede 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -311,13 +310,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 4a5539b37..5ad9befcd 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -154,13 +153,3 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // TODO(b/134676337): Implement mmap(2). return syserror.ENODEV } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 3b5927702..9da01cba3 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -90,6 +90,7 @@ type createSyntheticOpts struct { // * d.isDir(). // * d does not already contain a child with the given name. 
func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { + now := d.fs.clock.Now().Nanoseconds() child := &dentry{ refs: 1, // held by d fs: d.fs, @@ -98,6 +99,10 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { uid: uint32(opts.kuid), gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary + atime: now, + mtime: now, + ctime: now, + btime: now, readFD: -1, writeFD: -1, mmapFD: -1, diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 91d5dc174..8f95473b6 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -36,16 +36,26 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // Snapshot current syncable dentries and special file FDs. + fs.renameMu.RLock() fs.syncMu.Lock() ds := make([]*dentry, 0, len(fs.syncableDentries)) for d := range fs.syncableDentries { + // It's safe to use IncRef here even though fs.syncableDentries doesn't + // hold references since we hold fs.renameMu. Note that we can't use + // TryIncRef since cached dentries at zero references should still be + // synced. d.IncRef() ds = append(ds, d) } + fs.renameMu.RUnlock() sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { - sffd.vfsfd.IncRef() - sffds = append(sffds, sffd) + // As above, fs.specialFileFDs doesn't hold references. However, unlike + // dentries, an FD that has reached zero references can't be + // resurrected, so we can use TryIncRef. + if sffd.vfsfd.TryIncRef() { + sffds = append(sffds, sffd) + } } fs.syncMu.Unlock() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 3cdb1e659..98f7bc52f 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1944,22 +1944,22 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("File lock using gofer file handled internally.") }) - return fd.LockFD.LockBSD(ctx, uid, t, block) + return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("Range lock using gofer file handled internally.") }) - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) + return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.Locks().UnlockPOSIX(ctx, uid, r) } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 36a3f6810..05f11fbd5 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -810,13 +809,3 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) { func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 60acc367f..72aa535f8 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -201,7 +201,7 @@ func (c *ConnectedEndpoint) Passcred() bool { } // GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. -func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil } diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index f5c596fec..0f9e20a84 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -370,13 +369,3 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return syserror.ERESTARTSYS } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 485504995..65054b0ea 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -136,13 +135,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return syserror.EPERM } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index f8dae22f8..e55111af0 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -275,13 +274,3 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index eac578f25..8139bff76 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -371,6 +371,8 @@ type OrderedChildrenOptions struct { // OrderedChildren may modify the tracked children. This applies to // operations related to rename, unlink and rmdir. If an OrderedChildren is // not writable, these operations all fail with EPERM. + // + // Note that writable users must implement the sticky bit (I_SVTX). 
Writable bool } @@ -556,7 +558,6 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) return err } - // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. o.removeLocked(name) return nil } @@ -603,8 +604,8 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c if err := o.checkExistingLocked(oldname, child); err != nil { return err } + o.removeLocked(oldname) - // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. dst.replaceChildLocked(ctx, newname, child) return nil } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 3492409b2..082fa6504 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -42,7 +42,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -820,13 +819,3 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 75be6129f..fdae163d1 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -518,16 +517,6 @@ func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { // Release implements vfs.FileDescriptionImpl.Release. func (fd *memFD) Release(context.Context) {} -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *memFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *memFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} - // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable @@ -1110,13 +1099,3 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err func (fd *namespaceFD) Release(ctx context.Context) { fd.inode.DecRef(ctx) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
-func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 7ee6227a9..d6f076cd6 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -393,7 +393,7 @@ func TestProcSelf(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } - collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ + collector := s.WithTemporaryContext(task.AsyncContext()).ListDirents(&vfs.PathOperation{ Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc/self/"), @@ -491,11 +491,11 @@ func TestTree(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } // Add file to populate /proc/[pid]/fd and fdinfo directories. - task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{}) + task.FDTable().NewFDVFS2(task.AsyncContext(), 0, file, kernel.FDFlags{}) tasks = append(tasks, task) } - ctx := tasks[0] + ctx := tasks[0].AsyncContext() fd, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(s.Ctx), diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 146c7fdfe..4393cc13b 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -140,35 +140,35 @@ func TestLocks(t *testing.T) { uid1 := 123 uid2 := 456 - if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil { + if err := fd.Impl().LockBSD(ctx, uid1, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 1, End: 2}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil { + if err 
:= fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.WriteLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } - if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil { + if err := fd.Impl().UnlockPOSIX(ctx, uid1, lock.LockRange{Start: 0, End: 1}); err != nil { t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 0c9c639d3..b32c54e20 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,7 +36,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -797,16 +796,6 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { return nil } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} - // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all // filesystem state is in-memory. func (*fileDescription) Sync(context.Context) error { diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index a5171b5ad..8645078a0 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -660,7 +660,6 @@ func (d *dentry) readlink(ctx context.Context) (string, error) { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - vfs.LockFD // d is the corresponding dentry to the fileDescription. d *dentry @@ -1104,14 +1103,29 @@ func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, op return 0, syserror.EROFS } +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { + return fd.lowerFD.LockBSD(ctx, ownerPID, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *fileDescription) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return fd.lowerFD.UnlockBSD(ctx) +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
-func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block) +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + return fd.lowerFD.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence) +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.lowerFD.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +func (fd *fileDescription) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fd.lowerFD.TestPOSIX(ctx, uid, t, r) } // FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index 30d8b4355..798d6a9bd 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -66,7 +66,7 @@ func dentryFromFD(t *testing.T, fd *vfs.FileDescription) *dentry { // newVerityRoot creates a new verity mount, and returns the root. The // underlying file system is tmpfs. If the error is not nil, then cleanup // should be called when the root is no longer needed. -func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) { +func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, context.Context, error) { t.Helper() k, err := testutil.Boot() if err != nil { @@ -119,7 +119,7 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, root.DecRef(ctx) mntns.DecRef(ctx) }) - return vfsObj, root, task, nil + return vfsObj, root, task.AsyncContext(), nil } // openVerityAt opens a verity file. diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0ee60569c..8a5b11d40 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -240,7 +240,6 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 7aba31587..a6afabb1c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -153,20 +153,12 @@ func (f *FDTable) drop(ctx context.Context, file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { - // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the - // entire file. - err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) + // Release any POSIX lock possibly held by the FDTable. + err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) if err != nil && err != syserror.ENOLCK { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } - // Generate inotify events. 
- ev := uint32(linux.IN_CLOSE_NOWRITE) - if file.IsWritable() { - ev = linux.IN_CLOSE_WRITE - } - file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) - // Drop the table's reference. file.DecRef(ctx) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 303ae8056..ef4e934a1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -593,8 +593,8 @@ func (k *Kernel) flushWritesToFiles(ctx context.Context) error { // Wrap this error in ErrSaveRejection so that it will trigger a save // error, rather than a panic. This also allows us to distinguish Fsync // errors from state file errors in state.Save. - return fs.ErrSaveRejection{ - Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + return &fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %w", name, err), } } return nil diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 2c32d017d..71daa9f4b 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -27,7 +27,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index d5a91730d..3b6336e94 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -441,13 +440,3 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } return n, err } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 9419f2e95..ecbe8f920 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -69,7 +69,7 @@ func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time. // syserror.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { +func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline ktime.Time) error { if !haveDeadline { return t.block(C, nil) } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index fdadb52c0..e9da99067 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -216,7 +216,7 @@ func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { return t } -// ThreadGroupWithID returns the thread group lead by the task with thread ID +// ThreadGroupWithID returns the thread group led by the task with thread ID // tid in PID namespace ns. 
If no task has that TID, or if the task with that // TID is not a thread group leader, ThreadGroupWithID returns nil. func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { @@ -292,6 +292,11 @@ func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { return ns.userns } +// Root returns the root PID namespace of ns. +func (ns *PIDNamespace) Root() *PIDNamespace { + return ns.owner.Root +} + // A threadGroupNode defines the relationship between a thread group and the // rest of the system. Conceptually, threadGroupNode is data belonging to the // owning TaskSet, as if TaskSet contained a field `nodes @@ -485,3 +490,8 @@ func (t *Task) Parent() *Task { func (t *Task) ThreadID() ThreadID { return t.tg.pidns.IDOfTask(t) } + +// TGIDInRoot returns t's TGID in the root PID namespace. +func (t *Task) TGIDInRoot() ThreadID { + return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) +} diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 6efe5102b..73bfbea49 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -17,7 +17,6 @@ package mm import ( "bytes" "fmt" - "strings" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" @@ -165,12 +164,12 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI } if s != "" { // Per linux, we pad until the 74th character. - if pad := 73 - lineLen; pad > 0 { - b.WriteString(strings.Repeat(" ", pad)) + for pad := 73 - lineLen; pad > 0; pad-- { + b.WriteByte(' ') } b.WriteString(s) } - b.WriteString("\n") + b.WriteByte('\n') } // ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 675efdc7c..69e37330b 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1055,18 +1055,11 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { mm.activeMu.Lock() defer mm.activeMu.Unlock() - // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range() - // is analogous to our mm.invalidateLocked(ar, true, true). We inline this - // here, with the special case that we synchronously decommit - // uniquely-owned (non-copy-on-write) pages for private anonymous vma, - // which is the common case for MADV_DONTNEED. Invalidating these pmas, and - // allowing them to be reallocated when touched again, increases pma - // fragmentation, which may significantly reduce performance for - // non-vectored I/O implementations. Also, decommitting synchronously - // ensures that Decommit immediately reduces host memory usage. + // This is invalidateLocked(invalidatePrivate=true, invalidateShared=true), + // with the additional wrinkle that we must refuse to invalidate pmas under + // mlocked vmas. 
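Looking back at the mm/procfs.go hunk above: the strings.Repeat call is replaced by a byte-at-a-time pad so no temporary string is allocated. A small self-contained illustration of the same padding rule; padTo73 and the sample mapping line are made up for this example.

package main

import (
	"bytes"
	"fmt"
)

// padTo73 writes spaces until the next byte would land at column 74, the
// same rule as the loop in appendVMAMapsEntryLocked above.
func padTo73(b *bytes.Buffer, lineLen int) {
	for pad := 73 - lineLen; pad > 0; pad-- {
		b.WriteByte(' ')
	}
}

func main() {
	var b bytes.Buffer
	entry := "00400000-00452000 r-xp 00000000 08:02 173521"
	b.WriteString(entry)
	padTo73(&b, len(entry))
	b.WriteString("/usr/bin/dbus-daemon")
	b.WriteByte('\n')
	fmt.Print(b.String())
}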
var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - mf := mm.mfp.MemoryFile() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { @@ -1081,20 +1074,8 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { } } for pseg.Ok() && pseg.Start() < vsegAR.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) { - psegAR := pseg.Range().Intersect(ar) - if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue - } - // If an error occurs, fall through to the general - // invalidation case below. - } - } pseg = mm.pmas.Isolate(pseg, vsegAR) - pma = pseg.ValuePtr() + pma := pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. AddressSpace mappings must be removed before diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index aacd7ce70..17fb0a0d8 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -550,6 +550,12 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // Wait for the syscall-enter stop. sig := t.wait(stopped) + if sig == syscall.SIGSTOP { + // SIGSTOP was delivered to another thread in the same thread + // group, which initiated another group stop. Just ignore it. + continue + } + // Refresh all registers. if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) @@ -566,13 +572,11 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // Is it a system call? if sig == (syscallEvent | syscall.SIGTRAP) { + s.arm64SyscallWorkaround(t, regs) + // Ensure registers are sane. updateSyscallRegs(regs) return true - } else if sig == syscall.SIGSTOP { - // SIGSTOP was delivered to another thread in the same thread - // group, which initiated another group stop. Just ignore it. - continue } // Grab signal information. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 020bbda79..04815282b 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -257,3 +257,6 @@ func probeSeccomp() bool { } } } + +func (s *subprocess) arm64SyscallWorkaround(t *thread, regs *arch.Registers) { +} diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index bd618fae8..416132967 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -21,6 +21,7 @@ import ( "strings" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -172,3 +173,38 @@ func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFActi func probeSeccomp() bool { return true } + +func (s *subprocess) arm64SyscallWorkaround(t *thread, regs *arch.Registers) { + // On ARM64, when ptrace stops on a system call, it uses the x7 + // register to indicate whether the stop has been signalled from + // syscall entry or syscall exit. This means that we can't get a value + // of this register and we can't change it. More details are in the + // comment for tracehook_report_syscall in arch/arm64/kernel/ptrace.c. 
+ // + // This happens only if we stop on a system call, so let's queue a + // signal, resume a stub thread and catch it on a signal handling. + t.NotifyInterrupt() + for { + if _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + unix.PTRACE_SYSEMU, + uintptr(t.tid), 0, 0, 0, 0); errno != 0 { + panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) + } + + // Wait for the syscall-enter stop. + sig := t.wait(stopped) + if sig == syscall.SIGSTOP { + // SIGSTOP was delivered to another thread in the same thread + // group, which initiated another group stop. Just ignore it. + continue + } + if sig == (syscallEvent | syscall.SIGTRAP) { + t.dumpAndPanic(fmt.Sprintf("unexpected syscall event")) + } + break + } + if err := t.getRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } +} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index b6ebe29d6..a8e6f172b 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 5b868216d..17f59ba1f 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -377,10 +377,8 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] switch level { case linux.SOL_IP: switch name { - case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR: + case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR: optlen = sizeofInt32 - case linux.IP_PKTINFO: - optlen = linux.SizeOfControlMessageIPPacketInfo } case linux.SOL_IPV6: switch name { diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 9a2cac40b..f82c7c224 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -144,16 +143,6 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return int64(n), err } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} - type socketProviderVFS2 struct { family int } diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go index 70c561cce..2f913787b 100644 --- a/pkg/sentry/socket/netfilter/ipv4.go +++ b/pkg/sentry/socket/netfilter/ipv4.go @@ -15,7 +15,6 @@ package netfilter import ( - "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -220,18 +219,6 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } - n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) - if n == -1 { - n = len(iptip.OutputInterface) - } - ifname := string(iptip.OutputInterface[:n]) - - n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) - if n == -1 { - n = len(iptip.OutputInterfaceMask) - } - ifnameMask := string(iptip.OutputInterfaceMask[:n]) - return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // A Protocol value of 0 indicates all protocols match. @@ -242,8 +229,11 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { Src: tcpip.Address(iptip.Src[:]), SrcMask: tcpip.Address(iptip.SrcMask[:]), SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, - OutputInterface: ifname, - OutputInterfaceMask: ifnameMask, + InputInterface: string(trimNullBytes(iptip.InputInterface[:])), + InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), + InputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_IN != 0, + OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), + OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, }, nil } @@ -254,12 +244,12 @@ func containsUnsupportedFields4(iptip linux.IPTIP) bool { // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag + // - InputInterface, InputInterfaceMask and its inverse. // - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInterface = [linux.IFNAMSIZ]byte{} + const flagMask = 0 // Disable any supported inverse flags. 
- inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) - return iptip.InputInterface != emptyInterface || - iptip.InputInterfaceMask != emptyInterface || - iptip.Flags != 0 || + const inverseMask = linux.IPT_INV_DSTIP | linux.IPT_INV_SRCIP | + linux.IPT_INV_VIA_IN | linux.IPT_INV_VIA_OUT + return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 } diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go index 5dbb604f0..263d9d3b5 100644 --- a/pkg/sentry/socket/netfilter/ipv6.go +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -15,7 +15,6 @@ package netfilter import ( - "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -223,18 +222,6 @@ func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } - n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) - if n == -1 { - n = len(iptip.OutputInterface) - } - ifname := string(iptip.OutputInterface[:n]) - - n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) - if n == -1 { - n = len(iptip.OutputInterfaceMask) - } - ifnameMask := string(iptip.OutputInterfaceMask[:n]) - return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // In ip6tables a flag controls whether to check the protocol. @@ -245,8 +232,11 @@ func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { Src: tcpip.Address(iptip.Src[:]), SrcMask: tcpip.Address(iptip.SrcMask[:]), SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0, - OutputInterface: ifname, - OutputInterfaceMask: ifnameMask, + InputInterface: string(trimNullBytes(iptip.InputInterface[:])), + InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), + InputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_IN != 0, + OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), + OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0, }, nil } @@ -257,14 +247,13 @@ func containsUnsupportedFields6(iptip linux.IP6TIP) bool { // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag + // - InputInterface, InputInterfaceMask and its inverse. // - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInterface = [linux.IFNAMSIZ]byte{} - flagMask := uint8(linux.IP6T_F_PROTO) + const flagMask = linux.IP6T_F_PROTO // Disable any supported inverse flags. 
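The rewritten containsUnsupportedFields4 above reduces the check to two mask expressions: any flag or inverse-flag bit outside the supported set rejects the rule. A minimal standalone example of the AND NOT (&^) idiom, with hypothetical flag constants standing in for the IPT_INV_* values:

package main

import "fmt"

// Hypothetical feature bits standing in for the IPT_INV_* inverse flags.
const (
	flagA = 1 << 0
	flagB = 1 << 1
	flagC = 1 << 2

	supportedMask = flagA | flagB // flagC is not supported.
)

// hasUnsupportedBits reports whether flags carries any bit outside
// supportedMask; &^ clears the supported bits and leaves only the rest.
func hasUnsupportedBits(flags uint8) bool {
	return flags&^supportedMask != 0
}

func main() {
	fmt.Println(hasUnsupportedBits(flagA | flagB)) // false
	fmt.Println(hasUnsupportedBits(flagA | flagC)) // true
}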
- inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT) - return iptip.InputInterface != emptyInterface || - iptip.InputInterfaceMask != emptyInterface || - iptip.Flags&^flagMask != 0 || + const inverseMask = linux.IP6T_INV_DSTIP | linux.IP6T_INV_SRCIP | + linux.IP6T_INV_VIA_IN | linux.IP6T_INV_VIA_OUT + return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 || iptip.TOS != 0 } diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 26bd1abd4..7ae18b2a3 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,6 +17,7 @@ package netfilter import ( + "bytes" "errors" "fmt" @@ -393,3 +394,11 @@ func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkP rev.Revision = maxSupported return rev, nil } + +func trimNullBytes(b []byte) []byte { + n := bytes.IndexByte(b, 0) + if n == -1 { + n = len(b) + } + return b[:n] +} diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 69d13745e..176fa6116 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -112,7 +112,7 @@ func (*OwnerMatcher) Name() string { } // Match implements Matcher.Match. -func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { +func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // Support only for OUTPUT chain. // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. if hook != stack.Output { diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 352c51390..2740697b3 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -96,7 +96,7 @@ func (*TCPMatcher) Name() string { } // Match implements Matcher.Match. -func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { +func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. switch pkt.NetworkProtocolNumber { diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index c88d8268d..466d5395d 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -93,7 +93,7 @@ func (*UDPMatcher) Name() string { } // Match implements Matcher.Match. -func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { +func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. 
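The trimNullBytes helper introduced in netfilter.go above centralizes the NUL-trimming that filterFromIPTIP and filterFromIP6TIP previously open-coded for each interface name. A self-contained copy for illustration; the 16-byte buffer mirrors an IFNAMSIZ-style, NUL-padded field.

package main

import (
	"bytes"
	"fmt"
)

// trimNullBytes returns b up to (but not including) the first NUL byte, or
// all of b if there is none — the same logic as the helper added above.
func trimNullBytes(b []byte) []byte {
	n := bytes.IndexByte(b, 0)
	if n == -1 {
		n = len(b)
	}
	return b[:n]
}

func main() {
	var ifname [16]byte // fixed-size, NUL-padded buffer
	copy(ifname[:], "eth0")
	fmt.Printf("%q\n", string(trimNullBytes(ifname[:]))) // "eth0"
}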
switch pkt.NetworkProtocolNumber { diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 1f926aa91..9313e1167 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -22,7 +22,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index 461d524e5..842036764 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" @@ -149,13 +148,3 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 22abca120..915134b41 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -42,7 +41,6 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 22e128b96..94f03af48 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -19,7 +19,7 @@ // be used to expose certain endpoints to the sentry while leaving others out, // for example, TCP endpoints and Unix-domain endpoints. // -// Lock ordering: netstack => mm: ioSequencePayload copies user memory inside +// Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during // this operation. 
package netstack @@ -55,7 +55,6 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" @@ -194,7 +193,6 @@ var Metrics = tcpip.Stats{ RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), - OutgoingRequestNetworkUnreachableErrors: mustCreateMetric("/netstack/arp/outgoing_requests_network_unreachable", "Number of failed attempts to send an ARP request with a network unreachable error."), OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), @@ -253,11 +251,11 @@ var errStackType = syserr.New("expected but did not receive a netstack.Stack", l type commonEndpoint interface { // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and // transport.Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and // transport.Endpoint.GetRemoteAddress. - GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) // Readiness implements tcpip.Endpoint.Readiness and // transport.Endpoint.Readiness. @@ -265,19 +263,19 @@ type commonEndpoint interface { // SetSockOpt implements tcpip.Endpoint.SetSockOpt and // transport.Endpoint.SetSockOpt. - SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error + SetSockOpt(tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and // transport.Endpoint.SetSockOptInt. - SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error + SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. - GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error + GetSockOpt(tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. - GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns a socket's lifecycle state. The returned value is // protocol-specific and is primarily used for diagnostics. @@ -285,7 +283,7 @@ type commonEndpoint interface { // LastError implements tcpip.Endpoint.LastError and // transport.Endpoint.LastError. - LastError() *tcpip.Error + LastError() tcpip.Error // SocketOptions implements tcpip.Endpoint.SocketOptions and // transport.Endpoint.SocketOptions. 
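The commonEndpoint signatures above switch from *tcpip.Error to the tcpip.Error interface, so call sites later in this change match concrete error types instead of comparing sentinel pointers. A rough standalone sketch of that matching style — the Error interface and error types here are stand-ins, not the real tcpip definitions:

package main

import "fmt"

// Error mimics the shape of the new tcpip.Error interface: each error kind
// is its own struct type rather than a shared sentinel pointer.
type Error interface {
	error
	isError()
}

type ErrWouldBlock struct{}

func (*ErrWouldBlock) isError()      {}
func (*ErrWouldBlock) Error() string { return "operation would block" }

type ErrNotConnected struct{}

func (*ErrNotConnected) isError()      {}
func (*ErrNotConnected) Error() string { return "endpoint not connected" }

// classify shows both matching styles used by the call sites in this change:
// a type assertion for a single kind, a type switch for several.
func classify(err Error) string {
	if _, ok := err.(*ErrWouldBlock); ok {
		return "retry later"
	}
	switch err.(type) {
	case nil:
		return "ok"
	case *ErrNotConnected:
		return "connect first"
	default:
		return "translate to errno"
	}
}

func main() {
	fmt.Println(classify(nil))                // ok
	fmt.Println(classify(&ErrWouldBlock{}))   // retry later
	fmt.Println(classify(&ErrNotConnected{})) // connect first
}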
@@ -440,115 +438,58 @@ func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Write return int64(res.Count), nil } -// ioSequencePayload implements tcpip.Payload. -// -// t copies user memory bytes on demand based on the requested size. -type ioSequencePayload struct { - ctx context.Context - src usermem.IOSequence -} - -// FullPayload implements tcpip.Payloader.FullPayload -func (i *ioSequencePayload) FullPayload() ([]byte, *tcpip.Error) { - return i.Payload(int(i.src.NumBytes())) -} - -// Payload implements tcpip.Payloader.Payload. -func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) { - if max := int(i.src.NumBytes()); size > max { - size = max - } - v := buffer.NewView(size) - if _, err := i.src.CopyIn(i.ctx, v); err != nil { - // EOF can be returned only if src is a file and this means it - // is in a splice syscall and the error has to be ignored. - if err == io.EOF { - return v, nil - } - return nil, tcpip.ErrBadAddress - } - return v, nil -} - -// DropFirst drops the first n bytes from underlying src. -func (i *ioSequencePayload) DropFirst(n int) { - i.src = i.src.DropFirst(int(n)) -} - // Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - f := &ioSequencePayload{ctx: ctx, src: src} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) - if err == tcpip.ErrWouldBlock { + r := src.Reader(ctx) + n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) + if _, ok := err.(*tcpip.ErrWouldBlock); ok { return 0, syserror.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } - if int64(n) < src.NumBytes() { - return int64(n), syserror.ErrWouldBlock + if n < src.NumBytes() { + return n, syserror.ErrWouldBlock } - return int64(n), nil + return n, nil } -// readerPayload implements tcpip.Payloader. -// -// It allocates a view and reads from a reader on-demand, based on available -// capacity in the endpoint. -type readerPayload struct { - ctx context.Context - r io.Reader - count int64 +var _ tcpip.Payloader = (*limitedPayloader)(nil) + +type limitedPayloader struct { + inner io.LimitedReader err error } -// FullPayload implements tcpip.Payloader.FullPayload. -func (r *readerPayload) FullPayload() ([]byte, *tcpip.Error) { - return r.Payload(int(r.count)) +func (l *limitedPayloader) Read(p []byte) (int, error) { + n, err := l.inner.Read(p) + l.err = err + return n, err } -// Payload implements tcpip.Payloader.Payload. -func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) { - if size > int(r.count) { - size = int(r.count) - } - v := buffer.NewView(size) - n, err := r.r.Read(v) - if n > 0 { - // We ignore the error here. It may re-occur on subsequent - // reads, but for now we can enqueue some amount of data. - r.count -= int64(n) - return v[:n], nil - } - if err == syserror.ErrWouldBlock { - return nil, tcpip.ErrWouldBlock - } else if err != nil { - r.err = err // Save for propation. - return nil, tcpip.ErrBadAddress - } - - // There is no data and no error. Return an error, which will propagate - // r.err, which will be nil. This is the desired result: (0, nil). - return nil, tcpip.ErrBadAddress +func (l *limitedPayloader) Len() int { + return int(l.inner.N) } // ReadFrom implements fs.FileOperations.ReadFrom. 
func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { - f := &readerPayload{ctx: ctx, r: r, count: count} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{ + f := limitedPayloader{ + inner: io.LimitedReader{ + R: r, + N: count, + }, + } + n, err := s.Endpoint.Write(&f, tcpip.WriteOptions{ // Reads may be destructive but should be very fast, // so we can't release the lock while copying data. Atomic: true, }) - if err == tcpip.ErrWouldBlock { - return n, syserror.ErrWouldBlock - } else if err != nil { - return int64(n), f.err // Propagate error. + if _, ok := err.(*tcpip.ErrBadBuffer); ok { + return n, f.err } - - return int64(n), nil + return n, syserr.TranslateNetstackError(err).ToError() } // Readiness returns a mask of ready events for socket s. @@ -592,7 +533,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool if family == linux.AF_UNSPEC { err := s.Endpoint.Disconnect() - if err == tcpip.ErrNotSupported { + if _, ok := err.(*tcpip.ErrNotSupported); ok { return syserr.ErrAddressFamilyNotSupported } return syserr.TranslateNetstackError(err) @@ -614,15 +555,16 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool s.EventRegister(&e, waiter.EventOut) defer s.EventUnregister(&e) - if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting { + switch err := s.Endpoint.Connect(addr); err.(type) { + case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: + case *tcpip.ErrNoPortAvailable: if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { // TCP unlike UDP returns EADDRNOTAVAIL when it can't // find an available local ephemeral port. - if err == tcpip.ErrNoPortAvailable { - return syserr.ErrAddressNotAvailable - } + return syserr.ErrAddressNotAvailable } - + return syserr.TranslateNetstackError(err) + default: return syserr.TranslateNetstackError(err) } @@ -680,16 +622,16 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Issue the bind request to the endpoint. err := s.Endpoint.Bind(addr) - if err == tcpip.ErrNoPortAvailable { + if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { // Bind always returns EADDRINUSE irrespective of if the specified port was // already bound or if an ephemeral port was requested but none were // available. // - // tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because + // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because // UDP connect returns EAGAIN on ephemeral port exhaustion. // // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. - err = tcpip.ErrPortInUse + err = &tcpip.ErrPortInUse{} } return syserr.TranslateNetstackError(err) @@ -712,7 +654,8 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAdd // Try to accept the connection again; if it fails, then wait until we // get a notification. 
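The ReadFrom rewrite above replaces the hand-rolled readerPayload with a thin wrapper around io.LimitedReader that remembers the last source error, so an ErrBadBuffer from the endpoint can be translated back into the reader's own failure. A self-contained sketch of the same shape; payloader is a stand-in for tcpip.Payloader.

package main

import (
	"fmt"
	"io"
	"strings"
)

// payloader is a stand-in for tcpip.Payloader: an io.Reader that can also
// report how many bytes remain.
type payloader interface {
	io.Reader
	Len() int
}

// limitedPayloader adapts an io.LimitedReader and records the last error so
// the caller can distinguish source errors from endpoint errors.
type limitedPayloader struct {
	inner io.LimitedReader
	err   error
}

func (l *limitedPayloader) Read(p []byte) (int, error) {
	n, err := l.inner.Read(p)
	l.err = err
	return n, err
}

func (l *limitedPayloader) Len() int {
	return int(l.inner.N)
}

func main() {
	var p payloader = &limitedPayloader{
		inner: io.LimitedReader{R: strings.NewReader("hello world"), N: 5},
	}
	buf := make([]byte, 16)
	n, _ := p.Read(buf)
	fmt.Printf("read %q, %d bytes left\n", buf[:n], p.Len()) // read "hello", 0 bytes left
}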
for { - if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock { + ep, wq, err := s.Endpoint.Accept(peerAddr) + if _, ok := err.(*tcpip.ErrWouldBlock); !ok { return ep, wq, syserr.TranslateNetstackError(err) } @@ -731,7 +674,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { - if terr != tcpip.ErrWouldBlock || !blocking { + if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } @@ -912,7 +855,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - size, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption) + size, err := ep.SocketOptions().GetSendBufferSize() if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -1164,6 +1107,29 @@ func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. info := linux.TCPInfo{} + switch v.CcState { + case tcpip.RTORecovery: + info.CaState = linux.TCP_CA_Loss + case tcpip.FastRecovery, tcpip.SACKRecovery: + info.CaState = linux.TCP_CA_Recovery + case tcpip.Disorder: + info.CaState = linux.TCP_CA_Disorder + case tcpip.Open: + info.CaState = linux.TCP_CA_Open + } + info.RTO = uint32(v.RTO / time.Microsecond) + info.RTT = uint32(v.RTT / time.Microsecond) + info.RTTVar = uint32(v.RTTVar / time.Microsecond) + info.SndSsthresh = v.SndSsthresh + info.SndCwnd = v.SndCwnd + + // In netstack reorderSeen is updated only when RACK is enabled. + // We only track whether the reordering is seen, which is + // different than Linux where reorderSeen is not specific to + // RACK and is incremented when a reordering event is seen. + if v.ReorderSeen { + info.ReordSeen = 1 + } // Linux truncates the output binary to outLen. buf := t.CopyScratchBuffer(info.SizeBytes()) @@ -1681,8 +1647,16 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam return syserr.ErrInvalidArgument } + family, _, _ := s.Type() + // TODO(gvisor.dev/issue/5132): We currently do not support + // setting this option for unix sockets. 
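The getSockOptTCP hunk above folds netstack's congestion-control state into the tcpi_ca_state encoding Linux reports in struct tcp_info. A standalone sketch of the mapping, using a made-up ccState enum; the numeric values follow Linux's tcp_ca_state enum.

package main

import "fmt"

// Linux tcp_ca_state values as reported in tcp_info.tcpi_ca_state.
const (
	tcpCAOpen     = 0
	tcpCADisorder = 1
	tcpCACWR      = 2
	tcpCARecovery = 3
	tcpCALoss     = 4
)

// ccState stands in for netstack's congestion-control state type.
type ccState int

const (
	open ccState = iota
	rtoRecovery
	fastRecovery
	sackRecovery
	disorder
)

// caState mirrors the switch added to getSockOptTCP above: RTO recovery maps
// to Loss, fast/SACK recovery to Recovery, and so on.
func caState(s ccState) uint8 {
	switch s {
	case rtoRecovery:
		return tcpCALoss
	case fastRecovery, sackRecovery:
		return tcpCARecovery
	case disorder:
		return tcpCADisorder
	default:
		return tcpCAOpen
	}
}

func main() {
	fmt.Println(caState(rtoRecovery), caState(disorder), caState(open)) // 4 1 0
}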
+ if family == linux.AF_UNIX { + return nil + } + v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.SendBufferSizeOption, int(v))) + ep.SocketOptions().SetSendBufferSize(int64(v), true) + return nil case linux.SO_RCVBUF: if len(optVal) < sizeOfInt32 { @@ -1814,10 +1788,6 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam var v linux.Linger binary.Unmarshal(optVal[:linux.SizeOfLinger], usermem.ByteOrder, &v) - if v != (linux.Linger{}) { - socket.SetSockOptEmitUnimplementedEvent(t, name) - } - ep.SocketOptions().SetLinger(tcpip.LingerOption{ Enabled: v.OnOff != 0, Timeout: time.Second * time.Duration(v.Linger), @@ -2596,7 +2566,7 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq defer s.readMu.Unlock() res, err := s.Endpoint.Read(w, readOptions) - if err == tcpip.ErrBadBuffer && dst.NumBytes() == 0 { + if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 { err = nil } if err != nil { @@ -2840,45 +2810,48 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b EndOfRecord: flags&linux.MSG_EOR != 0, } - v := &ioSequencePayload{t, src} - n, err := s.Endpoint.Write(v, opts) - dontWait := flags&linux.MSG_DONTWAIT != 0 - if err == nil && (n >= v.src.NumBytes() || dontWait) { - // Complete write. - return int(n), nil - } - if err != nil && (err != tcpip.ErrWouldBlock || dontWait) { - return int(n), syserr.TranslateNetstackError(err) - } - - // We'll have to block. Register for notification and keep trying to - // send all the data. - e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventOut) - defer s.EventUnregister(&e) - - v.DropFirst(int(n)) - total := n + r := src.Reader(t) + var ( + total int64 + entry waiter.Entry + ch <-chan struct{} + ) for { - n, err = s.Endpoint.Write(v, opts) - v.DropFirst(int(n)) + n, err := s.Endpoint.Write(r, opts) total += n - - if err != nil && err != tcpip.ErrWouldBlock && total == 0 { - return 0, syserr.TranslateNetstackError(err) - } - - if err == nil && v.src.NumBytes() == 0 || err != nil && err != tcpip.ErrWouldBlock { - return int(total), nil - } - - if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - return int(total), syserr.ErrTryAgain + if flags&linux.MSG_DONTWAIT != 0 { + return int(total), syserr.TranslateNetstackError(err) + } + block := true + switch err.(type) { + case nil: + block = total != src.NumBytes() + case *tcpip.ErrWouldBlock: + default: + block = false + } + if block { + if ch == nil { + // We'll have to block. Register for notification and keep trying to + // send all the data. + entry, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&entry, waiter.EventOut) + defer s.EventUnregister(&entry) + } else { + // Don't wait immediately after registration in case more data + // became available between when we last checked and when we setup + // the notification. + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return int(total), syserr.ErrTryAgain + } + // handleIOError will consume errors from t.Block if needed. + return int(total), syserr.FromError(err) + } } - // handleIOError will consume errors from t.Block if needed. 
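The reworked SendMsg loop above retries short writes, registers with the waiter queue only once blocking becomes necessary, and retries once immediately after registration before actually sleeping. A simplified, self-contained model of that control flow — sendAll, errWouldBlock and the fake endpoint are hypothetical, with a channel and timeout standing in for the waiter entry and task deadline:

package main

import (
	"errors"
	"fmt"
	"time"
)

var errWouldBlock = errors.New("would block")

// sendAll keeps writing until all of data has been accepted, mirroring the
// SendMsg loop above: notification is only set up after the first short
// write, and the caller blocks with a deadline between later retries.
func sendAll(send func([]byte) (int, error), ready <-chan struct{}, timeout time.Duration, data []byte) (int, error) {
	total := 0
	var deadline <-chan time.Time
	for {
		n, err := send(data[total:])
		total += n
		switch {
		case err == nil && total == len(data):
			return total, nil
		case err != nil && !errors.Is(err, errWouldBlock):
			return total, err
		}
		if deadline == nil {
			// Register for notification, but retry once immediately in case
			// space became available before registration.
			deadline = time.After(timeout)
			continue
		}
		select {
		case <-ready:
		case <-deadline:
			return total, errors.New("timed out")
		}
	}
}

func main() {
	// A fake endpoint that accepts at most 4 bytes per call and reports a
	// short write with errWouldBlock.
	var sink []byte
	send := func(p []byte) (int, error) {
		n := len(p)
		if n > 4 {
			n = 4
		}
		sink = append(sink, p[:n]...)
		if n < len(p) {
			return n, errWouldBlock
		}
		return n, nil
	}

	ready := make(chan struct{}, 1)
	go func() {
		for {
			time.Sleep(time.Millisecond)
			select {
			case ready <- struct{}{}:
			default:
			}
		}
	}()

	n, err := sendAll(send, ready, time.Second, []byte("hello, waiter queue"))
	fmt.Println(n, err, string(sink))
}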
- return int(total), syserr.FromError(err) + continue } + return int(total), syserr.TranslateNetstackError(err) } } diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 6f70b02fc..24922c400 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -129,20 +128,20 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return 0, syserror.EOPNOTSUPP } - f := &ioSequencePayload{ctx: ctx, src: src} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) - if err == tcpip.ErrWouldBlock { + r := src.Reader(ctx) + n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) + if _, ok := err.(*tcpip.ErrWouldBlock); ok { return 0, syserror.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } - if int64(n) < src.NumBytes() { - return int64(n), syserror.ErrWouldBlock + if n < src.NumBytes() { + return n, syserror.ErrWouldBlock } - return int64(n), nil + return n, nil } // Accept implements the linux syscall accept(2) for sockets backed by @@ -155,7 +154,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block } ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { - if terr != tcpip.ErrWouldBlock || !blocking { + if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } @@ -262,13 +261,3 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index c847ff1c7..2515dda80 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -118,7 +118,7 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* // Create the endpoint. var ep tcpip.Endpoint - var e *tcpip.Error + var e tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go index 0af805246..ba1cc79e9 100644 --- a/pkg/sentry/socket/netstack/provider_vfs2.go +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -62,7 +62,7 @@ func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int // Create the endpoint. 
var ep tcpip.Endpoint - var e *tcpip.Error + var e tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 9f7aca305..fc5b823b0 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -48,7 +48,7 @@ type ConnectingEndpoint interface { Type() linux.SockType // GetLocalAddress returns the bound path. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Locker protects the following methods. While locked, only the holder of // the lock can change the return value of the protected methods. @@ -128,7 +128,7 @@ func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProv idGenerator: uid, stype: stype, } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, nil, nil) return ep } @@ -173,7 +173,7 @@ func NewExternal(ctx context.Context, stype linux.SockType, uid UniqueIDProvider idGenerator: uid, stype: stype, } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, nil, nil) return ep } @@ -296,7 +296,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn idGenerator: e.idGenerator, stype: e.stype, } - ne.ops.InitHandler(ne) + ne.ops.InitHandler(ne, nil, nil) readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} readQueue.InitRefs() diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 0813ad87d..20fa8b874 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -44,7 +44,7 @@ func NewConnectionless(ctx context.Context) Endpoint { q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} q.InitRefs() ep.receiver = &queueReceiver{readQueue: &q} - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, nil, nil) return ep } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 099a56281..70227bbd2 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -169,32 +169,32 @@ type Endpoint interface { Type() linux.SockType // GetLocalAddress returns the address to which the endpoint is bound. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // GetRemoteAddress returns the address to which the endpoint is // connected. - GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) // SetSockOpt sets a socket option. - SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error + SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt sets a socket option for simple cases when a value has // the int type. - SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error + SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt gets a socket option. - GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error + GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt gets a socket option for simple cases when a return // value has the int type. 
- GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 // LastError clears and returns the last error reported by the endpoint. - LastError() *tcpip.Error + LastError() tcpip.Error // SocketOptions returns the structure which contains all the socket // level options. @@ -580,7 +580,7 @@ type ConnectedEndpoint interface { Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Send sends a single message. This method does not block. // @@ -640,7 +640,7 @@ type connectedEndpoint struct { Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Type implements Endpoint.Type. Type() linux.SockType @@ -655,7 +655,7 @@ func (e *connectedEndpoint) Passcred() bool { } // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. -func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return e.endpoint.GetLocalAddress() } @@ -836,13 +836,12 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess } // SetSockOpt sets a socket option. -func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { +func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return nil } -func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { +func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { switch opt { - case tcpip.SendBufferSizeOption: case tcpip.ReceiveBufferSizeOption: default: log.Warningf("Unsupported socket option: %d", opt) @@ -850,19 +849,40 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { return nil } -func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { +// IsUnixSocket implements tcpip.SocketOptionsHandler.IsUnixSocket. +func (e *baseEndpoint) IsUnixSocket() bool { + return true +} + +// GetSendBufferSize implements tcpip.SocketOptionsHandler.GetSendBufferSize. 
+func (e *baseEndpoint) GetSendBufferSize() (int64, tcpip.Error) { + e.Lock() + defer e.Unlock() + + if !e.Connected() { + return -1, &tcpip.ErrNotConnected{} + } + + v := e.connected.SendMaxQueueSize() + if v < 0 { + return -1, &tcpip.ErrQueueSizeNotSupported{} + } + return v, nil +} + +func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.Lock() if !e.Connected() { e.Unlock() - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v = int(e.receiver.RecvQueuedSize()) e.Unlock() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return v, nil @@ -870,25 +890,12 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.Lock() if !e.Connected() { e.Unlock() - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v := e.connected.SendQueuedSize() e.Unlock() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported - } - return int(v), nil - - case tcpip.SendBufferSizeOption: - e.Lock() - if !e.Connected() { - e.Unlock() - return -1, tcpip.ErrNotConnected - } - v := e.connected.SendMaxQueueSize() - e.Unlock() - if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return int(v), nil @@ -896,29 +903,29 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.Lock() if e.receiver == nil { e.Unlock() - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v := e.receiver.RecvMaxQueueSize() e.Unlock() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return int(v), nil default: log.Warningf("Unsupported socket option: %d", opt) - return -1, tcpip.ErrUnknownProtocolOption + return -1, &tcpip.ErrUnknownProtocolOption{} } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { +func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { log.Warningf("Unsupported socket option: %T", opt) - return tcpip.ErrUnknownProtocolOption + return &tcpip.ErrUnknownProtocolOption{} } // LastError implements Endpoint.LastError. -func (*baseEndpoint) LastError() *tcpip.Error { +func (*baseEndpoint) LastError() tcpip.Error { return nil } @@ -958,7 +965,7 @@ func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { } // GetLocalAddress returns the bound path. -func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.Lock() defer e.Unlock() return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil @@ -966,14 +973,14 @@ func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { // GetRemoteAddress returns the local address of the connected endpoint (if // available). -func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { +func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.Lock() c := e.connected e.Unlock() if c != nil { return c.GetLocalAddress() } - return tcpip.FullAddress{}, tcpip.ErrNotConnected + return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Release implements BoundEndpoint.Release. 
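GetSendBufferSize above reports the connected peer's maximum queue size, returning ErrNotConnected or ErrQueueSizeNotSupported when that is not possible: SO_SNDBUF for a Unix socket is effectively whatever the peer will queue. A toy standalone version of that logic; endpoint, queue and the error values are made up for illustration.

package main

import (
	"errors"
	"fmt"
	"sync"
)

var (
	errNotConnected          = errors.New("not connected")
	errQueueSizeNotSupported = errors.New("queue size not supported")
)

// queue is a stand-in for the connected peer's message queue.
type queue struct {
	maxQueueSize int64 // negative means the queue does not track a limit
}

func (q *queue) SendMaxQueueSize() int64 { return q.maxQueueSize }

// endpoint models the locking and error handling of GetSendBufferSize above.
type endpoint struct {
	mu        sync.Mutex
	connected *queue // nil until connected
}

func (e *endpoint) GetSendBufferSize() (int64, error) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if e.connected == nil {
		return -1, errNotConnected
	}
	if v := e.connected.SendMaxQueueSize(); v >= 0 {
		return v, nil
	}
	return -1, errQueueSizeNotSupported
}

func main() {
	var e endpoint
	if _, err := e.GetSendBufferSize(); err != nil {
		fmt.Println(err) // not connected
	}
	e.connected = &queue{maxQueueSize: 16 * 1024}
	fmt.Println(e.GetSendBufferSize()) // 16384 <nil>
}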
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 6c4ec55b2..32e5d2304 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -496,6 +496,9 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b return int(n), syserr.FromError(err) } + // Only send SCM Rights once (see net/unix/af_unix.c:unix_stream_sendmsg). + w.Control.Rights = nil + // We'll have to block. Register for notification and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(nil) diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 27f705bb2..a7d4d7f1f 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -331,16 +330,6 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} - // providerVFS2 is a unix domain socket provider for VFS2. type providerVFS2 struct{} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index a72df62f6..62d1e8f8b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -503,8 +503,8 @@ var ARM64 = &kernel.SyscallTable{ 72: syscalls.Supported("pselect", Pselect), 73: syscalls.Supported("ppoll", Ppoll), 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 76: syscalls.Supported("splice", Splice), 77: syscalls.Supported("tee", Tee), 78: syscalls.Supported("readlinkat", Readlinkat), 79: syscalls.Supported("fstatat", Fstatat), diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index c33571f43..a6253626e 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1014,12 +1014,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. 
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -1030,12 +1030,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -2167,24 +2167,24 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.LOCK_EX: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. 
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 7dd9ef857..1a31898e8 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -123,6 +123,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: + // allowed + default: + return 0, nil, syserror.EBADF + } + } + switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: minfd := args[2].Int() @@ -205,8 +214,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err - case linux.F_SETLK, linux.F_SETLKW: - return 0, nil, posixLock(t, args, file, cmd) + case linux.F_SETLK: + return 0, nil, posixLock(t, args, file, false /* blocking */) + case linux.F_SETLKW: + return 0, nil, posixLock(t, args, file, true /* blocking */) + case linux.F_GETLK: + return 0, nil, posixTestLock(t, args, file) case linux.F_GETSIG: a := file.AsyncHandler() if a == nil { @@ -292,7 +305,49 @@ func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, } } -func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { +func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription) error { + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := flock.CopyIn(t, flockAddr); err != nil { + return err + } + var typ lock.LockType + switch flock.Type { + case linux.F_RDLCK: + typ = lock.ReadLock + case linux.F_WRLCK: + typ = lock.WriteLock + default: + return syserror.EINVAL + } + r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) + if err != nil { + return err + } + + newFlock, err := file.TestPOSIX(t, t.FDTable(), typ, r) + if err != nil { + return err + } + newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID) + if _, err = newFlock.CopyOut(t, flockAddr); err != nil { + return err + } + return nil +} + +// translatePID translates a pid from one namespace to another. Note that this +// may race with task termination/creation, in which case the original task +// corresponding to pid may no longer exist. This is used to implement the +// F_GETLK fcntl, which has the same potential race in Linux as well (i.e., +// there is no synchronization between retrieving the lock PID and translating +// it). See fs/locks.c:posix_lock_to_flock. +func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 { + return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid)))) +} + +func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, blocking bool) error { // Copy in the lock request. 
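posixLock and posixTestLock both begin by copying in a struct flock. For F_GETLK, handled by posixTestLock above, the description of a conflicting lock is written back out with l_pid translated into the caller's PID namespace by translatePID. The round trip from userspace looks roughly like this (path and range are illustrative):

package main

import (
    "fmt"
    "os"

    "golang.org/x/sys/unix"
)

func main() {
    f, err := os.OpenFile("/tmp/lockfile", os.O_RDWR, 0)
    if err != nil {
        panic(err)
    }
    defer f.Close()

    // Ask whether a write lock on bytes [0, 10) could be taken right now.
    lk := unix.Flock_t{Type: unix.F_WRLCK, Whence: unix.SEEK_SET, Start: 0, Len: 10}
    if err := unix.FcntlFlock(f.Fd(), unix.F_GETLK, &lk); err != nil {
        panic(err)
    }
    if lk.Type == unix.F_UNLCK {
        fmt.Println("no conflicting lock")
    } else {
        // lk now describes one conflicting lock; Pid is reported in the
        // caller's PID namespace (or 0 if the owning task is not visible
        // in that namespace).
        fmt.Printf("conflicting lock held by pid %d, start %d, length %d\n", lk.Pid, lk.Start, lk.Len)
    }
}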
flockAddr := args[2].Pointer() var flock linux.Flock @@ -301,25 +356,30 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip } var blocker lock.Blocker - if cmd == linux.F_SETLKW { + if blocking { blocker = t } + r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) + if err != nil { + return err + } + switch flock.Type { case linux.F_RDLCK: if !file.IsReadable() { return syserror.EBADF } - return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.ReadLock, r, blocker) case linux.F_WRLCK: if !file.IsWritable() { return syserror.EBADF } - return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.WriteLock, r, blocker) case linux.F_UNLCK: - return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence) + return file.UnlockPOSIX(t, t.FDTable(), r) default: return syserror.EINVAL @@ -344,6 +404,10 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + return 0, nil, syserror.EBADF + } + // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { return 0, nil, syserror.ESPIPE diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 20c264fef..c7c3fed57 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -32,6 +32,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + return 0, nil, syserror.EBADF + } + // Handle ioctls that apply to all FDs. 
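The O_PATH guards added to Fadvise64 and Ioctl above follow Linux's rule that an O_PATH descriptor identifies a file but grants no I/O, so both calls fail with EBADF. A quick probe of that behavior (device path and ioctl request are illustrative):

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

func main() {
    // An O_PATH descriptor references the file but carries no read/write access.
    fd, err := unix.Open("/dev/null", unix.O_PATH, 0)
    if err != nil {
        panic(err)
    }
    defer unix.Close(fd)

    // ioctl(2) is rejected outright on such descriptors.
    _, err = unix.IoctlGetInt(fd, unix.FIONREAD)
    fmt.Println(err == unix.EBADF) // expected: true

    // So is fadvise64(2).
    err = unix.Fadvise(fd, 0, 0, unix.FADV_NORMAL)
    fmt.Println(err == unix.EBADF) // expected: true
}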
switch args[1].Int() { case linux.FIONCLEX: diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index b910b5a74..d1452a04d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -44,11 +44,11 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch operation { case linux.LOCK_EX: - if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil { + if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, blocker); err != nil { return 0, nil, err } case linux.LOCK_SH: - if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil { + if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, blocker); err != nil { return 0, nil, err } case linux.LOCK_UN: diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index b77b29dcc..c7417840f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -93,17 +93,11 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } @@ -134,9 +128,6 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt } file.EventUnregister(&w) - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return total, err } @@ -257,17 +248,11 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } @@ -297,10 +282,6 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of } } file.EventUnregister(&w) - - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return total, err } @@ -363,17 +344,11 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return n, err } @@ -403,10 +378,6 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op } } file.EventUnregister(&w) - - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return total, err } @@ -527,17 +498,11 @@ func Pwritev2(t 
*kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } @@ -567,10 +532,6 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o } } file.EventUnregister(&w) - - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return total, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 1ee37e5a8..903169dc2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -220,7 +220,6 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys length := args[3].Int64() file := t.GetFileVFS2(fd) - if file == nil { return 0, nil, syserror.EBADF } @@ -229,23 +228,18 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if !file.IsWritable() { return 0, nil, syserror.EBADF } - if mode != 0 { return 0, nil, syserror.ENOTSUP } - if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL } size := offset + length - if size < 0 { return 0, nil, syserror.EFBIG } - limit := limits.FromContext(t).Get(limits.FileSize).Cur - if uint64(size) >= limit { t.SendSignal(&arch.SignalInfo{ Signo: int32(linux.SIGXFSZ), @@ -254,12 +248,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EFBIG } - if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil { - return 0, nil, err - } - - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - return 0, nil, nil + return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) } // Utime implements Linux syscall utime(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 8bb763a47..19e175203 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -170,13 +170,6 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } } - if n != 0 { - // On Linux, inotify behavior is not very consistent with splice(2). We try - // our best to emulate Linux for very basic calls to splice, where for some - // reason, events are generated for output files, but not input files. - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } - // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) @@ -256,8 +249,6 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo } if n != 0 { - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - // If a partial write is completed, the error is dropped. Log it here. 
if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { log.Debugf("tee completed a partial write with error: %v", err) @@ -449,9 +440,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } if total != 0 { - inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { // If a partial write is completed, the error is dropped. Log it here. log.Debugf("sendfile completed a partial write with error: %v", err) diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 6e9b599e2..1f8a5878c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -36,6 +36,10 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + return 0, nil, syserror.EBADF + } + return 0, nil, file.SyncFS(t) } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index a3868bf16..df4990854 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -83,6 +83,7 @@ go_library( "mount.go", "mount_namespace_refs.go", "mount_unsafe.go", + "opath.go", "options.go", "pathname.go", "permissions.go", diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5321ac80a..f612a71b2 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -161,6 +161,13 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { fd.FileDescriptionRefs.DecRef(func() { + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if fd.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + fd.Dentry().InotifyWithParent(ctx, ev, 0, PathEvent) + // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls @@ -448,16 +455,19 @@ type FileDescriptionImpl interface { RemoveXattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. - LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error + LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block lock.Blocker) error // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. - LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error + LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. - UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error + + // TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. + TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) } // Dirent holds the information contained in struct linux_dirent64.
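The DecRef hook above emits IN_CLOSE_WRITE or IN_CLOSE_NOWRITE when the last reference to a file description is dropped, and the IN_ACCESS/IN_MODIFY events that were removed from the individual syscall paths earlier in this change reappear centrally in FileDescription.Read/PRead/Write/PWrite in the next hunks. A small watcher that observes all of these events (the watched path is illustrative):

package main

import (
    "fmt"
    "unsafe"

    "golang.org/x/sys/unix"
)

func main() {
    ifd, err := unix.InotifyInit1(unix.IN_CLOEXEC)
    if err != nil {
        panic(err)
    }
    defer unix.Close(ifd)

    mask := uint32(unix.IN_ACCESS | unix.IN_MODIFY | unix.IN_CLOSE_WRITE | unix.IN_CLOSE_NOWRITE)
    if _, err := unix.InotifyAddWatch(ifd, "/tmp/watched.txt", mask); err != nil {
        panic(err)
    }

    // A read(2) elsewhere produces IN_ACCESS, a write(2) IN_MODIFY, and the
    // final close IN_CLOSE_WRITE or IN_CLOSE_NOWRITE.
    buf := make([]byte, 4096)
    n, err := unix.Read(ifd, buf)
    if err != nil {
        panic(err)
    }
    for off := 0; off < n; {
        ev := (*unix.InotifyEvent)(unsafe.Pointer(&buf[off]))
        fmt.Printf("event mask %#x\n", ev.Mask)
        off += unix.SizeofInotifyEvent + int(ev.Len)
    }
}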
@@ -556,7 +566,11 @@ func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length ui if !fd.IsWritable() { return syserror.EBADF } - return fd.impl.Allocate(ctx, mode, offset, length) + if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil { + return err + } + fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) + return nil } // Readiness implements waiter.Waitable.Readiness. @@ -592,6 +606,9 @@ func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of } start := fsmetric.StartReadWait() n, err := fd.impl.PRead(ctx, dst, offset, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) + } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err @@ -604,6 +621,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt } start := fsmetric.StartReadWait() n, err := fd.impl.Read(ctx, dst, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) + } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err @@ -619,7 +639,11 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o if !fd.writable { return 0, syserror.EBADF } - return fd.impl.PWrite(ctx, src, offset, opts) + n, err := fd.impl.PWrite(ctx, src, offset, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) + } + return n, err } // Write is similar to PWrite, but does not specify an offset. @@ -627,7 +651,11 @@ func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, op if !fd.writable { return 0, syserror.EBADF } - return fd.impl.Write(ctx, src, opts) + n, err := fd.impl.Write(ctx, src, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) + } + return n, err } // IterDirents invokes cb on each entry in the directory represented by fd. If @@ -791,9 +819,9 @@ func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) e } // LockBSD tries to acquire a BSD-style advisory file lock. -func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { +func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, blocker lock.Blocker) error { atomic.StoreUint32(&fd.usedLockBSD, 1) - return fd.impl.LockBSD(ctx, fd, lockType, blocker) + return fd.impl.LockBSD(ctx, fd, ownerPID, lockType, blocker) } // UnlockBSD releases a BSD-style advisory file lock. @@ -802,13 +830,45 @@ func (fd *FileDescription) UnlockBSD(ctx context.Context) error { } // LockPOSIX locks a POSIX-style file range lock. -func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { - return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX unlocks a POSIX-style file range lock. 
-func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { - return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error { + return fd.impl.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX returns information about whether the specified lock can be held. +func (fd *FileDescription) TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) { + return fd.impl.TestPOSIX(ctx, uid, t, r) +} + +// ComputeLockRange computes the range of a file lock based on the given values. +func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, length uint64, whence int16) (lock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return lock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return lock.LockRange{}, err + } + off = int64(stat.Size) + default: + return lock.LockRange{}, syserror.EINVAL + } + + return lock.ComputeRange(int64(start), int64(length), off) } // A FileAsync sends signals to its owner when w is ready for IO. This is only diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 48ca9de44..eb7d2fd3b 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -419,8 +419,8 @@ func (fd *LockFD) Locks() *FileLocks { } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - return fd.locks.LockBSD(uid, t, block) +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(ctx, uid, ownerPID, t, block) } // UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. @@ -429,6 +429,21 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return nil } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + return fd.locks.LockPOSIX(ctx, uid, ownerPID, t, r, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.locks.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fd.locks.TestPOSIX(ctx, uid, t, r) +} + // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. // @@ -436,7 +451,7 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { type NoLockFD struct{} // LockBSD implements vfs.FileDescriptionImpl.LockBSD. 
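ComputeLockRange above resolves the whence/start/length triple from struct flock against a base offset, so the lock layer only ever sees absolute byte ranges. A sketch of how the underlying fslock.ComputeRange helper is expected to behave, with made-up values (a zero length is conventionally treated as "through end of file"); exact output depends on the package version:

package main

import (
    "fmt"

    fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
)

func main() {
    // A SEEK_CUR request taken with the file offset at 100: start 20 and
    // length 30 should resolve to the absolute range [120, 150).
    rng, err := fslock.ComputeRange(20, 30, 100)
    fmt.Println(rng, err)

    // Length 0: from the resolved start to the end of the file.
    rng, err = fslock.ComputeRange(0, 0, 100)
    fmt.Println(rng, err)
}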
-func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return syserror.ENOLCK } @@ -446,11 +461,16 @@ func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return syserror.ENOLCK } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return syserror.ENOLCK } + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return linux.Flock{}, syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go index 1ff202f2a..cbe4d8c2d 100644 --- a/pkg/sentry/vfs/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -39,8 +39,8 @@ type FileLocks struct { } // LockBSD tries to acquire a BSD-style lock on the entire file. -func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { +func (fl *FileLocks) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerID int32, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, ownerID, t, fslock.LockRange{0, fslock.LockEOF}, block) { return nil } @@ -61,12 +61,8 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { } // LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - rng, err := computeRange(ctx, fd, start, length, whence) - if err != nil { - return err - } - if fl.posix.LockRegion(uid, t, rng, block) { +func (fl *FileLocks) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + if fl.posix.LockRegion(uid, ownerPID, t, r, block) { return nil } @@ -82,37 +78,12 @@ func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fsl // // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. 
-func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { - rng, err := computeRange(ctx, fd, start, length, whence) - if err != nil { - return err - } - fl.posix.UnlockRegion(uid, rng) +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + fl.posix.UnlockRegion(uid, r) return nil } -func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { - var off int64 - switch whence { - case linux.SEEK_SET: - off = 0 - case linux.SEEK_CUR: - // Note that Linux does not hold any mutexes while retrieving the file - // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. - curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) - if err != nil { - return fslock.LockRange{}, err - } - off = curOff - case linux.SEEK_END: - stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) - if err != nil { - return fslock.LockRange{}, err - } - off = int64(stat.Size) - default: - return fslock.LockRange{}, syserror.EINVAL - } - - return fslock.ComputeRange(int64(start), int64(length), off) +// TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. +func (fl *FileLocks) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fl.posix.TestRegion(ctx, uid, t, r), nil } diff --git a/pkg/sentry/vfs/opath.go b/pkg/sentry/vfs/opath.go new file mode 100644 index 000000000..39fbac987 --- /dev/null +++ b/pkg/sentry/vfs/opath.go @@ -0,0 +1,139 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// opathFD implements vfs.FileDescriptionImpl for a file description opened with O_PATH. +// +// +stateify savable +type opathFD struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + NoLockFD +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *opathFD) Release(context.Context) { + // noop +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EBADF +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. 
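opathFD stubs out every I/O entry point with EBADF while still answering Stat and StatFS (further down) by re-resolving the path, matching how Linux treats O_PATH descriptors. The guest-visible behavior is roughly the following; the path is illustrative:

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

func main() {
    fd, err := unix.Open("/etc/hostname", unix.O_PATH, 0)
    if err != nil {
        panic(err)
    }
    defer unix.Close(fd)

    // Metadata still works...
    var st unix.Stat_t
    if err := unix.Fstat(fd, &st); err != nil {
        panic(err)
    }
    fmt.Printf("size=%d mode=%#o\n", st.Size, st.Mode)

    // ...but reading through the descriptor does not.
    buf := make([]byte, 16)
    _, err = unix.Read(fd, buf)
    fmt.Println(err == unix.EBADF) // expected: true
}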
+func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.EBADF +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return syserror.EBADF +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.EBADF +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return syserror.EBADF +} + +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return nil, syserror.EBADF +} + +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { + return "", syserror.EBADF +} + +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { + return syserror.EBADF +} + +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { + return syserror.EBADF +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *opathFD) Sync(ctx context.Context) error { + return syserror.EBADF +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { + return syserror.EBADF +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + vfsObj := fd.vfsfd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vfsfd.vd, + Start: fd.vfsfd.vd, + }) + stat, err := fd.vfsfd.vd.mount.fs.impl.StatAt(ctx, rp, opts) + vfsObj.putResolvingPath(ctx, rp) + return stat, err +} + +// StatFS returns metadata for the filesystem containing the file represented +// by fd. +func (fd *opathFD) StatFS(ctx context.Context) (linux.Statfs, error) { + vfsObj := fd.vfsfd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vfsfd.vd, + Start: fd.vfsfd.vd, + }) + statfs, err := fd.vfsfd.vd.mount.fs.impl.StatFSAt(ctx, rp) + vfsObj.putResolvingPath(ctx, rp) + return statfs, err +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index bc79e5ecc..c9907843c 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -129,7 +129,7 @@ type OpenOptions struct { // // FilesystemImpls are responsible for implementing the following flags: // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, - // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and + // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_SYNC, O_TMPFILE, and // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and // O_NOFOLLOW. 
VFS users are responsible for handling O_CLOEXEC, since file // descriptors are mostly outside the scope of VFS. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 6fd1bb0b2..0aff2dd92 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -425,6 +425,18 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential rp.mustBeDir = true rp.mustBeDirOrig = true } + if opts.Flags&linux.O_PATH != 0 { + vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) + if err != nil { + return nil, err + } + fd := &opathFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}); err != nil { + return nil, err + } + vd.DecRef(ctx) + return &fd.vfsfd, err + } for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { |