182 files changed, 5397 insertions, 2566 deletions
diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index cddf5504b..cb272aef6 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -45,6 +45,14 @@ steps: - make BAZEL_OPTIONS=--config=cross-aarch64 artifacts/aarch64 - make release + # Images tests. + - <<: *common + label: ":docker: Images (x86_64)" + command: make ARCH=x86_64 load-all-images + - <<: *common + label: ":docker: Images (aarch64)" + command: make ARCH=aarch64 load-all-images + # Basic unit tests. - <<: *common label: ":test_tube: Unit tests" @@ -192,7 +200,7 @@ steps: command: make benchmark-platforms BENCHMARKS_SUITE=node BENCHMARKS_TARGETS=test/benchmarks/network:node_test - <<: *benchmarks label: ":redis: Redis benchmarks" - command: make benchmark-platforms BENCHMARKS_SUITE=redis BENCHMARKS_TARGETS=test/benchmarks/database:redis_test + command: make benchmark-platforms BENCHMARKS_SUITE=redis BENCHMARKS_TARGETS=test/benchmarks/database:redis_test BENCHMARKS_OPTIONS=-test.benchtime=15s - <<: *benchmarks label: ":ruby: Ruby benchmarks" command: make benchmark-platforms BENCHMARKS_SUITE=ruby BENCHMARKS_TARGETS=test/benchmarks/network:ruby_test diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md index 69033357c..8e5721ad1 100644 --- a/g3doc/user_guide/FAQ.md +++ b/g3doc/user_guide/FAQ.md @@ -137,9 +137,16 @@ sandbox isolation. There are a few different workarounds you can try: * Use IPs instead of container names. * Use [Kubernetes][k8s]. Container name lookup works fine in Kubernetes. +### I'm getting an error like `dial unix /run/containerd/s/09e4...8cff: connect: connection refused: unknown` {#shim-connect} + +This error may happen when using `gvisor-containerd-shim` with a `containerd` +that does not contain the fix for [CVE-2020-15257]. The resolve the issue, +update containerd to 1.3.9 or 1.4.3 (or newer versions respectively). + [security-model]: /docs/architecture_guide/security/ [host-net]: /docs/user_guide/networking/#network-passthrough [debugging]: /docs/user_guide/debugging/ [filesystem]: /docs/user_guide/filesystem/ [docker]: /docs/user_guide/quick_start/docker/ [k8s]: /docs/user_guide/quick_start/kubernetes/ +[CVE-2020-15257]: https://github.com/containerd/containerd/security/advisories/GHSA-36xw-fx78-c5r4 diff --git a/g3doc/user_guide/containerd/configuration.md b/g3doc/user_guide/containerd/configuration.md index 4f5e721be..011af3b10 100644 --- a/g3doc/user_guide/containerd/configuration.md +++ b/g3doc/user_guide/containerd/configuration.md @@ -1,8 +1,8 @@ # Containerd Advanced Configuration This document describes how to configure runtime options for -`containerd-shim-runsc-v1`. This follows the -[Containerd Quick Start](./quick_start.md) and requires containerd 1.2 or later. +`containerd-shim-runsc-v1`. You can find the installation instructions and +minimal requirements in [Containerd Quick Start](./quick_start.md). ## Shim Configuration @@ -47,27 +47,6 @@ When you are done, restart containerd to pick up the changes. sudo systemctl restart containerd ``` -### Containerd 1.2 - -For containerd 1.2, the config file is not configurable. It should be named -`config.toml` and located in the runtime root. By default, this is -`/run/containerd/runsc`. - -### Example: Enable the KVM platform - -gVisor enables the use of a number of platforms. This example shows how to -configure `containerd-shim-runsc-v1` to use gvisor with the KVM platform. - -Find out more about platform in the -[Platforms Guide](../../architecture_guide/platforms.md). 
- -```shell -cat <<EOF | sudo tee /etc/containerd/runsc.toml -[runsc_config] - platform = "kvm" -EOF -``` - ## Debug When `shim_debug` is enabled in `/etc/containerd/config.toml`, containerd will diff --git a/g3doc/user_guide/containerd/quick_start.md b/g3doc/user_guide/containerd/quick_start.md index a98fe5c4a..132d80927 100644 --- a/g3doc/user_guide/containerd/quick_start.md +++ b/g3doc/user_guide/containerd/quick_start.md @@ -1,7 +1,7 @@ # Containerd Quick Start This document describes how to use `containerd-shim-runsc-v1` with the -containerd runtime handler support on `containerd` 1.2 or later. +containerd runtime handler support on `containerd`. > ⚠️ NOTE: If you are using Kubernetes and set up your cluster using kubeadm you > may run into issues. See the [FAQ](../FAQ.md#runtime-handler) for details. @@ -11,7 +11,8 @@ containerd runtime handler support on `containerd` 1.2 or later. - **runsc** and **containerd-shim-runsc-v1**: See the [installation guide](/docs/user_guide/install/). - **containerd**: See the [containerd website](https://containerd.io/) for - information on how to install containerd. + information on how to install containerd. **Minimal version supported: 1.3.9 + or 1.4.3.** ## Configure containerd diff --git a/images/basic/fsstress/Dockerfile b/images/basic/fsstress/Dockerfile.x86_64 index 21b86065a..21b86065a 100644 --- a/images/basic/fsstress/Dockerfile +++ b/images/basic/fsstress/Dockerfile.x86_64 diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index d1ca56370..b84d7c048 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -21,6 +21,7 @@ const ( F_SETFD = 2 F_GETFL = 3 F_SETFL = 4 + F_GETLK = 5 F_SETLK = 6 F_SETLKW = 7 F_SETOWN = 8 @@ -55,7 +56,7 @@ type Flock struct { _ [4]byte Start int64 Len int64 - Pid int32 + PID int32 _ [4]byte } diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go index fdfe31417..6f3d72e83 100644 --- a/pkg/coverage/coverage.go +++ b/pkg/coverage/coverage.go @@ -26,7 +26,6 @@ import ( "fmt" "io" "sort" - "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/sync" @@ -69,12 +68,18 @@ var globalData struct { } // ClearCoverageData clears existing coverage data. +// +//go:norace func ClearCoverageData() { coverageMu.Lock() defer coverageMu.Unlock() + + // We do not use atomic operations while reading/writing to the counters, + // which would drastically degrade performance. Slight discrepancies due to + // racing is okay for the purposes of kcov. for _, counters := range coverdata.Cover.Counters { for index := 0; index < len(counters); index++ { - atomic.StoreUint32(&counters[index], 0) + counters[index] = 0 } } } @@ -114,6 +119,8 @@ var coveragePool = sync.Pool{ // ensure that each event is only reported once. Due to the limitations of Go // coverage tools, we reset the global coverage data every time this function is // run. +// +//go:norace func ConsumeCoverageData(w io.Writer) int { InitCoverageData() @@ -125,11 +132,14 @@ func ConsumeCoverageData(w io.Writer) int { for fileNum, file := range globalData.files { counters := coverdata.Cover.Counters[file] for index := 0; index < len(counters); index++ { - if atomic.LoadUint32(&counters[index]) == 0 { + // We do not use atomic operations while reading/writing to the counters, + // which would drastically degrade performance. Slight discrepancies due to + // racing is okay for the purposes of kcov. + if counters[index] == 0 { continue } // Non-zero coverage data found; consume it and report as a PC. 
- atomic.StoreUint32(&counters[index], 0) + counters[index] = 0 pc := globalData.syntheticPCs[fileNum][index] usermem.ByteOrder.PutUint64(pcBuffer[:], pc) n, err := w.Write(pcBuffer[:]) diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index af8230a7d..387f713aa 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -34,7 +34,9 @@ func (p *pipeOperations) beforeSave() { } else if p.flags.Write { file, err := p.opener.NonBlockingOpen(context.Background(), fs.PermMask{Write: true}) if err != nil { - panic(fs.ErrSaveRejection{fmt.Errorf("write-only pipe end cannot be re-opened as %v: %v", p, err)}) + panic(&fs.ErrSaveRejection{ + Err: fmt.Errorf("write-only pipe end cannot be re-opened as %#v: %w", p, err), + }) } file.Close() } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index a020da53b..44587bb37 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -144,7 +144,7 @@ type ErrSaveRejection struct { } // Error returns a sensible description of the save rejection error. -func (e ErrSaveRejection) Error() string { +func (e *ErrSaveRejection) Error() string { return "save rejected due to unsupported file system state: " + e.Err.Error() } diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index a3402e343..141e3c27f 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -67,7 +67,9 @@ func (i *inodeFileState) beforeSave() { if i.sattr.Type == fs.RegularFile { uattr, err := i.unstableAttr(&dummyClockContext{context.Background()}) if err != nil { - panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.s.inodeMappings[i.sattr.InodeID], err)}) + panic(&fs.ErrSaveRejection{ + Err: fmt.Errorf("failed to get unstable atttribute of %s: %w", i.s.inodeMappings[i.sattr.InodeID], err), + }) } i.savedUAttr = &uattr } diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index ae3331737..4d3b216d8 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -41,6 +41,8 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/sync", "//pkg/waiter", diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 8a5d9c7eb..57686ce07 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -54,6 +54,8 @@ import ( "math" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) @@ -83,6 +85,17 @@ const ( // offset 0 to LockEOF. const LockEOF = math.MaxUint64 +// OwnerInfo describes the owner of a lock. +// +// TODO(gvisor.dev/issue/5264): We may need to add other fields in the future +// (e.g., Linux's file_lock.fl_flags to support open file-descriptor locks). +// +// +stateify savable +type OwnerInfo struct { + // PID is the process ID of the lock owner. + PID int32 +} + // Lock is a regional file lock. It consists of either a single writer // or a set of readers. // @@ -92,14 +105,20 @@ const LockEOF = math.MaxUint64 // A Lock may be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. // +// Accesses to Lock are synchronized through the Locks object to which it +// belongs. +// // +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. - // If len(Readers) > 0 then HasWriter must be false. 
- Readers map[UniqueID]bool + // If len(Readers) > 0 then Writer must be nil. + Readers map[UniqueID]OwnerInfo // Writer holds the writer unique ID. It's nil if there are no writers. Writer UniqueID + + // WriterInfo describes the writer. It is only meaningful if Writer != nil. + WriterInfo OwnerInfo } // Locks is a thread-safe wrapper around a LockSet. @@ -135,14 +154,14 @@ const ( // acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. // Blocker is the interface used to provide blocking behavior, passing a nil Blocker // will result in non-blocking behavior. -func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool { +func (l *Locks) LockRegion(uid UniqueID, ownerPID int32, t LockType, r LockRange, block Blocker) bool { for { l.mu.Lock() // Blocking locks must run in a loop because we'll be woken up whenever an unlock event // happens for this lock. We will then attempt to take the lock again and if it fails // continue blocking. - res := l.locks.lock(uid, t, r) + res := l.locks.lock(uid, ownerPID, t, r) if !res && block != nil { e, ch := waiter.NewChannelEntry(nil) l.blockedQueue.EventRegister(&e, EventMaskAll) @@ -161,6 +180,14 @@ func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) } } +// LockRegionVFS1 is a wrapper around LockRegion for VFS1, which does not implement +// F_GETLK (and does not care about storing PIDs as a result). +// +// TODO(gvisor.dev/issue/1624): Delete. +func (l *Locks) LockRegionVFS1(uid UniqueID, t LockType, r LockRange, block Blocker) bool { + return l.LockRegion(uid, 0 /* ownerPID */, t, r, block) +} + // UnlockRegion attempts to release a lock for the uid on a region of a file. // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. @@ -175,13 +202,14 @@ func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { // makeLock returns a new typed Lock that has either uid as its only reader // or uid as its only writer. -func makeLock(uid UniqueID, t LockType) Lock { - value := Lock{Readers: make(map[UniqueID]bool)} +func makeLock(uid UniqueID, ownerPID int32, t LockType) Lock { + value := Lock{Readers: make(map[UniqueID]OwnerInfo)} switch t { case ReadLock: - value.Readers[uid] = true + value.Readers[uid] = OwnerInfo{PID: ownerPID} case WriteLock: value.Writer = uid + value.WriterInfo = OwnerInfo{PID: ownerPID} default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) } @@ -190,17 +218,20 @@ func makeLock(uid UniqueID, t LockType) Lock { // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { - return l.Writer == uid || l.Readers[uid] + if _, ok := l.Readers[uid]; ok { + return true + } + return l.Writer == uid } // lock sets uid as a holder of a typed lock on Lock. // // Preconditions: canLock is true for the range containing this Lock. -func (l *Lock) lock(uid UniqueID, t LockType) { +func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType) { switch t { case ReadLock: // If we are already a reader, then this is a no-op. - if l.Readers[uid] { + if _, ok := l.Readers[uid]; ok { return } // We cannot downgrade a write lock to a read lock unless the @@ -210,11 +241,11 @@ func (l *Lock) lock(uid UniqueID, t LockType) { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. 
- l.Readers = make(map[UniqueID]bool) + l.Readers = make(map[UniqueID]OwnerInfo) // Ensure that there is no longer a writer. l.Writer = nil } - l.Readers[uid] = true + l.Readers[uid] = OwnerInfo{PID: ownerPID} return case WriteLock: // If we are already the writer, then this is a no-op. @@ -228,13 +259,14 @@ func (l *Lock) lock(uid UniqueID, t LockType) { if readers != 1 { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) } - if !l.Readers[uid] { + if _, ok := l.Readers[uid]; !ok { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) } } // Ensure that there is only a writer. - l.Readers = make(map[UniqueID]bool) + l.Readers = make(map[UniqueID]OwnerInfo) l.Writer = uid + l.WriterInfo = OwnerInfo{PID: ownerPID} default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) } @@ -247,7 +279,7 @@ func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { // Get our starting point. seg := l.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { - // Note that we don't care about overruning the end of the + // Note that we don't care about overrunning the end of the // last segment because if everything checks out we'll just // split the last segment. if !check(seg.Value()) { @@ -281,7 +313,7 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { if value.Writer == nil { // Then this uid can only take a write lock if this is a private // upgrade, meaning that the only reader is uid. - return len(value.Readers) == 1 && value.Readers[uid] + return value.isOnlyReader(uid) } // If the uid is already a writer on this region, then // adding a write lock would be a no-op. @@ -292,11 +324,19 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { } } +func (l *Lock) isOnlyReader(uid UniqueID) bool { + if len(l.Readers) != 1 { + return false + } + _, ok := l.Readers[uid] + return ok +} + // lock returns true if uid took a lock of type t on the entire range of // LockRange. // // Preconditions: r.Start <= r.End (will panic otherwise). -func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { +func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange) bool { if r.Start > r.End { panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) } @@ -317,7 +357,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { seg, gap := l.Find(r.Start) if gap.Ok() { // Fill in the gap and get the next segment to modify. - seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment() + seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t)).NextSegment() } else if seg.Start() < r.Start { // Get our first segment to modify. _, seg = l.Split(seg, r.Start) @@ -331,12 +371,12 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() - value.lock(uid, t) + value.lock(uid, ownerPID, t) // Fill subsequent gaps. gap = seg.NextGap() if gr := gap.Range().Intersect(r); gr.Length() > 0 { - seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment() + seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t)).NextSegment() } else { seg = gap.NextSegment() } @@ -380,7 +420,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { // only ever be one writer and no readers, then this // lock should always be removed from the set. 
remove = true - } else if value.Readers[uid] { + } else if _, ok := value.Readers[uid]; ok { // If uid is the last reader, then just remove the entire // segment. if len(value.Readers) == 1 { @@ -390,7 +430,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { // affecting any other segment's readers. To do // this, we need to make a copy of the Readers map // and not add this uid. - newValue := Lock{Readers: make(map[UniqueID]bool)} + newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range value.Readers { if k != uid { newValue.Readers[k] = v @@ -451,3 +491,72 @@ func ComputeRange(start, length, offset int64) (LockRange, error) { // Offset is guaranteed to be positive at this point. return LockRange{Start: uint64(offset), End: end}, nil } + +// TestRegion checks whether the lock holder identified by uid can hold a lock +// of type t on range r. It returns a Flock struct representing this +// information as the F_GETLK fcntl does. +// +// Note that the PID returned in the flock structure is relative to the root PID +// namespace. It needs to be converted to the caller's PID namespace before +// returning to userspace. +// +// TODO(gvisor.dev/issue/5264): we don't support OFD locks through fcntl, which +// would return a struct with pid = -1. +func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange) linux.Flock { + f := linux.Flock{Type: linux.F_UNLCK} + switch t { + case ReadLock: + l.testRegion(r, func(lock Lock, start, length uint64) bool { + if lock.Writer == nil || lock.Writer == uid { + return true + } + f.Type = linux.F_WRLCK + f.PID = lock.WriterInfo.PID + f.Start = int64(start) + f.Len = int64(length) + return false + }) + case WriteLock: + l.testRegion(r, func(lock Lock, start, length uint64) bool { + if lock.Writer == nil { + for k, v := range lock.Readers { + if k != uid { + // Stop at the first conflict detected. + f.Type = linux.F_RDLCK + f.PID = v.PID + f.Start = int64(start) + f.Len = int64(length) + return false + } + } + return true + } + if lock.Writer == uid { + return true + } + f.Type = linux.F_WRLCK + f.PID = lock.WriterInfo.PID + f.Start = int64(start) + f.Len = int64(length) + return false + }) + default: + panic(fmt.Sprintf("TestRegion: invalid lock type %d", t)) + } + return f +} + +func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) { + l.mu.Lock() + defer l.mu.Unlock() + + seg := l.locks.LowerBoundSegment(r.Start) + for seg.Ok() && seg.Start() < r.End { + lock := seg.Value() + if !check(lock, seg.Start(), seg.End()-seg.Start()) { + // Stop at the first conflict detected. + return + } + seg = seg.NextSegment() + } +} diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 50a16e662..dcc17c0dc 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -40,7 +40,7 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) return Lock{}, false } for k := range val1.Readers { - if !val2.Readers[k] { + if _, ok := val2.Readers[k]; !ok { return Lock{}, false } } @@ -53,11 +53,12 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) { // Copy the segment so that split segments don't contain map references // to other segments. 
- val0 := Lock{Readers: make(map[UniqueID]bool)} + val0 := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range val.Readers { val0.Readers[k] = v } val0.Writer = val.Writer + val0.WriterInfo = val.WriterInfo return val, val0 } diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index fad90984b..9878c04e1 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -30,12 +30,12 @@ func equals(e0, e1 []entry) bool { } for i := range e0 { for k := range e0[i].Lock.Readers { - if !e1[i].Lock.Readers[k] { + if _, ok := e1[i].Lock.Readers[k]; !ok { return false } } for k := range e1[i].Lock.Readers { - if !e0[i].Lock.Readers[k] { + if _, ok := e0[i].Lock.Readers[k]; !ok { return false } } @@ -90,15 +90,15 @@ func TestCanLock(t *testing.T) { // 0 1024 2048 3072 4096 l := fill([]entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 3: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 3: OwnerInfo{}}}, LockRange: LockRange{2048, 3072}, }, { @@ -220,7 +220,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -266,7 +266,7 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, { @@ -283,7 +283,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -302,7 +302,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -333,7 +333,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -351,7 +351,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -366,11 +366,11 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -383,7 +383,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -398,15 +398,15 @@ func TestSetLock(t *testing.T) { // 0 4096 8192 
max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, 8192}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{8192, LockEOF}, }, }, @@ -419,7 +419,7 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -434,7 +434,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -447,7 +447,7 @@ func TestSetLock(t *testing.T) { // 0 4096 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, }, @@ -467,11 +467,11 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -484,7 +484,7 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -499,15 +499,15 @@ func TestSetLock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -520,15 +520,15 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -543,7 +543,7 @@ func TestSetLock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { @@ -551,7 +551,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: 
OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -564,15 +564,15 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -587,7 +587,7 @@ func TestSetLock(t *testing.T) { // 0 1024 3072 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { @@ -595,7 +595,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{1024, 3072}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -608,11 +608,11 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, }, @@ -634,15 +634,15 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{1024, 2048}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{2048, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -676,7 +676,7 @@ func TestSetLock(t *testing.T) { l := fill(test.before) r := LockRange{Start: test.start, End: test.end} - success := l.lock(test.uid, test.lockType, r) + success := l.lock(test.uid, 0 /* ownerPID */, test.lockType, r) var got []entry for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { got = append(got, entry{ @@ -739,7 +739,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -752,7 +752,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -765,7 +765,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -797,7 +797,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 
LockEOF}, }, }, @@ -810,7 +810,7 @@ func TestUnlock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -849,7 +849,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -862,7 +862,7 @@ func TestUnlock(t *testing.T) { // 0 4096 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}}}, LockRange: LockRange{0, 4096}, }, }, @@ -901,7 +901,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { @@ -909,7 +909,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -922,11 +922,11 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -939,7 +939,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, LockEOF}, }, }, @@ -952,15 +952,15 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 after: []entry{ { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{1024, 4096}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}, 2: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -977,7 +977,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -994,7 +994,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 8}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -1011,7 +1011,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -1028,11 +1028,11 @@ func 
TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{1: OwnerInfo{}}}, LockRange: LockRange{4096, 8192}, }, { - Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + Lock: Lock{Readers: map[UniqueID]OwnerInfo{0: OwnerInfo{}, 1: OwnerInfo{}}}, LockRange: LockRange{8192, LockEOF}, }, }, diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index e91fa26a4..b44117f40 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -194,16 +193,6 @@ func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions return mfd.inode.Stat(ctx, fs, opts) } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence) -} - // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { switch cmd { diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go index 70c68cf0a..a0c5b5af5 100644 --- a/pkg/sentry/fsimpl/devpts/replica.go +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -189,13 +188,3 @@ func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOption fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() return rfd.inode.Stat(ctx, fs, opts) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 0ad79b381..512b70ede 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -311,13 +310,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 4a5539b37..5ad9befcd 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -154,13 +153,3 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // TODO(b/134676337): Implement mmap(2). return syserror.ENODEV } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 3b5927702..9da01cba3 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -90,6 +90,7 @@ type createSyntheticOpts struct { // * d.isDir(). // * d does not already contain a child with the given name. 
func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { + now := d.fs.clock.Now().Nanoseconds() child := &dentry{ refs: 1, // held by d fs: d.fs, @@ -98,6 +99,10 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { uid: uint32(opts.kuid), gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary + atime: now, + mtime: now, + ctime: now, + btime: now, readFD: -1, writeFD: -1, mmapFD: -1, diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 91d5dc174..8f95473b6 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -36,16 +36,26 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // Snapshot current syncable dentries and special file FDs. + fs.renameMu.RLock() fs.syncMu.Lock() ds := make([]*dentry, 0, len(fs.syncableDentries)) for d := range fs.syncableDentries { + // It's safe to use IncRef here even though fs.syncableDentries doesn't + // hold references since we hold fs.renameMu. Note that we can't use + // TryIncRef since cached dentries at zero references should still be + // synced. d.IncRef() ds = append(ds, d) } + fs.renameMu.RUnlock() sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { - sffd.vfsfd.IncRef() - sffds = append(sffds, sffd) + // As above, fs.specialFileFDs doesn't hold references. However, unlike + // dentries, an FD that has reached zero references can't be + // resurrected, so we can use TryIncRef. + if sffd.vfsfd.TryIncRef() { + sffds = append(sffds, sffd) + } } fs.syncMu.Unlock() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 3cdb1e659..98f7bc52f 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1944,22 +1944,22 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("File lock using gofer file handled internally.") }) - return fd.LockFD.LockBSD(ctx, uid, t, block) + return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("Range lock using gofer file handled internally.") }) - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) + return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.Locks().UnlockPOSIX(ctx, uid, r) } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 36a3f6810..05f11fbd5 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -810,13 +809,3 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) { func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index f5c596fec..0f9e20a84 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -370,13 +369,3 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return syserror.ERESTARTSYS } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 485504995..65054b0ea 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -136,13 +135,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. 
return syserror.EPERM } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index f8dae22f8..e55111af0 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -275,13 +274,3 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 3492409b2..082fa6504 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -42,7 +42,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -820,13 +819,3 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 75be6129f..fdae163d1 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -518,16 +517,6 @@ func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { // Release implements vfs.FileDescriptionImpl.Release. func (fd *memFD) Release(context.Context) {} -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *memFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *memFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} - // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable @@ -1110,13 +1099,3 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err func (fd *namespaceFD) Release(ctx context.Context) { fd.inode.DecRef(ctx) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 7ee6227a9..d6f076cd6 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -393,7 +393,7 @@ func TestProcSelf(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } - collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ + collector := s.WithTemporaryContext(task.AsyncContext()).ListDirents(&vfs.PathOperation{ Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc/self/"), @@ -491,11 +491,11 @@ func TestTree(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } // Add file to populate /proc/[pid]/fd and fdinfo directories. 
- task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{}) + task.FDTable().NewFDVFS2(task.AsyncContext(), 0, file, kernel.FDFlags{}) tasks = append(tasks, task) } - ctx := tasks[0] + ctx := tasks[0].AsyncContext() fd, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(s.Ctx), diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 146c7fdfe..4393cc13b 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -140,35 +140,35 @@ func TestLocks(t *testing.T) { uid1 := 123 uid2 := 456 - if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil { + if err := fd.Impl().LockBSD(ctx, uid1, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 1, End: 2}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.WriteLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } - if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil { + if err := fd.Impl().UnlockPOSIX(ctx, uid1, lock.LockRange{Start: 0, End: 1}); err != nil { t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 0c9c639d3..b32c54e20 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,7 +36,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" 
"gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -797,16 +796,6 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { return nil } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} - // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all // filesystem state is in-memory. func (*fileDescription) Sync(context.Context) error { diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index a5171b5ad..8645078a0 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -660,7 +660,6 @@ func (d *dentry) readlink(ctx context.Context) (string, error) { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - vfs.LockFD // d is the corresponding dentry to the fileDescription. d *dentry @@ -1104,14 +1103,29 @@ func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, op return 0, syserror.EROFS } +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { + return fd.lowerFD.LockBSD(ctx, ownerPID, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *fileDescription) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return fd.lowerFD.UnlockBSD(ctx) +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block) +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + return fd.lowerFD.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence) +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.lowerFD.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. 
+func (fd *fileDescription) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fd.lowerFD.TestPOSIX(ctx, uid, t, r) } // FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index 30d8b4355..798d6a9bd 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -66,7 +66,7 @@ func dentryFromFD(t *testing.T, fd *vfs.FileDescription) *dentry { // newVerityRoot creates a new verity mount, and returns the root. The // underlying file system is tmpfs. If the error is not nil, then cleanup // should be called when the root is no longer needed. -func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) { +func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, context.Context, error) { t.Helper() k, err := testutil.Boot() if err != nil { @@ -119,7 +119,7 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, root.DecRef(ctx) mntns.DecRef(ctx) }) - return vfsObj, root, task, nil + return vfsObj, root, task.AsyncContext(), nil } // openVerityAt opens a verity file. diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0ee60569c..8a5b11d40 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -240,7 +240,6 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 7aba31587..a6afabb1c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -153,20 +153,12 @@ func (f *FDTable) drop(ctx context.Context, file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { - // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the - // entire file. - err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) + // Release any POSIX lock possibly held by the FDTable. + err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) if err != nil && err != syserror.ENOLCK { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } - // Generate inotify events. - ev := uint32(linux.IN_CLOSE_NOWRITE) - if file.IsWritable() { - ev = linux.IN_CLOSE_WRITE - } - file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) - // Drop the table's reference. file.DecRef(ctx) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 303ae8056..ef4e934a1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -593,8 +593,8 @@ func (k *Kernel) flushWritesToFiles(ctx context.Context) error { // Wrap this error in ErrSaveRejection so that it will trigger a save // error, rather than a panic. This also allows us to distinguish Fsync // errors from state file errors in state.Save. 
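// Minimal sketch of the range used by the fd_table.go hunk below: the old
// convention "(start=0, length=0, whence=SEEK_SET) means the entire file" is
// replaced with an explicit range, where lock.LockEOF is the end-of-file
// sentinel in pkg/sentry/fs/lock, so {0, LockEOF} spans every byte.
package example

import "gvisor.dev/gvisor/pkg/sentry/fs/lock"

// wholeFile returns a range covering the whole file.
func wholeFile() lock.LockRange {
	return lock.LockRange{Start: 0, End: lock.LockEOF}
}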
- return fs.ErrSaveRejection{ - Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + return &fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %w", name, err), } } return nil diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 2c32d017d..71daa9f4b 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -27,7 +27,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index d5a91730d..3b6336e94 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -441,13 +440,3 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } return n, err } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 9419f2e95..ecbe8f920 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -69,7 +69,7 @@ func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time. // syserror.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { +func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline ktime.Time) error { if !haveDeadline { return t.block(C, nil) } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index fdadb52c0..e9da99067 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -216,7 +216,7 @@ func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { return t } -// ThreadGroupWithID returns the thread group lead by the task with thread ID +// ThreadGroupWithID returns the thread group led by the task with thread ID // tid in PID namespace ns. If no task has that TID, or if the task with that // TID is not a thread group leader, ThreadGroupWithID returns nil. func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { @@ -292,6 +292,11 @@ func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { return ns.userns } +// Root returns the root PID namespace of ns. +func (ns *PIDNamespace) Root() *PIDNamespace { + return ns.owner.Root +} + // A threadGroupNode defines the relationship between a thread group and the // rest of the system. 
Conceptually, threadGroupNode is data belonging to the // owning TaskSet, as if TaskSet contained a field `nodes @@ -485,3 +490,8 @@ func (t *Task) Parent() *Task { func (t *Task) ThreadID() ThreadID { return t.tg.pidns.IDOfTask(t) } + +// TGIDInRoot returns t's TGID in the root PID namespace. +func (t *Task) TGIDInRoot() ThreadID { + return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) +} diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 6efe5102b..73bfbea49 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -17,7 +17,6 @@ package mm import ( "bytes" "fmt" - "strings" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" @@ -165,12 +164,12 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI } if s != "" { // Per linux, we pad until the 74th character. - if pad := 73 - lineLen; pad > 0 { - b.WriteString(strings.Repeat(" ", pad)) + for pad := 73 - lineLen; pad > 0; pad-- { + b.WriteByte(' ') } b.WriteString(s) } - b.WriteString("\n") + b.WriteByte('\n') } // ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 675efdc7c..69e37330b 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1055,18 +1055,11 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { mm.activeMu.Lock() defer mm.activeMu.Unlock() - // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range() - // is analogous to our mm.invalidateLocked(ar, true, true). We inline this - // here, with the special case that we synchronously decommit - // uniquely-owned (non-copy-on-write) pages for private anonymous vma, - // which is the common case for MADV_DONTNEED. Invalidating these pmas, and - // allowing them to be reallocated when touched again, increases pma - // fragmentation, which may significantly reduce performance for - // non-vectored I/O implementations. Also, decommitting synchronously - // ensures that Decommit immediately reduces host memory usage. + // This is invalidateLocked(invalidatePrivate=true, invalidateShared=true), + // with the additional wrinkle that we must refuse to invalidate pmas under + // mlocked vmas. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - mf := mm.mfp.MemoryFile() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { @@ -1081,20 +1074,8 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { } } for pseg.Ok() && pseg.Start() < vsegAR.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) { - psegAR := pseg.Range().Intersect(ar) - if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue - } - // If an error occurs, fall through to the general - // invalidation case below. - } - } pseg = mm.pmas.Isolate(pseg, vsegAR) - pma = pseg.ValuePtr() + pma := pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. 
AddressSpace mappings must be removed before diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index b6ebe29d6..a8e6f172b 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 5b868216d..17f59ba1f 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -377,10 +377,8 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] switch level { case linux.SOL_IP: switch name { - case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR: + case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR: optlen = sizeofInt32 - case linux.IP_PKTINFO: - optlen = linux.SizeOfControlMessageIPPacketInfo } case linux.SOL_IPV6: switch name { diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 9a2cac40b..f82c7c224 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -144,16 +143,6 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return int64(n), err } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} - type socketProviderVFS2 struct { family int } diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go index 70c561cce..2f913787b 100644 --- a/pkg/sentry/socket/netfilter/ipv4.go +++ b/pkg/sentry/socket/netfilter/ipv4.go @@ -15,7 +15,6 @@ package netfilter import ( - "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -220,18 +219,6 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } - n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) - if n == -1 { - n = len(iptip.OutputInterface) - } - ifname := string(iptip.OutputInterface[:n]) - - n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) - if n == -1 { - n = len(iptip.OutputInterfaceMask) - } - ifnameMask := string(iptip.OutputInterfaceMask[:n]) - return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // A Protocol value of 0 indicates all protocols match. 
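// Hedged sketch for the hostinet socket.go hunk above: IP_PKTINFO's option
// value is sized as a 4-byte int because setsockopt(IPPROTO_IP, IP_PKTINFO, ...)
// takes an integer boolean; the larger in_pktinfo structure only appears as a
// control message on recvmsg, not as the setsockopt payload. Linux-only.
package example

import "syscall"

// enablePktInfo turns on IP_PKTINFO delivery on a host socket.
func enablePktInfo(fd int) error {
	return syscall.SetsockoptInt(fd, syscall.IPPROTO_IP, syscall.IP_PKTINFO, 1)
}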
@@ -242,8 +229,11 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { Src: tcpip.Address(iptip.Src[:]), SrcMask: tcpip.Address(iptip.SrcMask[:]), SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, - OutputInterface: ifname, - OutputInterfaceMask: ifnameMask, + InputInterface: string(trimNullBytes(iptip.InputInterface[:])), + InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), + InputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_IN != 0, + OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), + OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, }, nil } @@ -254,12 +244,12 @@ func containsUnsupportedFields4(iptip linux.IPTIP) bool { // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag + // - InputInterface, InputInterfaceMask and its inverse. // - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInterface = [linux.IFNAMSIZ]byte{} + const flagMask = 0 // Disable any supported inverse flags. - inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) - return iptip.InputInterface != emptyInterface || - iptip.InputInterfaceMask != emptyInterface || - iptip.Flags != 0 || + const inverseMask = linux.IPT_INV_DSTIP | linux.IPT_INV_SRCIP | + linux.IPT_INV_VIA_IN | linux.IPT_INV_VIA_OUT + return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 } diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go index 5dbb604f0..263d9d3b5 100644 --- a/pkg/sentry/socket/netfilter/ipv6.go +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -15,7 +15,6 @@ package netfilter import ( - "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -223,18 +222,6 @@ func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } - n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) - if n == -1 { - n = len(iptip.OutputInterface) - } - ifname := string(iptip.OutputInterface[:n]) - - n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) - if n == -1 { - n = len(iptip.OutputInterfaceMask) - } - ifnameMask := string(iptip.OutputInterfaceMask[:n]) - return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // In ip6tables a flag controls whether to check the protocol. @@ -245,8 +232,11 @@ func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { Src: tcpip.Address(iptip.Src[:]), SrcMask: tcpip.Address(iptip.SrcMask[:]), SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0, - OutputInterface: ifname, - OutputInterfaceMask: ifnameMask, + InputInterface: string(trimNullBytes(iptip.InputInterface[:])), + InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), + InputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_IN != 0, + OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), + OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0, }, nil } @@ -257,14 +247,13 @@ func containsUnsupportedFields6(iptip linux.IP6TIP) bool { // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag + // - InputInterface, InputInterfaceMask and its inverse. 
// - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInterface = [linux.IFNAMSIZ]byte{} - flagMask := uint8(linux.IP6T_F_PROTO) + const flagMask = linux.IP6T_F_PROTO // Disable any supported inverse flags. - inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT) - return iptip.InputInterface != emptyInterface || - iptip.InputInterfaceMask != emptyInterface || - iptip.Flags&^flagMask != 0 || + const inverseMask = linux.IP6T_INV_DSTIP | linux.IP6T_INV_SRCIP | + linux.IP6T_INV_VIA_IN | linux.IP6T_INV_VIA_OUT + return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 || iptip.TOS != 0 } diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 26bd1abd4..7ae18b2a3 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,6 +17,7 @@ package netfilter import ( + "bytes" "errors" "fmt" @@ -393,3 +394,11 @@ func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkP rev.Revision = maxSupported return rev, nil } + +func trimNullBytes(b []byte) []byte { + n := bytes.IndexByte(b, 0) + if n == -1 { + n = len(b) + } + return b[:n] +} diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 69d13745e..176fa6116 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -112,7 +112,7 @@ func (*OwnerMatcher) Name() string { } // Match implements Matcher.Match. -func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { +func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // Support only for OUTPUT chain. // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. if hook != stack.Output { diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 352c51390..2740697b3 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -96,7 +96,7 @@ func (*TCPMatcher) Name() string { } // Match implements Matcher.Match. -func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { +func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. switch pkt.NetworkProtocolNumber { diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index c88d8268d..466d5395d 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -93,7 +93,7 @@ func (*UDPMatcher) Name() string { } // Match implements Matcher.Match. -func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { +func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. 
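// Worked example of the trimNullBytes helper added above in netfilter.go:
// iptables passes interface names as fixed-size, NUL-padded
// [linux.IFNAMSIZ]byte arrays, and the filter wants the string up to the
// first NUL. The helper is copied verbatim below; the 16-byte array stands in
// for linux.IFNAMSIZ.
package main

import (
	"bytes"
	"fmt"
)

func trimNullBytes(b []byte) []byte {
	n := bytes.IndexByte(b, 0)
	if n == -1 {
		n = len(b)
	}
	return b[:n]
}

func main() {
	var iface [16]byte
	copy(iface[:], "eth0")
	fmt.Println(string(trimNullBytes(iface[:]))) // prints "eth0"
}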
switch pkt.NetworkProtocolNumber { diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 1f926aa91..9313e1167 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -22,7 +22,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index 461d524e5..842036764 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" @@ -149,13 +148,3 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 22abca120..915134b41 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -42,7 +41,6 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 22e128b96..7065a0e46 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -19,7 +19,7 @@ // be used to expose certain endpoints to the sentry while leaving others out, // for example, TCP endpoints and Unix-domain endpoints. // -// Lock ordering: netstack => mm: ioSequencePayload copies user memory inside +// Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during // this operation. 
package netstack @@ -55,7 +55,6 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" @@ -194,7 +193,6 @@ var Metrics = tcpip.Stats{ RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), - OutgoingRequestNetworkUnreachableErrors: mustCreateMetric("/netstack/arp/outgoing_requests_network_unreachable", "Number of failed attempts to send an ARP request with a network unreachable error."), OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), @@ -440,45 +438,10 @@ func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Write return int64(res.Count), nil } -// ioSequencePayload implements tcpip.Payload. -// -// t copies user memory bytes on demand based on the requested size. -type ioSequencePayload struct { - ctx context.Context - src usermem.IOSequence -} - -// FullPayload implements tcpip.Payloader.FullPayload -func (i *ioSequencePayload) FullPayload() ([]byte, *tcpip.Error) { - return i.Payload(int(i.src.NumBytes())) -} - -// Payload implements tcpip.Payloader.Payload. -func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) { - if max := int(i.src.NumBytes()); size > max { - size = max - } - v := buffer.NewView(size) - if _, err := i.src.CopyIn(i.ctx, v); err != nil { - // EOF can be returned only if src is a file and this means it - // is in a splice syscall and the error has to be ignored. - if err == io.EOF { - return v, nil - } - return nil, tcpip.ErrBadAddress - } - return v, nil -} - -// DropFirst drops the first n bytes from underlying src. -func (i *ioSequencePayload) DropFirst(n int) { - i.src = i.src.DropFirst(int(n)) -} - // Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - f := &ioSequencePayload{ctx: ctx, src: src} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + r := src.Reader(ctx) + n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if err == tcpip.ErrWouldBlock { return 0, syserror.ErrWouldBlock } @@ -486,69 +449,40 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO return 0, syserr.TranslateNetstackError(err).ToError() } - if int64(n) < src.NumBytes() { - return int64(n), syserror.ErrWouldBlock + if n < src.NumBytes() { + return n, syserror.ErrWouldBlock } - return int64(n), nil + return n, nil } -// readerPayload implements tcpip.Payloader. 
-// -// It allocates a view and reads from a reader on-demand, based on available -// capacity in the endpoint. -type readerPayload struct { - ctx context.Context - r io.Reader - count int64 - err error -} +var _ tcpip.Payloader = (*limitedPayloader)(nil) -// FullPayload implements tcpip.Payloader.FullPayload. -func (r *readerPayload) FullPayload() ([]byte, *tcpip.Error) { - return r.Payload(int(r.count)) +type limitedPayloader struct { + io.LimitedReader } -// Payload implements tcpip.Payloader.Payload. -func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) { - if size > int(r.count) { - size = int(r.count) - } - v := buffer.NewView(size) - n, err := r.r.Read(v) - if n > 0 { - // We ignore the error here. It may re-occur on subsequent - // reads, but for now we can enqueue some amount of data. - r.count -= int64(n) - return v[:n], nil - } - if err == syserror.ErrWouldBlock { - return nil, tcpip.ErrWouldBlock - } else if err != nil { - r.err = err // Save for propation. - return nil, tcpip.ErrBadAddress - } - - // There is no data and no error. Return an error, which will propagate - // r.err, which will be nil. This is the desired result: (0, nil). - return nil, tcpip.ErrBadAddress +func (l limitedPayloader) Len() int { + return int(l.N) } // ReadFrom implements fs.FileOperations.ReadFrom. func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { - f := &readerPayload{ctx: ctx, r: r, count: count} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{ + f := limitedPayloader{ + LimitedReader: io.LimitedReader{ + R: r, + N: count, + }, + } + n, err := s.Endpoint.Write(&f, tcpip.WriteOptions{ // Reads may be destructive but should be very fast, // so we can't release the lock while copying data. Atomic: true, }) - if err == tcpip.ErrWouldBlock { - return n, syserror.ErrWouldBlock - } else if err != nil { - return int64(n), f.err // Propagate error. + if err == tcpip.ErrBadBuffer { + err = nil } - - return int64(n), nil + return n, syserr.TranslateNetstackError(err).ToError() } // Readiness returns a mask of ready events for socket s. @@ -912,7 +846,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - size, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption) + size, err := ep.SocketOptions().GetSendBufferSize() if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -1681,8 +1615,16 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam return syserr.ErrInvalidArgument } + family, _, _ := s.Type() + // TODO(gvisor.dev/issue/5132): We currently do not support + // setting this option for unix sockets. 
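// Sketch of the limitedPayloader pattern introduced above, under the
// assumption that tcpip.Payloader is an io.Reader that also reports how many
// bytes remain: io.LimitedReader already tracks that count in its N field, so
// wrapping it and adding Len() is all that is needed.
package example

import (
	"io"
	"strings"
)

type payloader interface {
	io.Reader
	Len() int
}

type limitedPayloader struct {
	io.LimitedReader
}

func (l *limitedPayloader) Len() int {
	return int(l.N)
}

// newPayloader caps reads from r at count bytes, mirroring ReadFrom above.
func newPayloader(r io.Reader, count int64) payloader {
	return &limitedPayloader{io.LimitedReader{R: r, N: count}}
}

var _ payloader = newPayloader(strings.NewReader("payload"), 4)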
+ if family == linux.AF_UNIX { + return nil + } + v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.SendBufferSizeOption, int(v))) + ep.SocketOptions().SetSendBufferSize(int64(v), true) + return nil case linux.SO_RCVBUF: if len(optVal) < sizeOfInt32 { @@ -1814,10 +1756,6 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam var v linux.Linger binary.Unmarshal(optVal[:linux.SizeOfLinger], usermem.ByteOrder, &v) - if v != (linux.Linger{}) { - socket.SetSockOptEmitUnimplementedEvent(t, name) - } - ep.SocketOptions().SetLinger(tcpip.LingerOption{ Enabled: v.OnOff != 0, Timeout: time.Second * time.Duration(v.Linger), @@ -2840,45 +2778,46 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b EndOfRecord: flags&linux.MSG_EOR != 0, } - v := &ioSequencePayload{t, src} - n, err := s.Endpoint.Write(v, opts) - dontWait := flags&linux.MSG_DONTWAIT != 0 - if err == nil && (n >= v.src.NumBytes() || dontWait) { - // Complete write. - return int(n), nil - } - if err != nil && (err != tcpip.ErrWouldBlock || dontWait) { - return int(n), syserr.TranslateNetstackError(err) - } - - // We'll have to block. Register for notification and keep trying to - // send all the data. - e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventOut) - defer s.EventUnregister(&e) - - v.DropFirst(int(n)) - total := n + r := src.Reader(t) + var ( + total int64 + entry waiter.Entry + ch <-chan struct{} + ) for { - n, err = s.Endpoint.Write(v, opts) - v.DropFirst(int(n)) + n, err := s.Endpoint.Write(r, opts) total += n - - if err != nil && err != tcpip.ErrWouldBlock && total == 0 { - return 0, syserr.TranslateNetstackError(err) + if flags&linux.MSG_DONTWAIT != 0 { + return int(total), syserr.TranslateNetstackError(err) } - - if err == nil && v.src.NumBytes() == 0 || err != nil && err != tcpip.ErrWouldBlock { - return int(total), nil - } - - if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - return int(total), syserr.ErrTryAgain + switch err { + case nil: + if total == src.NumBytes() { + break + } + fallthrough + case tcpip.ErrWouldBlock: + if ch == nil { + // We'll have to block. Register for notification and keep trying to + // send all the data. + entry, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&entry, waiter.EventOut) + defer s.EventUnregister(&entry) + } else { + // Don't wait immediately after registration in case more data + // became available between when we last checked and when we setup + // the notification. + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return int(total), syserr.ErrTryAgain + } + // handleIOError will consume errors from t.Block if needed. + return int(total), syserr.FromError(err) + } } - // handleIOError will consume errors from t.Block if needed. 
- return int(total), syserr.FromError(err) + continue } + return int(total), syserr.TranslateNetstackError(err) } } diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 6f70b02fc..3bbdf552e 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -129,8 +128,8 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return 0, syserror.EOPNOTSUPP } - f := &ioSequencePayload{ctx: ctx, src: src} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + r := src.Reader(ctx) + n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if err == tcpip.ErrWouldBlock { return 0, syserror.ErrWouldBlock } @@ -138,11 +137,11 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return 0, syserr.TranslateNetstackError(err).ToError() } - if int64(n) < src.NumBytes() { - return int64(n), syserror.ErrWouldBlock + if n < src.NumBytes() { + return n, syserror.ErrWouldBlock } - return int64(n), nil + return n, nil } // Accept implements the linux syscall accept(2) for sockets backed by @@ -262,13 +261,3 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
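// Generic sketch (not gVisor code) of the retry loop shape used in the
// SendMsg rewrite above: the readiness notification is registered lazily on
// the first would-block error, and the loop re-checks once immediately after
// registering so data that became writable between the last attempt and the
// registration is not missed. Partial-write bookkeeping from the real loop is
// omitted; write, ready, and deadline are placeholders.
package example

import (
	"errors"
	"time"
)

var errWouldBlock = errors.New("would block")

func writeAll(write func() (int64, error), ready <-chan struct{}, deadline <-chan time.Time) (int64, error) {
	var total int64
	registered := false
	for {
		n, err := write()
		total += n
		switch {
		case err == nil:
			return total, nil
		case errors.Is(err, errWouldBlock):
			if !registered {
				registered = true // register for readiness notification here.
				continue          // re-check before blocking.
			}
			select {
			case <-ready:
			case <-deadline:
				return total, errors.New("timed out")
			}
		default:
			return total, err
		}
	}
}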
-func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 9f7aca305..b011082dc 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -128,7 +128,7 @@ func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProv idGenerator: uid, stype: stype, } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, nil, nil) return ep } @@ -173,7 +173,7 @@ func NewExternal(ctx context.Context, stype linux.SockType, uid UniqueIDProvider idGenerator: uid, stype: stype, } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, nil, nil) return ep } @@ -296,7 +296,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn idGenerator: e.idGenerator, stype: e.stype, } - ne.ops.InitHandler(ne) + ne.ops.InitHandler(ne, nil, nil) readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} readQueue.InitRefs() diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 0813ad87d..20fa8b874 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -44,7 +44,7 @@ func NewConnectionless(ctx context.Context) Endpoint { q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} q.InitRefs() ep.receiver = &queueReceiver{readQueue: &q} - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, nil, nil) return ep } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 099a56281..0e3889c6d 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -842,7 +842,6 @@ func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { - case tcpip.SendBufferSizeOption: case tcpip.ReceiveBufferSizeOption: default: log.Warningf("Unsupported socket option: %d", opt) @@ -850,6 +849,27 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { return nil } +// IsUnixSocket implements tcpip.SocketOptionsHandler.IsUnixSocket. +func (e *baseEndpoint) IsUnixSocket() bool { + return true +} + +// GetSendBufferSize implements tcpip.SocketOptionsHandler.GetSendBufferSize. 
+func (e *baseEndpoint) GetSendBufferSize() (int64, *tcpip.Error) { + e.Lock() + defer e.Unlock() + + if !e.Connected() { + return -1, tcpip.ErrNotConnected + } + + v := e.connected.SendMaxQueueSize() + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported + } + return v, nil +} + func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: @@ -879,19 +899,6 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } return int(v), nil - case tcpip.SendBufferSizeOption: - e.Lock() - if !e.Connected() { - e.Unlock() - return -1, tcpip.ErrNotConnected - } - v := e.connected.SendMaxQueueSize() - e.Unlock() - if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported - } - return int(v), nil - case tcpip.ReceiveBufferSizeOption: e.Lock() if e.receiver == nil { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 6c4ec55b2..32e5d2304 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -496,6 +496,9 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b return int(n), syserr.FromError(err) } + // Only send SCM Rights once (see net/unix/af_unix.c:unix_stream_sendmsg). + w.Control.Rights = nil + // We'll have to block. Register for notification and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(nil) diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 27f705bb2..a7d4d7f1f 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -331,16 +330,6 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) -} - // providerVFS2 is a unix domain socket provider for VFS2. 
type providerVFS2 struct{} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index a72df62f6..62d1e8f8b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -503,8 +503,8 @@ var ARM64 = &kernel.SyscallTable{ 72: syscalls.Supported("pselect", Pselect), 73: syscalls.Supported("ppoll", Ppoll), 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 76: syscalls.Supported("splice", Splice), 77: syscalls.Supported("tee", Tee), 78: syscalls.Supported("readlinkat", Readlinkat), 79: syscalls.Supported("fstatat", Fstatat), diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index c33571f43..a6253626e 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1014,12 +1014,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -1030,12 +1030,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -2167,24 +2167,24 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.LOCK_EX: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. 
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 7dd9ef857..e39f074f2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -205,8 +205,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err - case linux.F_SETLK, linux.F_SETLKW: - return 0, nil, posixLock(t, args, file, cmd) + case linux.F_SETLK: + return 0, nil, posixLock(t, args, file, false /* blocking */) + case linux.F_SETLKW: + return 0, nil, posixLock(t, args, file, true /* blocking */) + case linux.F_GETLK: + return 0, nil, posixTestLock(t, args, file) case linux.F_GETSIG: a := file.AsyncHandler() if a == nil { @@ -292,7 +296,49 @@ func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, } } -func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { +func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription) error { + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := flock.CopyIn(t, flockAddr); err != nil { + return err + } + var typ lock.LockType + switch flock.Type { + case linux.F_RDLCK: + typ = lock.ReadLock + case linux.F_WRLCK: + typ = lock.WriteLock + default: + return syserror.EINVAL + } + r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) + if err != nil { + return err + } + + newFlock, err := file.TestPOSIX(t, t.FDTable(), typ, r) + if err != nil { + return err + } + newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID) + if _, err = newFlock.CopyOut(t, flockAddr); err != nil { + return err + } + return nil +} + +// translatePID translates a pid from one namespace to another. Note that this +// may race with task termination/creation, in which case the original task +// corresponding to pid may no longer exist. This is used to implement the +// F_GETLK fcntl, which has the same potential race in Linux as well (i.e., +// there is no synchronization between retrieving the lock PID and translating +// it). See fs/locks.c:posix_lock_to_flock. +func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 { + return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid)))) +} + +func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, blocking bool) error { // Copy in the lock request. 
flockAddr := args[2].Pointer() var flock linux.Flock @@ -301,25 +347,30 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip } var blocker lock.Blocker - if cmd == linux.F_SETLKW { + if blocking { blocker = t } + r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) + if err != nil { + return err + } + switch flock.Type { case linux.F_RDLCK: if !file.IsReadable() { return syserror.EBADF } - return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.ReadLock, r, blocker) case linux.F_WRLCK: if !file.IsWritable() { return syserror.EBADF } - return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.WriteLock, r, blocker) case linux.F_UNLCK: - return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence) + return file.UnlockPOSIX(t, t.FDTable(), r) default: return syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index b910b5a74..d1452a04d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -44,11 +44,11 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch operation { case linux.LOCK_EX: - if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil { + if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, blocker); err != nil { return 0, nil, err } case linux.LOCK_SH: - if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil { + if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, blocker); err != nil { return 0, nil, err } case linux.LOCK_UN: diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index b77b29dcc..c7417840f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -93,17 +93,11 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } @@ -134,9 +128,6 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt } file.EventUnregister(&w) - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return total, err } @@ -257,17 +248,11 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } @@ -297,10 
+282,6 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of } } file.EventUnregister(&w) - - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return total, err } @@ -363,17 +344,11 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return n, err } @@ -403,10 +378,6 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op } } file.EventUnregister(&w) - - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return total, err } @@ -527,17 +498,11 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { - if n > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return n, err } @@ -567,10 +532,6 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o } } file.EventUnregister(&w) - - if total > 0 { - file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - } return total, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 1ee37e5a8..903169dc2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -220,7 +220,6 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys length := args[3].Int64() file := t.GetFileVFS2(fd) - if file == nil { return 0, nil, syserror.EBADF } @@ -229,23 +228,18 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if !file.IsWritable() { return 0, nil, syserror.EBADF } - if mode != 0 { return 0, nil, syserror.ENOTSUP } - if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL } size := offset + length - if size < 0 { return 0, nil, syserror.EFBIG } - limit := limits.FromContext(t).Get(limits.FileSize).Cur - if uint64(size) >= limit { t.SendSignal(&arch.SignalInfo{ Signo: int32(linux.SIGXFSZ), @@ -254,12 +248,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EFBIG } - if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil { - return 0, nil, err - } - - file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - return 0, nil, nil + return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) } // Utime implements Linux syscall utime(2). 
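// Why the IN_ACCESS/IN_MODIFY calls are disappearing from the syscall
// wrappers above: the file_description.go hunks below move event generation
// into vfs.FileDescription itself, so every path that goes through
// Read/Write/PRead/PWrite/Allocate (including splice and sendfile) reports
// inotify events once, in one place. A generic sketch of that wrapper shape,
// with hypothetical names:
package example

type impl interface {
	Write(p []byte) (int64, error)
}

type fileDescription struct {
	impl   impl
	notify func(event uint32) // stand-in for Dentry().InotifyWithParent.
}

const inModify = 0x2 // linux.IN_MODIFY

func (fd *fileDescription) Write(p []byte) (int64, error) {
	n, err := fd.impl.Write(p)
	if n > 0 {
		fd.notify(inModify) // emitted here so no individual caller has to.
	}
	return n, err
}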
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 8bb763a47..19e175203 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -170,13 +170,6 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } } - if n != 0 { - // On Linux, inotify behavior is not very consistent with splice(2). We try - // our best to emulate Linux for very basic calls to splice, where for some - // reason, events are generated for output files, but not input files. - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - } - // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) @@ -256,8 +249,6 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo } if n != 0 { - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - // If a partial write is completed, the error is dropped. Log it here. if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { log.Debugf("tee completed a partial write with error: %v", err) @@ -449,9 +440,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } if total != 0 { - inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { // If a partial write is completed, the error is dropped. Log it here. log.Debugf("sendfile completed a partial write with error: %v", err) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5321ac80a..f612a71b2 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -161,6 +161,13 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { fd.FileDescriptionRefs.DecRef(func() { + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if fd.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + fd.Dentry().InotifyWithParent(ctx, ev, 0, PathEvent) + // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls @@ -448,16 +455,19 @@ type FileDescriptionImpl interface { RemoveXattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. - LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error + LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block lock.Blocker) error // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. - LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error + LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. 
- UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, ComputeLockRange lock.LockRange) error + + // TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. + TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) } // Dirent holds the information contained in struct linux_dirent64. @@ -556,7 +566,11 @@ func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length ui if !fd.IsWritable() { return syserror.EBADF } - return fd.impl.Allocate(ctx, mode, offset, length) + if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil { + return err + } + fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) + return nil } // Readiness implements waiter.Waitable.Readiness. @@ -592,6 +606,9 @@ func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of } start := fsmetric.StartReadWait() n, err := fd.impl.PRead(ctx, dst, offset, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) + } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err @@ -604,6 +621,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt } start := fsmetric.StartReadWait() n, err := fd.impl.Read(ctx, dst, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) + } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err @@ -619,7 +639,11 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o if !fd.writable { return 0, syserror.EBADF } - return fd.impl.PWrite(ctx, src, offset, opts) + n, err := fd.impl.PWrite(ctx, src, offset, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) + } + return n, err } // Write is similar to PWrite, but does not specify an offset. @@ -627,7 +651,11 @@ func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, op if !fd.writable { return 0, syserror.EBADF } - return fd.impl.Write(ctx, src, opts) + n, err := fd.impl.Write(ctx, src, opts) + if n > 0 { + fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) + } + return n, err } // IterDirents invokes cb on each entry in the directory represented by fd. If @@ -791,9 +819,9 @@ func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) e } // LockBSD tries to acquire a BSD-style advisory file lock. -func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { +func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, blocker lock.Blocker) error { atomic.StoreUint32(&fd.usedLockBSD, 1) - return fd.impl.LockBSD(ctx, fd, lockType, blocker) + return fd.impl.LockBSD(ctx, fd, ownerPID, lockType, blocker) } // UnlockBSD releases a BSD-style advisory file lock. @@ -802,13 +830,45 @@ func (fd *FileDescription) UnlockBSD(ctx context.Context) error { } // LockPOSIX locks a POSIX-style file range lock. 
-func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { - return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX unlocks a POSIX-style file range lock. -func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { - return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error { + return fd.impl.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX returns information about whether the specified lock can be held. +func (fd *FileDescription) TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) { + return fd.impl.TestPOSIX(ctx, uid, t, r) +} + +// ComputeLockRange computes the range of a file lock based on the given values. +func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, length uint64, whence int16) (lock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return lock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return lock.LockRange{}, err + } + off = int64(stat.Size) + default: + return lock.LockRange{}, syserror.EINVAL + } + + return lock.ComputeRange(int64(start), int64(length), off) } // A FileAsync sends signals to its owner when w is ready for IO. This is only diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 48ca9de44..eb7d2fd3b 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -419,8 +419,8 @@ func (fd *LockFD) Locks() *FileLocks { } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - return fd.locks.LockBSD(uid, t, block) +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(ctx, uid, ownerPID, t, block) } // UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. @@ -429,6 +429,21 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return nil } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + return fd.locks.LockPOSIX(ctx, uid, ownerPID, t, r, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return fd.locks.UnlockPOSIX(ctx, uid, r) +} + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. 
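The ComputeLockRange helper added above resolves an fcntl-style (start, length, whence) triple against the file's current offset or size before any lock or unlock call. The following is a simplified, self-contained sketch of that computation with plain integers standing in for fslock.LockRange; the overflow handling and EOF sentinel of the real fslock.ComputeRange are only approximated here:

```go
package main

import (
	"errors"
	"fmt"
)

// Whence values with the same numeric meaning as SEEK_SET/SEEK_CUR/SEEK_END.
const (
	seekSet = 0
	seekCur = 1
	seekEnd = 2
)

// lockEOF is an illustrative "to end of file" sentinel.
const lockEOF = int64(^uint64(0) >> 1)

// computeLockRange resolves (start, length, whence) into an absolute
// [first, last) byte range, given the file's current offset and size. This is
// a sketch of the idea behind the helper, not the gVisor implementation.
func computeLockRange(start, length int64, whence int16, curOff, fileSize int64) (first, last int64, err error) {
	var base int64
	switch whence {
	case seekSet:
		base = 0
	case seekCur:
		base = curOff
	case seekEnd:
		base = fileSize
	default:
		return 0, 0, errors.New("invalid whence")
	}
	first = base + start
	if first < 0 {
		return 0, 0, errors.New("range starts before offset 0")
	}
	if length == 0 {
		// A zero length conventionally means "lock to end of file".
		return first, lockEOF, nil
	}
	if length < 0 {
		return 0, 0, errors.New("negative length not handled in this sketch")
	}
	return first, first + length, nil
}

func main() {
	// Lock 100 bytes starting 10 bytes past the current offset (500).
	first, last, err := computeLockRange(10, 100, seekCur, 500, 4096)
	fmt.Println(first, last, err) // 510 610 <nil>
}
```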
+func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fd.locks.TestPOSIX(ctx, uid, t, r) +} + // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. // @@ -436,7 +451,7 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { type NoLockFD struct{} // LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return syserror.ENOLCK } @@ -446,11 +461,16 @@ func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return syserror.ENOLCK } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return syserror.ENOLCK } + +// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return linux.Flock{}, syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go index 1ff202f2a..cbe4d8c2d 100644 --- a/pkg/sentry/vfs/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -39,8 +39,8 @@ type FileLocks struct { } // LockBSD tries to acquire a BSD-style lock on the entire file. -func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { +func (fl *FileLocks) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerID int32, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, ownerID, t, fslock.LockRange{0, fslock.LockEOF}, block) { return nil } @@ -61,12 +61,8 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { } // LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - rng, err := computeRange(ctx, fd, start, length, whence) - if err != nil { - return err - } - if fl.posix.LockRegion(uid, t, rng, block) { +func (fl *FileLocks) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + if fl.posix.LockRegion(uid, ownerPID, t, r, block) { return nil } @@ -82,37 +78,12 @@ func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fsl // // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. 
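LockFD and NoLockFD above give file types a default answer to the enlarged locking interface: embed one struct and every lock method is satisfied, with NoLockFD refusing all requests with ENOLCK. A small sketch of that embedding pattern with a toy interface (the names below are illustrative, not the vfs package's):

```go
package main

import (
	"errors"
	"fmt"
)

var errNoLock = errors.New("ENOLCK: no locks available")

// fileLocker is a toy stand-in for the lock-related subset of a file
// description interface.
type fileLocker interface {
	LockPOSIX(owner int32, first, last int64) error
	UnlockPOSIX(first, last int64) error
}

// noLockFD provides default implementations that refuse locking, in the same
// spirit as vfs.NoLockFD; types that cannot support advisory locks embed it
// and automatically satisfy fileLocker.
type noLockFD struct{}

func (noLockFD) LockPOSIX(int32, int64, int64) error { return errNoLock }
func (noLockFD) UnlockPOSIX(int64, int64) error      { return errNoLock }

// pipeFD pretends to be a file type with no lock support: embedding noLockFD
// is all it needs to do.
type pipeFD struct {
	noLockFD
}

func main() {
	var fd fileLocker = pipeFD{}
	fmt.Println(fd.LockPOSIX(1234, 0, 100)) // ENOLCK: no locks available
}
```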
-func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { - rng, err := computeRange(ctx, fd, start, length, whence) - if err != nil { - return err - } - fl.posix.UnlockRegion(uid, rng) +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + fl.posix.UnlockRegion(uid, r) return nil } -func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { - var off int64 - switch whence { - case linux.SEEK_SET: - off = 0 - case linux.SEEK_CUR: - // Note that Linux does not hold any mutexes while retrieving the file - // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. - curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) - if err != nil { - return fslock.LockRange{}, err - } - off = curOff - case linux.SEEK_END: - stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) - if err != nil { - return fslock.LockRange{}, err - } - off = int64(stat.Size) - default: - return fslock.LockRange{}, syserror.EINVAL - } - - return fslock.ComputeRange(int64(start), int64(length), off) +// TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. +func (fl *FileLocks) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return fl.posix.TestRegion(ctx, uid, t, r), nil } diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index fdeec12d3..7c7495c30 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -16,6 +16,7 @@ package gonet import ( + "bytes" "context" "errors" "io" @@ -354,8 +355,6 @@ func (c *TCPConn) Write(b []byte) (int, error) { default: } - v := buffer.NewViewFromBytes(b) - // We must handle two soft failure conditions simultaneously: // 1. Write may write nothing and return tcpip.ErrWouldBlock. // If this happens, we need to register for notifications if we have @@ -368,22 +367,23 @@ func (c *TCPConn) Write(b []byte) (int, error) { // There is no guarantee that all of the condition #1s will occur before // all of the condition #2s or visa-versa. var ( - err *tcpip.Error - nbytes int - reg bool - notifyCh chan struct{} + r bytes.Reader + nbytes int + entry waiter.Entry + ch <-chan struct{} ) - for nbytes < len(b) && (err == tcpip.ErrWouldBlock || err == nil) { - if err == tcpip.ErrWouldBlock { - if !reg { - // Only register once. - reg = true - - // Create wait queue entry that notifies a channel. 
- var waitEntry waiter.Entry - waitEntry, notifyCh = waiter.NewChannelEntry(nil) - c.wq.EventRegister(&waitEntry, waiter.EventOut) - defer c.wq.EventUnregister(&waitEntry) + for nbytes != len(b) { + r.Reset(b[nbytes:]) + n, err := c.ep.Write(&r, tcpip.WriteOptions{}) + nbytes += int(n) + switch err { + case nil: + case tcpip.ErrWouldBlock: + if ch == nil { + entry, ch = waiter.NewChannelEntry(nil) + + c.wq.EventRegister(&entry, waiter.EventOut) + defer c.wq.EventUnregister(&entry) } else { // Don't wait immediately after registration in case more data // became available between when we last checked and when we setup @@ -391,22 +391,15 @@ func (c *TCPConn) Write(b []byte) (int, error) { select { case <-deadline: return nbytes, c.newOpError("write", &timeoutError{}) - case <-notifyCh: + case <-ch: + continue } } + default: + return nbytes, c.newOpError("write", errors.New(err.String())) } - - var n int64 - n, err = c.ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) - nbytes += int(n) - v.TrimFront(int(n)) - } - - if err == nil { - return nbytes, nil } - - return nbytes, c.newOpError("write", errors.New(err.String())) + return nbytes, nil } // Close implements net.Conn.Close. @@ -644,16 +637,18 @@ func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { } // If we're being called by Write, there is no addr - wopts := tcpip.WriteOptions{} + writeOptions := tcpip.WriteOptions{} if addr != nil { ua := addr.(*net.UDPAddr) - wopts.To = &tcpip.FullAddress{Addr: tcpip.Address(ua.IP), Port: uint16(ua.Port)} + writeOptions.To = &tcpip.FullAddress{ + Addr: tcpip.Address(ua.IP), + Port: uint16(ua.Port), + } } - v := buffer.NewView(len(b)) - copy(v, b) - - n, err := c.ep.Write(tcpip.SlicePayload(v), wopts) + var r bytes.Reader + r.Reset(b) + n, err := c.ep.Write(&r, writeOptions) if err == tcpip.ErrWouldBlock { // Create wait queue entry that notifies a channel. waitEntry, notifyCh := waiter.NewChannelEntry(nil) @@ -666,7 +661,7 @@ func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { case <-notifyCh: } - n, err = c.ep.Write(tcpip.SlicePayload(v), wopts) + n, err = c.ep.Write(&r, writeOptions) if err != tcpip.ErrWouldBlock { break } diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 0ac2000ca..07b4393a4 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -234,7 +234,7 @@ func IPv4RouterAlert() NetworkChecker { for { opt, done, err := iterator.Next() if err != nil { - t.Fatalf("error acquiring next IPv4 option %s", err) + t.Fatalf("error acquiring next IPv4 option at offset %d", err.Pointer) } if done { break diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index e6103f4bc..48ca60319 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ package header import ( "encoding/binary" - "errors" "fmt" "gvisor.dev/gvisor/pkg/tcpip" @@ -481,15 +480,13 @@ const ( IPv4OptionLengthOffset = 1 ) -// Potential errors when parsing generic IP options. 
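The rewritten TCPConn.Write in the gonet adapter above resets a bytes.Reader to the unsent tail of the payload, retries on the would-block error, and waits for a writability notification between attempts. Here is a self-contained sketch of that retry shape; toyEndpoint, errWouldBlock, and the plain ready channel are stand-ins for tcpip.Endpoint, tcpip.ErrWouldBlock, and the waiter queue:

```go
package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"time"
)

var errWouldBlock = errors.New("operation would block")

// toyEndpoint stands in for a nonblocking transport endpoint: it accepts at
// most 3 bytes per call and pretends its buffer is full on every other call.
type toyEndpoint struct {
	calls int
	out   bytes.Buffer
	ready chan struct{} // signalled when the endpoint becomes writable again
}

func (e *toyEndpoint) Write(r *bytes.Reader) (int64, error) {
	e.calls++
	if e.calls%2 == 0 {
		go func() {
			time.Sleep(time.Millisecond)
			e.ready <- struct{}{}
		}()
		return 0, errWouldBlock
	}
	n, err := io.CopyN(&e.out, r, 3)
	if err == io.EOF {
		err = nil // fewer than 3 bytes remained; that is fine.
	}
	return n, err
}

// write mirrors the shape of the retry loop: reset the reader to the unsent
// tail, try again, and wait for a readiness notification on would-block.
func write(e *toyEndpoint, b []byte, deadline <-chan time.Time) (int, error) {
	var r bytes.Reader
	nbytes := 0
	for nbytes != len(b) {
		r.Reset(b[nbytes:])
		n, err := e.Write(&r)
		nbytes += int(n)
		switch err {
		case nil:
		case errWouldBlock:
			select {
			case <-deadline:
				return nbytes, errors.New("i/o timeout")
			case <-e.ready:
				continue
			}
		default:
			return nbytes, err
		}
	}
	return nbytes, nil
}

func main() {
	e := &toyEndpoint{ready: make(chan struct{})}
	n, err := write(e, []byte("hello, world"), time.After(time.Second))
	fmt.Println(n, err, e.out.String()) // 12 <nil> hello, world
}
```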
-var ( - ErrIPv4OptZeroLength = errors.New("zero length IP option") - ErrIPv4OptDuplicate = errors.New("duplicate IP option") - ErrIPv4OptInvalid = errors.New("invalid IP option") - ErrIPv4OptMalformed = errors.New("malformed IP option") - ErrIPv4OptionTruncated = errors.New("truncated IP option") - ErrIPv4OptionAddress = errors.New("bad IP option address") -) +// IPv4OptParameterProblem indicates that a Parameter Problem message +// should be generated, and gives the offset in the current entity +// that should be used in that packet. +type IPv4OptParameterProblem struct { + Pointer uint8 + NeedICMP bool +} // IPv4Option is an interface representing various option types. type IPv4Option interface { @@ -583,8 +580,9 @@ func (i *IPv4OptionIterator) Finalize() IPv4Options { // It returns // - A slice of bytes holding the next option or nil if there is error. // - A boolean which is true if parsing of all the options is complete. -// - An error which is non-nil if an error condition was encountered. -func (i *IPv4OptionIterator) Next() (IPv4Option, bool, error) { +// Undefined in the case of error. +// - An error indication which is non-nil if an error condition was found. +func (i *IPv4OptionIterator) Next() (IPv4Option, bool, *IPv4OptParameterProblem) { // The opts slice gets shorter as we process the options. When we have no // bytes left we are done. if len(i.options) == 0 { @@ -606,24 +604,22 @@ func (i *IPv4OptionIterator) Next() (IPv4Option, bool, error) { // There are no more single byte options defined. All the rest have a length // field so we need to sanity check it. if len(i.options) == 1 { - return nil, true, ErrIPv4OptMalformed + return nil, false, &IPv4OptParameterProblem{ + Pointer: i.ErrCursor, + NeedICMP: true, + } } optLen := i.options[IPv4OptionLengthOffset] - if optLen == 0 { - i.ErrCursor++ - return nil, true, ErrIPv4OptZeroLength - } + if optLen <= IPv4OptionLengthOffset || optLen > uint8(len(i.options)) { + // The actual error is in the length (2nd byte of the option) but we + // return the start of the option for compatibility with Linux. 
- if optLen == 1 { - i.ErrCursor++ - return nil, true, ErrIPv4OptMalformed - } - - if optLen > uint8(len(i.options)) { - i.ErrCursor++ - return nil, true, ErrIPv4OptionTruncated + return nil, false, &IPv4OptParameterProblem{ + Pointer: i.ErrCursor, + NeedICMP: true, + } } optionBody := i.options[:optLen] @@ -635,7 +631,10 @@ func (i *IPv4OptionIterator) Next() (IPv4Option, bool, error) { case IPv4OptionTimestampType: if optLen < IPv4OptionTimestampHdrLength { i.ErrCursor++ - return nil, true, ErrIPv4OptMalformed + return nil, false, &IPv4OptParameterProblem{ + Pointer: i.ErrCursor, + NeedICMP: true, + } } retval := IPv4OptionTimestamp(optionBody) return &retval, false, nil @@ -643,7 +642,10 @@ func (i *IPv4OptionIterator) Next() (IPv4Option, bool, error) { case IPv4OptionRecordRouteType: if optLen < IPv4OptionRecordRouteHdrLength { i.ErrCursor++ - return nil, true, ErrIPv4OptMalformed + return nil, false, &IPv4OptParameterProblem{ + Pointer: i.ErrCursor, + NeedICMP: true, + } } retval := IPv4OptionRecordRoute(optionBody) return &retval, false, nil diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 10072eac1..ae1394ebf 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -35,7 +35,6 @@ go_test( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", - "//pkg/tcpip/link/rawfile", "//pkg/tcpip/stack", "@com_github_google_go_cmp//cmp:go_default_library", ], diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 90da22d34..e2985cb84 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -30,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" - "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -465,67 +464,85 @@ var capLengthTestCases = []struct { config: []int{1, 2, 3}, n: 3, wantUsed: 2, - wantLengths: []int{1, 2, 3}, + wantLengths: []int{1, 2}, }, } -func TestReadVDispatcherCapLength(t *testing.T) { +func TestIovecBuffer(t *testing.T) { for _, c := range capLengthTestCases { - // fd does not matter for this test. - d := readVDispatcher{fd: -1, e: &endpoint{}} - d.views = make([]buffer.View, len(c.config)) - d.iovecs = make([]syscall.Iovec, len(c.config)) - d.allocateViews(c.config) - - used := d.capViews(c.n, c.config) - if used != c.wantUsed { - t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) - } - lengths := make([]int, len(d.views)) - for i, v := range d.views { - lengths[i] = len(v) - } - if !reflect.DeepEqual(lengths, c.wantLengths) { - t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) - } - } -} + t.Run(c.comment, func(t *testing.T) { + b := newIovecBuffer(c.config, false /* skipsVnetHdr */) -func TestRecvMMsgDispatcherCapLength(t *testing.T) { - for _, c := range capLengthTestCases { - d := recvMMsgDispatcher{ - fd: -1, // fd does not matter for this test. - e: &endpoint{}, - views: make([][]buffer.View, 1), - iovecs: make([][]syscall.Iovec, 1), - msgHdrs: make([]rawfile.MMsgHdr, 1), - } + // Test initial allocation. 
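The IPv4 option iterator changes above drop the assorted Go error values and instead return a single result carrying the byte offset (Pointer) to report in an ICMP Parameter Problem. A standalone sketch of iterating type-length encoded options while tracking such an offset follows; the constants and field names are illustrative rather than the header package's:

```go
package main

import "fmt"

const (
	optEndOfList = 0
	optNOP       = 1
	optLenOffset = 1 // the length byte immediately follows the type byte
)

// paramProblem reports where in the options a malformed byte was found, so a
// caller can build an ICMP Parameter Problem pointing at that offset.
type paramProblem struct {
	Pointer  uint8
	NeedICMP bool
}

type optionIterator struct {
	options []byte
	cursor  uint8 // offset of the option currently being examined
}

// next returns the bytes of the next option, whether iteration is done, and a
// non-nil *paramProblem when the encoding is invalid.
func (it *optionIterator) next() ([]byte, bool, *paramProblem) {
	if len(it.options) == 0 {
		return nil, true, nil
	}
	switch it.options[0] {
	case optEndOfList:
		return nil, true, nil
	case optNOP:
		it.options = it.options[1:]
		it.cursor++
		return []byte{optNOP}, false, nil
	}
	// Every remaining option type carries a length byte.
	if len(it.options) == 1 {
		return nil, false, &paramProblem{Pointer: it.cursor, NeedICMP: true}
	}
	optLen := it.options[optLenOffset]
	if optLen <= optLenOffset || int(optLen) > len(it.options) {
		// Report the start of the option, as the diff above notes Linux does.
		return nil, false, &paramProblem{Pointer: it.cursor, NeedICMP: true}
	}
	body := it.options[:optLen]
	it.options = it.options[optLen:]
	it.cursor += optLen
	return body, false, nil
}

func main() {
	// A NOP followed by an option whose length byte claims 9 bytes while only
	// 3 remain, which should trigger a parameter problem at offset 1.
	it := optionIterator{options: []byte{1, 7, 9, 0}}
	for {
		opt, done, prob := it.next()
		if prob != nil {
			fmt.Printf("parameter problem at offset %d\n", prob.Pointer)
			return
		}
		if done {
			return
		}
		fmt.Printf("option % x\n", opt)
	}
}
```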
+ iovecs := b.nextIovecs() + if got, want := len(iovecs), len(c.config); got != want { + t.Fatalf("len(iovecs) = %d, want %d", got, want) + } - for i := range d.views { - d.views[i] = make([]buffer.View, len(c.config)) - } - for i := range d.iovecs { - d.iovecs[i] = make([]syscall.Iovec, len(c.config)) - } - for k, msgHdr := range d.msgHdrs { - msgHdr.Msg.Iov = &d.iovecs[k][0] - msgHdr.Msg.Iovlen = uint64(len(c.config)) - } + // Make a copy as iovecs points to internal slice. We will need this state + // later. + oldIovecs := append([]syscall.Iovec(nil), iovecs...) - d.allocateViews(c.config) + // Test the views that get pulled. + vv := b.pullViews(c.n) + var lengths []int + for _, v := range vv.Views() { + lengths = append(lengths, len(v)) + } + if !reflect.DeepEqual(lengths, c.wantLengths) { + t.Errorf("Pulled view lengths = %v, want %v", lengths, c.wantLengths) + } - used := d.capViews(0, c.n, c.config) - if used != c.wantUsed { - t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) - } - lengths := make([]int, len(d.views[0])) - for i, v := range d.views[0] { - lengths[i] = len(v) - } - if !reflect.DeepEqual(lengths, c.wantLengths) { - t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) - } + // Test that new views get reallocated. + for i, newIov := range b.nextIovecs() { + if i < c.wantUsed { + if newIov.Base == oldIovecs[i].Base { + t.Errorf("b.views[%d] should have been reallocated", i) + } + } else { + if newIov.Base != oldIovecs[i].Base { + t.Errorf("b.views[%d] should not have been reallocated", i) + } + } + } + }) + } +} +func TestIovecBufferSkipVnetHdr(t *testing.T) { + for _, test := range []struct { + desc string + readN int + wantLen int + }{ + { + desc: "nothing read", + readN: 0, + wantLen: 0, + }, + { + desc: "smaller than vnet header", + readN: virtioNetHdrSize - 1, + wantLen: 0, + }, + { + desc: "header skipped", + readN: virtioNetHdrSize + 100, + wantLen: 100, + }, + } { + t.Run(test.desc, func(t *testing.T) { + b := newIovecBuffer([]int{10, 20, 50, 50}, true) + // Pretend a read happend. + b.nextIovecs() + vv := b.pullViews(test.readN) + if got, want := vv.Size(), test.wantLen; got != want { + t.Errorf("b.pullView(%d).Size() = %d; want %d", test.readN, got, want) + } + if got, want := len(vv.ToOwnedView()), test.wantLen; got != want { + t.Errorf("b.pullView(%d).ToOwnedView() has length %d; want %d", test.readN, got, want) + } + }) } } diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 8c3ca86d6..edab110b5 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -29,92 +29,124 @@ import ( // BufConfig defines the shape of the vectorised view used to read packets from the NIC. var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} -// readVDispatcher uses readv() system call to read inbound packets and -// dispatches them. -type readVDispatcher struct { - // fd is the file descriptor used to send and receive packets. - fd int - - // e is the endpoint this dispatcher is attached to. - e *endpoint - +type iovecBuffer struct { // views are the actual buffers that hold the packet contents. 
views []buffer.View

 	// iovecs are initialized with base pointers/len of the corresponding
-	// entries in the views defined above, except when GSO is enabled then
-	// the first iovec points to a buffer for the vnet header which is
-	// stripped before the views are passed up the stack for further
+	// entries in the views defined above, except when GSO is enabled
+	// (skipsVnetHdr) then the first iovec points to a buffer for the vnet header
+	// which is stripped before the views are passed up the stack for further
 	// processing.
 	iovecs []syscall.Iovec
+
+	// sizes is an array of buffer sizes for the underlying views. sizes is
+	// immutable.
+	sizes []int
+
+	// skipsVnetHdr is true if virtioNetHdr is to be skipped.
+	skipsVnetHdr bool
 }

-func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
-	d := &readVDispatcher{fd: fd, e: e}
-	d.views = make([]buffer.View, len(BufConfig))
-	iovLen := len(BufConfig)
-	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
-		iovLen++
+func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer {
+	b := &iovecBuffer{
+		views:        make([]buffer.View, len(sizes)),
+		sizes:        sizes,
+		skipsVnetHdr: skipsVnetHdr,
 	}
-	d.iovecs = make([]syscall.Iovec, iovLen)
-	return d, nil
+	niov := len(b.views)
+	if b.skipsVnetHdr {
+		niov++
+	}
+	b.iovecs = make([]syscall.Iovec, niov)
+	return b
 }

-func (d *readVDispatcher) allocateViews(bufConfig []int) {
-	var vnetHdr [virtioNetHdrSize]byte
+func (b *iovecBuffer) nextIovecs() []syscall.Iovec {
 	vnetHdrOff := 0
-	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+	if b.skipsVnetHdr {
+		var vnetHdr [virtioNetHdrSize]byte
 		// The kernel adds virtioNetHdr before each packet, but
 		// we don't use it, so we allocate a buffer for it,
 		// add it in iovecs but don't add it in a view.
-		d.iovecs[0] = syscall.Iovec{
+		b.iovecs[0] = syscall.Iovec{
 			Base: &vnetHdr[0],
 			Len:  uint64(virtioNetHdrSize),
 		}
 		vnetHdrOff++
 	}
-	for i := 0; i < len(bufConfig); i++ {
-		if d.views[i] != nil {
+	for i := range b.views {
+		if b.views[i] != nil {
 			break
 		}
-		b := buffer.NewView(bufConfig[i])
-		d.views[i] = b
-		d.iovecs[i+vnetHdrOff] = syscall.Iovec{
-			Base: &b[0],
-			Len:  uint64(len(b)),
+		v := buffer.NewView(b.sizes[i])
+		b.views[i] = v
+		b.iovecs[i+vnetHdrOff] = syscall.Iovec{
+			Base: &v[0],
+			Len:  uint64(len(v)),
 		}
 	}
+	return b.iovecs
 }

-func (d *readVDispatcher) capViews(n int, buffers []int) int {
+func (b *iovecBuffer) pullViews(n int) buffer.VectorisedView {
+	var views []buffer.View
 	c := 0
-	for i, s := range buffers {
-		c += s
+	if b.skipsVnetHdr {
+		c += virtioNetHdrSize
 		if c >= n {
-			d.views[i].CapLength(s - (c - n))
-			return i + 1
+			// Nothing in the packet.
+			return buffer.NewVectorisedView(0, nil)
+		}
+	}
+	for i, v := range b.views {
+		c += len(v)
+		if c >= n {
+			b.views[i].CapLength(len(v) - (c - n))
+			views = append([]buffer.View(nil), b.views[:i+1]...)
+			break
 		}
 	}
-	return len(buffers)
+	// Remove the first len(views) used views from the state.
+	for i := range views {
+		b.views[i] = nil
+	}
+	if b.skipsVnetHdr {
+		// Exclude the size of the vnet header.
+		n -= virtioNetHdrSize
+	}
+	return buffer.NewVectorisedView(n, views)
+}
+
+// readVDispatcher uses readv() system call to read inbound packets and
+// dispatches them.
+type readVDispatcher struct {
+	// fd is the file descriptor used to send and receive packets.
+	fd int
+
+	// e is the endpoint this dispatcher is attached to.
+	e *endpoint
+
+	// buf is the iovec buffer that contains the packet contents.
+ buf *iovecBuffer +} + +func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &readVDispatcher{fd: fd, e: e} + skipsVnetHdr := d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 + d.buf = newIovecBuffer(BufConfig, skipsVnetHdr) + return d, nil } // dispatch reads one packet from the file descriptor and dispatches it. func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { - d.allocateViews(BufConfig) - - n, err := rawfile.BlockingReadv(d.fd, d.iovecs) + n, err := rawfile.BlockingReadv(d.fd, d.buf.nextIovecs()) if n == 0 || err != nil { return false, err } - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - // Skip virtioNetHdr which is added before each packet, it - // isn't used and it isn't in a view. - n -= virtioNetHdrSize - } - used := d.capViews(n, BufConfig) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)), + Data: d.buf.pullViews(n), }) var ( @@ -133,7 +165,12 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(d.views[0]) { + // IP version information is at the first octet, so pulling up 1 byte. + h, ok := pkt.Data.PullUp(1) + if !ok { + return true, nil + } + switch header.IPVersion(h) { case header.IPv4Version: p = header.IPv4ProtocolNumber case header.IPv6Version: @@ -145,11 +182,6 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt) - // Prepare e.views for another packet: release used views. - for i := 0; i < used; i++ { - d.views[i] = nil - } - return true, nil } @@ -162,15 +194,8 @@ type recvMMsgDispatcher struct { // e is the endpoint this dispatcher is attached to. e *endpoint - // views is an array of array of buffers that contain packet contents. - views [][]buffer.View - - // iovecs is an array of array of iovec records where each iovec base - // pointer and length are initialzed to the corresponding view above, - // except when GSO is enabled then the first iovec in each array of - // iovecs points to a buffer for the vnet header which is stripped - // before the views are passed up the stack for further processing. - iovecs [][]syscall.Iovec + // bufs is an array of iovec buffers that contain packet contents. + bufs []*iovecBuffer // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to // reference an array of iovecs in the iovecs field defined above. This @@ -187,74 +212,32 @@ const ( func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { d := &recvMMsgDispatcher{ - fd: fd, - e: e, - } - d.views = make([][]buffer.View, MaxMsgsPerRecv) - for i := range d.views { - d.views[i] = make([]buffer.View, len(BufConfig)) - } - d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv) - iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - // virtioNetHdr is prepended before each packet. 
- iovLen++ + fd: fd, + e: e, + bufs: make([]*iovecBuffer, MaxMsgsPerRecv), + msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), } - for i := range d.iovecs { - d.iovecs[i] = make([]syscall.Iovec, iovLen) - } - d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv) - for i := range d.msgHdrs { - d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0] - d.msgHdrs[i].Msg.Iovlen = uint64(iovLen) + skipsVnetHdr := d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 + for i := range d.bufs { + d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr) } return d, nil } -func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int { - c := 0 - for i, s := range buffers { - c += s - if c >= n { - d.views[k][i].CapLength(s - (c - n)) - return i + 1 - } - } - return len(buffers) -} - -func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) { - for k := 0; k < len(d.views); k++ { - var vnetHdr [virtioNetHdrSize]byte - vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - // The kernel adds virtioNetHdr before each packet, but - // we don't use it, so so we allocate a buffer for it, - // add it in iovecs but don't add it in a view. - d.iovecs[k][0] = syscall.Iovec{ - Base: &vnetHdr[0], - Len: uint64(virtioNetHdrSize), - } - vnetHdrOff++ - } - for i := 0; i < len(bufConfig); i++ { - if d.views[k][i] != nil { - break - } - b := buffer.NewView(bufConfig[i]) - d.views[k][i] = b - d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{ - Base: &b[0], - Len: uint64(len(b)), - } - } - } -} - // recvMMsgDispatch reads more than one packet at a time from the file // descriptor and dispatches it. func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { - d.allocateViews(BufConfig) + // Fill message headers. + for k := range d.msgHdrs { + if d.msgHdrs[k].Msg.Iovlen > 0 { + break + } + iovecs := d.bufs[k].nextIovecs() + iovLen := len(iovecs) + d.msgHdrs[k].Len = 0 + d.msgHdrs[k].Msg.Iov = &iovecs[0] + d.msgHdrs[k].Msg.Iovlen = uint64(iovLen) + } nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs) if err != nil { @@ -263,15 +246,14 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { // Process each of received packets. for k := 0; k < nMsgs; k++ { n := int(d.msgHdrs[k].Len) - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - n -= virtioNetHdrSize - } - used := d.capViews(k, int(n), BufConfig) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)), + Data: d.bufs[k].pullViews(n), }) + // Mark that this iovec has been processed. + d.msgHdrs[k].Msg.Iovlen = 0 + var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress @@ -288,26 +270,24 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(d.views[k][0]) { + // IP version information is at the first octet, so pulling up 1 byte. + h, ok := pkt.Data.PullUp(1) + if !ok { + // Skip this packet. + continue + } + switch header.IPVersion(h) { case header.IPv4Version: p = header.IPv4ProtocolNumber case header.IPv6Version: p = header.IPv6ProtocolNumber default: - return true, nil + // Skip this packet. + continue } } d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt) - - // Prepare e.views for another packet: release used views. 
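The iovecBuffer introduced above owns a set of fixed-size views, hands out matching iovecs before each readv/recvmmsg call, and afterwards carves the first n read bytes out of those views while recycling the untouched ones. A reduced sketch of that bookkeeping using plain byte slices (no syscall.Iovec and no vnet-header handling):

```go
package main

import "fmt"

// bufConfig mirrors the idea of a vectorised read buffer made of
// progressively larger chunks.
var bufConfig = []int{4, 8, 16}

type iovecBuffer struct {
	views [][]byte
	sizes []int
}

func newIovecBuffer(sizes []int) *iovecBuffer {
	return &iovecBuffer{views: make([][]byte, len(sizes)), sizes: sizes}
}

// nextViews (re)allocates only the views consumed by a previous pull, the
// same trick the diff uses to avoid reallocating untouched buffers between
// reads: consumed views form a prefix, so stop at the first non-nil slot.
func (b *iovecBuffer) nextViews() [][]byte {
	for i := range b.views {
		if b.views[i] != nil {
			break
		}
		b.views[i] = make([]byte, b.sizes[i])
	}
	return b.views
}

// pullViews returns the views holding the first n bytes of a read, capping
// the last one, and clears the consumed slots so nextViews will refill them.
func (b *iovecBuffer) pullViews(n int) [][]byte {
	var used [][]byte
	c := 0
	for i, v := range b.views {
		c += len(v)
		if c >= n {
			b.views[i] = v[:len(v)-(c-n)]
			used = append([][]byte(nil), b.views[:i+1]...)
			break
		}
	}
	for i := range used {
		b.views[i] = nil
	}
	return used
}

func main() {
	b := newIovecBuffer(bufConfig)
	b.nextViews()
	// Pretend a read of 10 bytes landed in the buffers: 4 + 6.
	for _, v := range b.pullViews(10) {
		fmt.Println(len(v)) // 4, then 6
	}
}
```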
- for i := 0; i < used; i++ { - d.views[k][i] = nil - } - } - - for k := 0; k < nMsgs; k++ { - d.msgHdrs[k].Len = 0 } return true, nil diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go index d6e83a414..36aa9055c 100644 --- a/pkg/tcpip/link/pipe/pipe.go +++ b/pkg/tcpip/link/pipe/pipe.go @@ -45,12 +45,7 @@ type Endpoint struct { linkAddr tcpip.LinkAddress } -// WritePacket implements stack.LinkEndpoint. -func (e *Endpoint) WritePacket(r stack.RouteInfo, _ *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - if !e.linked.IsAttached() { - return nil - } - +func (e *Endpoint) deliverPackets(r stack.RouteInfo, proto tcpip.NetworkProtocolNumber, pkts stack.PacketBufferList) { // Note that the local address from the perspective of this endpoint is the // remote address from the perspective of the other end of the pipe // (e.linked). Similarly, the remote address from the perspective of this @@ -70,16 +65,33 @@ func (e *Endpoint) WritePacket(r stack.RouteInfo, _ *stack.GSO, proto tcpip.Netw // // TODO(gvisor.dev/issue/5289): don't use a new goroutine once we support send // and receive queues. - go e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), - })) + go func() { + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), + })) + } + }() +} + +// WritePacket implements stack.LinkEndpoint. +func (e *Endpoint) WritePacket(r stack.RouteInfo, _ *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + if e.linked.IsAttached() { + var pkts stack.PacketBufferList + pkts.PushBack(pkt) + e.deliverPackets(r, proto, pkts) + } return nil } // WritePackets implements stack.LinkEndpoint. -func (*Endpoint) WritePackets(stack.RouteInfo, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - panic("not implemented") +func (e *Endpoint) WritePackets(r stack.RouteInfo, _ *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + if e.linked.IsAttached() { + e.deliverPackets(r, proto, pkts) + } + + return pkts.Len(), nil } // Attach implements stack.LinkEndpoint. diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go index 87035b034..03efba606 100644 --- a/pkg/tcpip/link/qdisc/fifo/endpoint.go +++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go @@ -165,12 +165,15 @@ func (e *endpoint) WritePacket(r stack.RouteInfo, gso *stack.GSO, protocol tcpip } // WritePackets implements stack.LinkEndpoint.WritePackets. 
+// +// Being a batch API, each packet in pkts should have the following +// fields populated: +// - pkt.EgressRoute +// - pkt.GSOOptions +// - pkt.NetworkProtocolNumber func (e *endpoint) WritePackets(r stack.RouteInfo, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { enqueued := 0 for pkt := pkts.Front(); pkt != nil; { - pkt.EgressRoute = r - pkt.GSOOptions = gso - pkt.NetworkProtocolNumber = protocol d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)] nxt := pkt.Next() if !d.q.enqueue(pkt) { diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index 8a6bcfc2c..c7ab876bf 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -4,9 +4,13 @@ package(licenses = ["notice"]) go_library( name = "arp", - srcs = ["arp.go"], + srcs = [ + "arp.go", + "stats.go", + ], visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", @@ -33,3 +37,15 @@ go_test( "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) + +go_test( + name = "stats_test", + size = "small", + srcs = ["stats_test.go"], + library = ":arp", + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/network/testutil", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 9255a4f6a..6bc8c5c02 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -19,8 +19,10 @@ package arp import ( "fmt" + "reflect" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -50,6 +52,7 @@ type endpoint struct { nic stack.NetworkInterface linkAddrCache stack.LinkAddressCache nud stack.NUDHandler + stats sharedStats } func (e *endpoint) Enable() *tcpip.Error { @@ -98,7 +101,9 @@ func (e *endpoint) MaxHeaderLength() uint16 { return e.nic.MaxHeaderLength() + header.ARPSize } -func (*endpoint) Close() {} +func (e *endpoint) Close() { + e.protocol.forgetEndpoint(e.nic.ID()) +} func (*endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported @@ -119,27 +124,27 @@ func (*endpoint) WriteHeaderIncludedPacket(*stack.Route, *stack.PacketBuffer) *t } func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { - stats := e.protocol.stack.Stats().ARP - stats.PacketsReceived.Increment() + stats := e.stats.arp + stats.packetsReceived.Increment() if !e.isEnabled() { - stats.DisabledPacketsReceived.Increment() + stats.disabledPacketsReceived.Increment() return } h := header.ARP(pkt.NetworkHeader().View()) if !h.IsValid() { - stats.MalformedPacketsReceived.Increment() + stats.malformedPacketsReceived.Increment() return } switch h.Op() { case header.ARPRequest: - stats.RequestsReceived.Increment() + stats.requestsReceived.Increment() localAddr := tcpip.Address(h.ProtocolAddressTarget()) if e.protocol.stack.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 { - stats.RequestsReceivedUnknownTargetAddress.Increment() + stats.requestsReceivedUnknownTargetAddress.Increment() return // we have no useful answer, ignore the request } @@ -180,13 +185,13 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // Send the packet to the (new) target hardware address on the same // hardware on which the request was received. 
if err := e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), nil /* gso */, ProtocolNumber, respPkt); err != nil { - stats.OutgoingRepliesDropped.Increment() + stats.outgoingRepliesDropped.Increment() } else { - stats.OutgoingRepliesSent.Increment() + stats.outgoingRepliesSent.Increment() } case header.ARPReply: - stats.RepliesReceived.Increment() + stats.repliesReceived.Increment() addr := tcpip.Address(h.ProtocolAddressSender()) linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) @@ -212,21 +217,23 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { - // TODO(gvisor.dev/issues/4963): Record statistics for ARP. - return &Stats{} + return &e.stats.localStats } -var _ stack.NetworkEndpointStats = (*Stats)(nil) - -// Stats holds ARP statistics. -type Stats struct{} - -// IsNetworkEndpointStats implements stack.NetworkEndpointStats. -func (*Stats) IsNetworkEndpointStats() {} +var _ stack.NetworkProtocol = (*protocol)(nil) +var _ stack.LinkAddressResolver = (*protocol)(nil) // protocol implements stack.NetworkProtocol and stack.LinkAddressResolver. type protocol struct { stack *stack.Stack + + mu struct { + sync.RWMutex + + // eps is keyed by NICID to allow protocol methods to retrieve the correct + // endpoint depending on the NIC. + eps map[tcpip.NICID]*endpoint + } } func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } @@ -244,9 +251,25 @@ func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.L linkAddrCache: linkAddrCache, nud: nud, } + + tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) + + stackStats := p.stack.Stats() + e.stats.arp.init(&e.stats.localStats.ARP, &stackStats.ARP) + + p.mu.Lock() + p.mu.eps[nic.ID()] = e + p.mu.Unlock() + return e } +func (p *protocol) forgetEndpoint(nicID tcpip.NICID) { + p.mu.Lock() + defer p.mu.Unlock() + delete(p.mu.eps, nicID) +} + // LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol. func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv4ProtocolNumber @@ -254,28 +277,35 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. 
func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic stack.NetworkInterface) *tcpip.Error { - stats := p.stack.Stats().ARP + nicID := nic.ID() + + p.mu.Lock() + netEP, ok := p.mu.eps[nicID] + p.mu.Unlock() + if !ok { + return tcpip.ErrNotConnected + } + + stats := netEP.stats.arp if len(remoteLinkAddr) == 0 { remoteLinkAddr = header.EthernetBroadcastAddress } - nicID := nic.ID() if len(localAddr) == 0 { - addr, err := p.stack.GetMainNICAddress(nicID, header.IPv4ProtocolNumber) - if err != nil { - stats.OutgoingRequestInterfaceHasNoLocalAddressErrors.Increment() - return err + addr, ok := p.stack.GetMainNICAddress(nicID, header.IPv4ProtocolNumber) + if !ok { + return tcpip.ErrUnknownNICID } if len(addr.Address) == 0 { - stats.OutgoingRequestNetworkUnreachableErrors.Increment() + stats.outgoingRequestInterfaceHasNoLocalAddressErrors.Increment() return tcpip.ErrNetworkUnreachable } localAddr = addr.Address } else if p.stack.CheckLocalAddress(nicID, header.IPv4ProtocolNumber, localAddr) == 0 { - stats.OutgoingRequestBadLocalAddressErrors.Increment() + stats.outgoingRequestBadLocalAddressErrors.Increment() return tcpip.ErrBadLocalAddress } @@ -296,10 +326,10 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } if err := nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt); err != nil { - stats.OutgoingRequestsDropped.Increment() + stats.outgoingRequestsDropped.Increment() return err } - stats.OutgoingRequestsSent.Increment() + stats.outgoingRequestsSent.Increment() return nil } @@ -337,5 +367,11 @@ func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNu // NewProtocol returns an ARP network protocol. 
func NewProtocol(s *stack.Stack) stack.NetworkProtocol { - return &protocol{stack: s} + return &protocol{ + stack: s, + mu: struct { + sync.RWMutex + eps map[tcpip.NICID]*endpoint + }{eps: make(map[tcpip.NICID]*endpoint)}, + } } diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 8d6ee37fa..6b23c0079 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -585,104 +585,122 @@ func TestLinkAddressRequest(t *testing.T) { testAddr := tcpip.Address([]byte{1, 2, 3, 4}) tests := []struct { - name string - nicAddr tcpip.Address - localAddr tcpip.Address - remoteLinkAddr tcpip.LinkAddress - - linkErr *tcpip.Error - expectedErr *tcpip.Error - expectedLocalAddr tcpip.Address - expectedRemoteLinkAddr tcpip.LinkAddress - expectedRequestsSent uint64 - expectedRequestBadLocalAddressErrors uint64 - expectedRequestNetworkUnreachableErrors uint64 - expectedRequestDroppedErrors uint64 + name string + nicAddr tcpip.Address + localAddr tcpip.Address + remoteLinkAddr tcpip.LinkAddress + linkErr *tcpip.Error + expectedErr *tcpip.Error + expectedLocalAddr tcpip.Address + expectedRemoteLinkAddr tcpip.LinkAddress + expectedRequestsSent uint64 + expectedRequestBadLocalAddressErrors uint64 + expectedRequestInterfaceHasNoLocalAddressErrors uint64 + expectedRequestDroppedErrors uint64 }{ { - name: "Unicast", - nicAddr: stackAddr, - localAddr: stackAddr, - remoteLinkAddr: remoteLinkAddr, - expectedLocalAddr: stackAddr, - expectedRemoteLinkAddr: remoteLinkAddr, - expectedRequestsSent: 1, - expectedRequestBadLocalAddressErrors: 0, - expectedRequestNetworkUnreachableErrors: 0, + name: "Unicast", + nicAddr: stackAddr, + localAddr: stackAddr, + remoteLinkAddr: remoteLinkAddr, + expectedLocalAddr: stackAddr, + expectedRemoteLinkAddr: remoteLinkAddr, + expectedRequestsSent: 1, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 0, }, { - name: "Multicast", - nicAddr: stackAddr, - localAddr: stackAddr, - remoteLinkAddr: "", - expectedLocalAddr: stackAddr, - expectedRemoteLinkAddr: header.EthernetBroadcastAddress, - expectedRequestsSent: 1, - expectedRequestBadLocalAddressErrors: 0, - expectedRequestNetworkUnreachableErrors: 0, + name: "Multicast", + nicAddr: stackAddr, + localAddr: stackAddr, + remoteLinkAddr: "", + expectedLocalAddr: stackAddr, + expectedRemoteLinkAddr: header.EthernetBroadcastAddress, + expectedRequestsSent: 1, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 0, }, { - name: "Unicast with unspecified source", - nicAddr: stackAddr, - remoteLinkAddr: remoteLinkAddr, - expectedLocalAddr: stackAddr, - expectedRemoteLinkAddr: remoteLinkAddr, - expectedRequestsSent: 1, - expectedRequestBadLocalAddressErrors: 0, - expectedRequestNetworkUnreachableErrors: 0, + name: "Unicast with unspecified source", + nicAddr: stackAddr, + localAddr: "", + remoteLinkAddr: remoteLinkAddr, + expectedLocalAddr: stackAddr, + expectedRemoteLinkAddr: remoteLinkAddr, + expectedRequestsSent: 1, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 0, }, { - name: "Multicast with unspecified source", - nicAddr: stackAddr, - remoteLinkAddr: "", - expectedLocalAddr: stackAddr, - expectedRemoteLinkAddr: header.EthernetBroadcastAddress, - expectedRequestsSent: 1, - expectedRequestBadLocalAddressErrors: 0, - expectedRequestNetworkUnreachableErrors: 
0, + name: "Multicast with unspecified source", + nicAddr: stackAddr, + localAddr: "", + remoteLinkAddr: "", + expectedLocalAddr: stackAddr, + expectedRemoteLinkAddr: header.EthernetBroadcastAddress, + expectedRequestsSent: 1, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 0, }, { - name: "Unicast with unassigned address", - localAddr: testAddr, - remoteLinkAddr: remoteLinkAddr, - expectedErr: tcpip.ErrBadLocalAddress, - expectedRequestsSent: 0, - expectedRequestBadLocalAddressErrors: 1, - expectedRequestNetworkUnreachableErrors: 0, + name: "Unicast with unassigned address", + nicAddr: stackAddr, + localAddr: testAddr, + remoteLinkAddr: remoteLinkAddr, + expectedErr: tcpip.ErrBadLocalAddress, + expectedRequestsSent: 0, + expectedRequestBadLocalAddressErrors: 1, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 0, }, { - name: "Multicast with unassigned address", - localAddr: testAddr, - remoteLinkAddr: "", - expectedErr: tcpip.ErrBadLocalAddress, - expectedRequestsSent: 0, - expectedRequestBadLocalAddressErrors: 1, - expectedRequestNetworkUnreachableErrors: 0, + name: "Multicast with unassigned address", + nicAddr: stackAddr, + localAddr: testAddr, + remoteLinkAddr: "", + expectedErr: tcpip.ErrBadLocalAddress, + expectedRequestsSent: 0, + expectedRequestBadLocalAddressErrors: 1, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 0, }, { - name: "Unicast with no local address available", - remoteLinkAddr: remoteLinkAddr, - expectedErr: tcpip.ErrNetworkUnreachable, - expectedRequestsSent: 0, - expectedRequestBadLocalAddressErrors: 0, - expectedRequestNetworkUnreachableErrors: 1, + name: "Unicast with no local address available", + nicAddr: "", + localAddr: "", + remoteLinkAddr: remoteLinkAddr, + expectedErr: tcpip.ErrNetworkUnreachable, + expectedRequestsSent: 0, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 1, + expectedRequestDroppedErrors: 0, }, { - name: "Multicast with no local address available", - remoteLinkAddr: "", - expectedErr: tcpip.ErrNetworkUnreachable, - expectedRequestsSent: 0, - expectedRequestBadLocalAddressErrors: 0, - expectedRequestNetworkUnreachableErrors: 1, + name: "Multicast with no local address available", + nicAddr: "", + localAddr: "", + remoteLinkAddr: "", + expectedErr: tcpip.ErrNetworkUnreachable, + expectedRequestsSent: 0, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 1, + expectedRequestDroppedErrors: 0, }, { - name: "Link error", - nicAddr: stackAddr, - localAddr: stackAddr, - remoteLinkAddr: remoteLinkAddr, - linkErr: tcpip.ErrInvalidEndpointState, - expectedErr: tcpip.ErrInvalidEndpointState, - expectedRequestDroppedErrors: 1, + name: "Link error", + nicAddr: stackAddr, + localAddr: stackAddr, + remoteLinkAddr: remoteLinkAddr, + linkErr: tcpip.ErrInvalidEndpointState, + expectedErr: tcpip.ErrInvalidEndpointState, + expectedRequestsSent: 0, + expectedRequestBadLocalAddressErrors: 0, + expectedRequestInterfaceHasNoLocalAddressErrors: 0, + expectedRequestDroppedErrors: 1, }, } @@ -721,12 +739,12 @@ func TestLinkAddressRequest(t *testing.T) { if got := s.Stats().ARP.OutgoingRequestsSent.Value(); got != test.expectedRequestsSent { t.Errorf("got s.Stats().ARP.OutgoingRequestsSent.Value() = %d, want = %d", got, test.expectedRequestsSent) } + if got := 
s.Stats().ARP.OutgoingRequestInterfaceHasNoLocalAddressErrors.Value(); got != test.expectedRequestInterfaceHasNoLocalAddressErrors { + t.Errorf("got s.Stats().ARP.OutgoingRequestInterfaceHasNoLocalAddressErrors.Value() = %d, want = %d", got, test.expectedRequestInterfaceHasNoLocalAddressErrors) + } if got := s.Stats().ARP.OutgoingRequestBadLocalAddressErrors.Value(); got != test.expectedRequestBadLocalAddressErrors { t.Errorf("got s.Stats().ARP.OutgoingRequestBadLocalAddressErrors.Value() = %d, want = %d", got, test.expectedRequestBadLocalAddressErrors) } - if got := s.Stats().ARP.OutgoingRequestNetworkUnreachableErrors.Value(); got != test.expectedRequestNetworkUnreachableErrors { - t.Errorf("got s.Stats().ARP.OutgoingRequestNetworkUnreachableErrors.Value() = %d, want = %d", got, test.expectedRequestNetworkUnreachableErrors) - } if got := s.Stats().ARP.OutgoingRequestsDropped.Value(); got != test.expectedRequestDroppedErrors { t.Errorf("got s.Stats().ARP.OutgoingRequestsDropped.Value() = %d, want = %d", got, test.expectedRequestDroppedErrors) } @@ -774,11 +792,7 @@ func TestLinkAddressRequestWithoutNIC(t *testing.T) { t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver") } - if err := linkRes.LinkAddressRequest(remoteAddr, "", remoteLinkAddr, &testInterface{nicID: nicID}); err != tcpip.ErrUnknownNICID { - t.Fatalf("got p.LinkAddressRequest(%s, %s, %s, _) = %s, want = %s", remoteAddr, "", remoteLinkAddr, err, tcpip.ErrUnknownNICID) - } - - if got := s.Stats().ARP.OutgoingRequestInterfaceHasNoLocalAddressErrors.Value(); got != 1 { - t.Errorf("got s.Stats().ARP.OutgoingRequestInterfaceHasNoLocalAddressErrors.Value() = %d, want = 1", got) + if err := linkRes.LinkAddressRequest(remoteAddr, "", remoteLinkAddr, &testInterface{nicID: nicID}); err != tcpip.ErrNotConnected { + t.Fatalf("got p.LinkAddressRequest(%s, %s, %s, _) = %s, want = %s", remoteAddr, "", remoteLinkAddr, err, tcpip.ErrNotConnected) } } diff --git a/pkg/tcpip/network/arp/stats.go b/pkg/tcpip/network/arp/stats.go new file mode 100644 index 000000000..6d7194c6c --- /dev/null +++ b/pkg/tcpip/network/arp/stats.go @@ -0,0 +1,70 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arp + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +var _ stack.NetworkEndpointStats = (*Stats)(nil) + +// Stats holds statistics related to ARP. +type Stats struct { + // ARP holds ARP statistics. + ARP tcpip.ARPStats +} + +// IsNetworkEndpointStats implements stack.NetworkEndpointStats. 
+func (*Stats) IsNetworkEndpointStats() {} + +type sharedStats struct { + localStats Stats + arp multiCounterARPStats +} + +// LINT.IfChange(multiCounterARPStats) + +type multiCounterARPStats struct { + packetsReceived tcpip.MultiCounterStat + disabledPacketsReceived tcpip.MultiCounterStat + malformedPacketsReceived tcpip.MultiCounterStat + requestsReceived tcpip.MultiCounterStat + requestsReceivedUnknownTargetAddress tcpip.MultiCounterStat + outgoingRequestInterfaceHasNoLocalAddressErrors tcpip.MultiCounterStat + outgoingRequestBadLocalAddressErrors tcpip.MultiCounterStat + outgoingRequestsDropped tcpip.MultiCounterStat + outgoingRequestsSent tcpip.MultiCounterStat + repliesReceived tcpip.MultiCounterStat + outgoingRepliesDropped tcpip.MultiCounterStat + outgoingRepliesSent tcpip.MultiCounterStat +} + +func (m *multiCounterARPStats) init(a, b *tcpip.ARPStats) { + m.packetsReceived.Init(a.PacketsReceived, b.PacketsReceived) + m.disabledPacketsReceived.Init(a.DisabledPacketsReceived, b.DisabledPacketsReceived) + m.malformedPacketsReceived.Init(a.MalformedPacketsReceived, b.MalformedPacketsReceived) + m.requestsReceived.Init(a.RequestsReceived, b.RequestsReceived) + m.requestsReceivedUnknownTargetAddress.Init(a.RequestsReceivedUnknownTargetAddress, b.RequestsReceivedUnknownTargetAddress) + m.outgoingRequestInterfaceHasNoLocalAddressErrors.Init(a.OutgoingRequestInterfaceHasNoLocalAddressErrors, b.OutgoingRequestInterfaceHasNoLocalAddressErrors) + m.outgoingRequestBadLocalAddressErrors.Init(a.OutgoingRequestBadLocalAddressErrors, b.OutgoingRequestBadLocalAddressErrors) + m.outgoingRequestsDropped.Init(a.OutgoingRequestsDropped, b.OutgoingRequestsDropped) + m.outgoingRequestsSent.Init(a.OutgoingRequestsSent, b.OutgoingRequestsSent) + m.repliesReceived.Init(a.RepliesReceived, b.RepliesReceived) + m.outgoingRepliesDropped.Init(a.OutgoingRepliesDropped, b.OutgoingRepliesDropped) + m.outgoingRepliesSent.Init(a.OutgoingRepliesSent, b.OutgoingRepliesSent) +} + +// LINT.ThenChange(../../tcpip.go:ARPStats) diff --git a/pkg/tcpip/network/arp/stats_test.go b/pkg/tcpip/network/arp/stats_test.go new file mode 100644 index 000000000..036fdf739 --- /dev/null +++ b/pkg/tcpip/network/arp/stats_test.go @@ -0,0 +1,93 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
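The new stats.go above binds every ARP counter to both the endpoint-local Stats and the stack-wide statistics through multi-counters, so a single Increment is visible at both levels. A minimal standalone sketch of that double-counting idea, with a toy counter type in place of tcpip.StatCounter and tcpip.MultiCounterStat:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// statCounter is a toy stand-in for an atomically updated counter.
type statCounter struct{ v uint64 }

func (s *statCounter) Increment()    { atomic.AddUint64(&s.v, 1) }
func (s *statCounter) Value() uint64 { return atomic.LoadUint64(&s.v) }

// multiCounterStat forwards every increment to several underlying counters,
// so one event shows up in both per-endpoint and stack-wide statistics.
type multiCounterStat struct {
	counters []*statCounter
}

func (m *multiCounterStat) init(counters ...*statCounter) { m.counters = counters }

func (m *multiCounterStat) Increment() {
	for _, c := range m.counters {
		c.Increment()
	}
}

func main() {
	var endpointLocal, stackWide statCounter

	var requestsReceived multiCounterStat
	requestsReceived.init(&endpointLocal, &stackWide)

	// One incoming ARP request is counted in both places.
	requestsReceived.Increment()
	fmt.Println(endpointLocal.Value(), stackWide.Value()) // 1 1
}
```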
+ +package arp + +import ( + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/testutil" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +var _ stack.NetworkInterface = (*testInterface)(nil) + +type testInterface struct { + stack.NetworkInterface + nicID tcpip.NICID +} + +func (t *testInterface) ID() tcpip.NICID { + return t.nicID +} + +func knownNICIDs(proto *protocol) []tcpip.NICID { + var nicIDs []tcpip.NICID + + for k := range proto.mu.eps { + nicIDs = append(nicIDs, k) + } + + return nicIDs +} + +func TestClearEndpointFromProtocolOnClose(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + }) + proto := s.NetworkProtocolInstance(ProtocolNumber).(*protocol) + nic := testInterface{nicID: 1} + ep := proto.NewEndpoint(&nic, nil, nil, nil).(*endpoint) + var nicIDs []tcpip.NICID + + proto.mu.Lock() + foundEP, hasEndpointBeforeClose := proto.mu.eps[nic.ID()] + nicIDs = knownNICIDs(proto) + proto.mu.Unlock() + + if !hasEndpointBeforeClose { + t.Fatalf("expected to find the nic id %d in the protocol's endpoint map (%v)", nic.ID(), nicIDs) + } + if foundEP != ep { + t.Fatalf("found an incorrect endpoint mapped to nic id %d", nic.ID()) + } + + ep.Close() + + proto.mu.Lock() + _, hasEndpointAfterClose := proto.mu.eps[nic.ID()] + nicIDs = knownNICIDs(proto) + proto.mu.Unlock() + if hasEndpointAfterClose { + t.Fatalf("unexpectedly found an endpoint mapped to the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) + } +} + +func TestMultiCounterStatsInitialization(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + }) + proto := s.NetworkProtocolInstance(ProtocolNumber).(*protocol) + var nic testInterface + ep := proto.NewEndpoint(&nic, nil, nil, nil).(*endpoint) + // At this point, the Stack's stats and the NetworkEndpoint's stats are + // expected to be bound by a MultiCounterStat. 
+ refStack := s.Stats() + refEP := ep.stats.localStats + if err := testutil.ValidateMultiCounterStats(reflect.ValueOf(&ep.stats.arp).Elem(), []reflect.Value{reflect.ValueOf(&refEP.ARP).Elem(), reflect.ValueOf(&refStack.ARP).Elem()}); err != nil { + t.Error(err) + } +} diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 3005973d7..2a6ec19dc 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -235,14 +235,14 @@ func buildIPv6Route(local, remote tcpip.Address) (*stack.Route, *tcpip.Error) { return s.FindRoute(nicID, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */) } -func buildDummyStackWithLinkEndpoint(t *testing.T) (*stack.Stack, *channel.Endpoint) { +func buildDummyStackWithLinkEndpoint(t *testing.T, mtu uint32) (*stack.Stack, *channel.Endpoint) { t.Helper() s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, }) - e := channel.New(0, 1280, "") + e := channel.New(1, mtu, "") if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } @@ -263,7 +263,7 @@ func buildDummyStackWithLinkEndpoint(t *testing.T) (*stack.Stack, *channel.Endpo func buildDummyStack(t *testing.T) *stack.Stack { t.Helper() - s, _ := buildDummyStackWithLinkEndpoint(t) + s, _ := buildDummyStackWithLinkEndpoint(t, header.IPv6MinimumMTU) return s } @@ -416,7 +416,7 @@ func TestSourceAddressValidation(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, e := buildDummyStackWithLinkEndpoint(t) + s, e := buildDummyStackWithLinkEndpoint(t, header.IPv6MinimumMTU) test.rxICMP(e, test.srcAddress) var wantValid uint64 @@ -1490,7 +1490,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{test.protoFactory}, }) - e := channel.New(1, 1280, "") + e := channel.New(1, header.IPv6MinimumMTU, "") if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } @@ -1526,3 +1526,246 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { }) } } + +// Test that the included data in an ICMP error packet conforms to the +// requirements of RFC 972, RFC 4443 section 2.4 and RFC 1812 Section 4.3.2.3 +func TestICMPInclusionSize(t *testing.T) { + const ( + replyHeaderLength4 = header.IPv4MinimumSize + header.IPv4MinimumSize + header.ICMPv4MinimumSize + replyHeaderLength6 = header.IPv6MinimumSize + header.IPv6MinimumSize + header.ICMPv6MinimumSize + targetSize4 = header.IPv4MinimumProcessableDatagramSize + targetSize6 = header.IPv6MinimumMTU + // A protocol number that will cause an error response. + reservedProtocol = 254 + ) + + // IPv4 function to create a IP packet and send it to the stack. + // The packet should generate an error response. We can do that by using an + // unknown transport protocol (254). 
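To make the size constants above concrete: an ICMP error reply wraps the quoted IP header of the offending packet inside a fresh IP header plus a minimal ICMP header. Assuming the usual sizes (20-byte IPv4 header, 40-byte IPv6 header, 8-byte minimal ICMP header, 576-byte IPv4 minimum processable datagram, 1280-byte IPv6 minimum MTU) in place of the header.* constants, the reply header overhead and the largest original payload that still fits an "exact match" reply work out as in this small sketch:

package main

import "fmt"

func main() {
	// Assumed sizes; the test uses the header.* constants instead.
	const (
		ipv4Hdr, icmpv4Hdr, minDatagram4 = 20, 8, 576
		ipv6Hdr, icmpv6Hdr, minMTU6      = 40, 8, 1280
	)

	replyHeaderLength4 := ipv4Hdr + icmpv4Hdr + ipv4Hdr // outer IPv4 + ICMPv4 + quoted IPv4 header
	replyHeaderLength6 := ipv6Hdr + icmpv6Hdr + ipv6Hdr // outer IPv6 + ICMPv6 + quoted IPv6 header

	// Largest payload of the injected packet (beyond its own IP header)
	// that can still be quoted in full within a minimum-size reply.
	fmt.Println(replyHeaderLength4, minDatagram4-replyHeaderLength4) // 48 528
	fmt.Println(replyHeaderLength6, minMTU6-replyHeaderLength6)      // 88 1192
}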
+ rxIPv4Bad := func(e *channel.Endpoint, src tcpip.Address, payload []byte) buffer.View { + totalLen := header.IPv4MinimumSize + len(payload) + hdr := buffer.NewPrependable(header.IPv4MinimumSize) + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + TotalLength: uint16(totalLen), + Protocol: reservedProtocol, + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: localIPv4Addr, + }) + ip.SetChecksum(^ip.CalculateChecksum()) + vv := hdr.View().ToVectorisedView() + vv.AppendView(buffer.View(payload)) + // Take a copy before InjectInbound takes ownership of vv + // as vv may be changed during the call. + v := vv.ToView() + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: vv, + })) + return v + } + + // IPv6 function to create a packet and send it to the stack. + // The packet should be errant in a way that causes the stack to send an + // ICMP error response and have enough data to allow the testing of the + // inclusion of the errant packet. Use `unknown next header' to generate + // the error. + rxIPv6Bad := func(e *channel.Endpoint, src tcpip.Address, payload []byte) buffer.View { + hdr := buffer.NewPrependable(header.IPv6MinimumSize) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(len(payload)), + TransportProtocol: reservedProtocol, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: localIPv6Addr, + }) + vv := hdr.View().ToVectorisedView() + vv.AppendView(buffer.View(payload)) + // Take a copy before InjectInbound takes ownership of vv + // as vv may be changed during the call. + v := vv.ToView() + + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: vv, + })) + return v + } + + v4Checker := func(t *testing.T, pkt *stack.PacketBuffer, payload buffer.View) { + // We already know the entire packet is the right size so we can use its + // length to calculate the right payload size to check. + expectedPayloadLength := pkt.Size() - header.IPv4MinimumSize - header.ICMPv4MinimumSize + checker.IPv4(t, stack.PayloadSince(pkt.NetworkHeader()), + checker.SrcAddr(localIPv4Addr), + checker.DstAddr(remoteIPv4Addr), + checker.IPv4HeaderLength(header.IPv4MinimumSize), + checker.IPFullLength(uint16(header.IPv4MinimumSize+header.ICMPv4MinimumSize+expectedPayloadLength)), + checker.ICMPv4( + checker.ICMPv4Checksum(), + checker.ICMPv4Type(header.ICMPv4DstUnreachable), + checker.ICMPv4Code(header.ICMPv4ProtoUnreachable), + checker.ICMPv4Payload(payload[:expectedPayloadLength]), + ), + ) + } + + v6Checker := func(t *testing.T, pkt *stack.PacketBuffer, payload buffer.View) { + // We already know the entire packet is the right size so we can use its + // length to calculate the right payload size to check. 
+ expectedPayloadLength := pkt.Size() - header.IPv6MinimumSize - header.ICMPv6MinimumSize + checker.IPv6(t, stack.PayloadSince(pkt.NetworkHeader()), + checker.SrcAddr(localIPv6Addr), + checker.DstAddr(remoteIPv6Addr), + checker.IPFullLength(uint16(header.IPv6MinimumSize+header.ICMPv6MinimumSize+expectedPayloadLength)), + checker.ICMPv6( + checker.ICMPv6Type(header.ICMPv6ParamProblem), + checker.ICMPv6Code(header.ICMPv6UnknownHeader), + checker.ICMPv6Payload(payload[:expectedPayloadLength]), + ), + ) + } + tests := []struct { + name string + srcAddress tcpip.Address + injector func(*channel.Endpoint, tcpip.Address, []byte) buffer.View + checker func(*testing.T, *stack.PacketBuffer, buffer.View) + payloadLength int // Not including IP header. + linkMTU uint32 // Largest IP packet that the link can send as payload. + replyLength int // Total size of IP/ICMP packet expected back. + }{ + { + name: "IPv4 exact match", + srcAddress: remoteIPv4Addr, + injector: rxIPv4Bad, + checker: v4Checker, + payloadLength: targetSize4 - replyHeaderLength4, + linkMTU: targetSize4, + replyLength: targetSize4, + }, + { + name: "IPv4 larger MTU", + srcAddress: remoteIPv4Addr, + injector: rxIPv4Bad, + checker: v4Checker, + payloadLength: targetSize4, + linkMTU: targetSize4 + 1000, + replyLength: targetSize4, + }, + { + name: "IPv4 smaller MTU", + srcAddress: remoteIPv4Addr, + injector: rxIPv4Bad, + checker: v4Checker, + payloadLength: targetSize4, + linkMTU: targetSize4 - 50, + replyLength: targetSize4 - 50, + }, + { + name: "IPv4 payload exceeds", + srcAddress: remoteIPv4Addr, + injector: rxIPv4Bad, + checker: v4Checker, + payloadLength: targetSize4 + 10, + linkMTU: targetSize4, + replyLength: targetSize4, + }, + { + name: "IPv4 1 byte less", + srcAddress: remoteIPv4Addr, + injector: rxIPv4Bad, + checker: v4Checker, + payloadLength: targetSize4 - replyHeaderLength4 - 1, + linkMTU: targetSize4, + replyLength: targetSize4 - 1, + }, + { + name: "IPv4 No payload", + srcAddress: remoteIPv4Addr, + injector: rxIPv4Bad, + checker: v4Checker, + payloadLength: 0, + linkMTU: targetSize4, + replyLength: replyHeaderLength4, + }, + { + name: "IPv6 exact match", + srcAddress: remoteIPv6Addr, + injector: rxIPv6Bad, + checker: v6Checker, + payloadLength: targetSize6 - replyHeaderLength6, + linkMTU: targetSize6, + replyLength: targetSize6, + }, + { + name: "IPv6 larger MTU", + srcAddress: remoteIPv6Addr, + injector: rxIPv6Bad, + checker: v6Checker, + payloadLength: targetSize6, + linkMTU: targetSize6 + 400, + replyLength: targetSize6, + }, + // NB. No "smaller MTU" test here as less than 1280 is not permitted + // in IPv6. + { + name: "IPv6 payload exceeds", + srcAddress: remoteIPv6Addr, + injector: rxIPv6Bad, + checker: v6Checker, + payloadLength: targetSize6, + linkMTU: targetSize6, + replyLength: targetSize6, + }, + { + name: "IPv6 1 byte less", + srcAddress: remoteIPv6Addr, + injector: rxIPv6Bad, + checker: v6Checker, + payloadLength: targetSize6 - replyHeaderLength6 - 1, + linkMTU: targetSize6, + replyLength: targetSize6 - 1, + }, + { + name: "IPv6 no payload", + srcAddress: remoteIPv6Addr, + injector: rxIPv6Bad, + checker: v6Checker, + payloadLength: 0, + linkMTU: targetSize6, + replyLength: replyHeaderLength6, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s, e := buildDummyStackWithLinkEndpoint(t, test.linkMTU) + // Allocate and initialize the payload view. 
+ payload := buffer.NewView(test.payloadLength) + for i := 0; i < len(payload); i++ { + payload[i] = uint8(i) + } + // Default routes for IPv4&6 so ICMP can find a route to the remote + // node when attempting to send the ICMP error Reply. + s.SetRouteTable([]tcpip.Route{ + { + Destination: header.IPv4EmptySubnet, + NIC: nicID, + }, + { + Destination: header.IPv6EmptySubnet, + NIC: nicID, + }, + }) + v := test.injector(e, test.srcAddress, payload) + pkt, ok := e.Read() + if !ok { + t.Fatal("expected a packet to be written") + } + if got, want := pkt.Pkt.Size(), test.replyLength; got != want { + t.Fatalf("got %d bytes of icmp error packet, want %d", got, want) + } + test.checker(t, pkt.Pkt, v) + }) + } +} diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 3f60de749..6bb97c46a 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ package ipv4 import ( - "errors" "fmt" "gvisor.dev/gvisor/pkg/tcpip" @@ -105,17 +104,12 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { } else { op = &optionUsageReceive{} } - aux, tmp, err := e.processIPOptions(pkt, opts, op) - if err != nil { - switch { - case - errors.Is(err, header.ErrIPv4OptDuplicate), - errors.Is(err, errIPv4RecordRouteOptInvalidLength), - errors.Is(err, errIPv4RecordRouteOptInvalidPointer), - errors.Is(err, errIPv4TimestampOptInvalidLength), - errors.Is(err, errIPv4TimestampOptInvalidPointer), - errors.Is(err, errIPv4TimestampOptOverflow): - _ = e.protocol.returnError(&icmpReasonParamProblem{pointer: aux}, pkt) + tmp, optProblem := e.processIPOptions(pkt, opts, op) + if optProblem != nil { + if optProblem.NeedICMP { + _ = e.protocol.returnError(&icmpReasonParamProblem{ + pointer: optProblem.Pointer, + }, pkt) e.protocol.stack.Stats().MalformedRcvdPackets.Increment() e.stats.ip.MalformedPacketsReceived.Increment() } @@ -442,13 +436,13 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi // systems implement the RFC 1812 definition and not the original // requirement. We treat 8 bytes as the minimum but will try send more. 
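A standalone sketch of the clamping the lines just below perform, with assumed values standing in for the header.* constants (20-byte IPv4 header, 8-byte ICMPv4 header, 576-byte minimum processable datagram, and at least 8 bytes of the original datagram to quote):

package main

import "fmt"

// icmpv4QuoteBudget mirrors the clamping logic, not the real returnError code.
func icmpv4QuoteBudget(routeMTU, origIPHdrLen int) (int, bool) {
	const (
		ipv4MinSize         = 20
		icmpv4MinSize       = 8
		minProcessableDgram = 576
		minErrorPayload     = 8
	)

	// Cap the reply's IP payload so the whole reply datagram stays within
	// the 576-byte minimum every IPv4 host must accept.
	mtu := routeMTU
	if limit := minProcessableDgram - ipv4MinSize; mtu > limit {
		mtu = limit
	}

	// Room left for the quoted packet once the ICMPv4 header is added.
	available := mtu - icmpv4MinSize
	if available < origIPHdrLen+minErrorPayload {
		return 0, false // not enough room to quote the header plus 8 bytes
	}
	return available, true
}

func main() {
	fmt.Println(icmpv4QuoteBudget(1500, 20)) // 548 true: quoted header plus up to 528 data bytes
	fmt.Println(icmpv4QuoteBudget(30, 20))   // 0 false: no error reply is worth sending
}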
mtu := int(route.MTU()) - if mtu > header.IPv4MinimumProcessableDatagramSize { - mtu = header.IPv4MinimumProcessableDatagramSize + const maxIPData = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize + if mtu > maxIPData { + mtu = maxIPData } - headerLen := int(route.MaxHeaderLength()) + header.ICMPv4MinimumSize - available := int(mtu) - headerLen + available := mtu - header.ICMPv4MinimumSize - if available < header.IPv4MinimumSize+header.ICMPv4MinimumErrorPayloadSize { + if available < len(origIPHdr)+header.ICMPv4MinimumErrorPayloadSize { return nil } @@ -471,7 +465,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi payload.CapLength(payloadLen) icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: headerLen, + ReserveHeaderBytes: int(route.MaxHeaderLength()) + header.ICMPv4MinimumSize, Data: payload, }) diff --git a/pkg/tcpip/network/ipv4/igmp.go b/pkg/tcpip/network/ipv4/igmp.go index 9515fde45..4550aacd6 100644 --- a/pkg/tcpip/network/ipv4/igmp.go +++ b/pkg/tcpip/network/ipv4/igmp.go @@ -157,14 +157,13 @@ func (igmp *igmpState) handleIGMP(pkt *stack.PacketBuffer) { } h := header.IGMP(headerView) - // Temporarily reset the checksum field to 0 in order to calculate the proper - // checksum. - wantChecksum := h.Checksum() - h.SetChecksum(0) - gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */) - h.SetChecksum(wantChecksum) - - if gotChecksum != wantChecksum { + // As per RFC 1071 section 1.3, + // + // To check a checksum, the 1's complement sum is computed over the + // same set of octets, including the checksum field. If the result + // is all 1 bits (-0 in 1's complement arithmetic), the check + // succeeds. + if header.ChecksumVV(pkt.Data, 0 /* initial */) != 0xFFFF { received.checksumErrors.Increment() return } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 7f03696ae..a05275a5b 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ package ipv4 import ( - "errors" "fmt" "math" "reflect" @@ -322,8 +321,8 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw // iptables filtering. All packets that reach here are locally // generated. - nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) - if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, gso, r, "", nicName); !ok { + outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, gso, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesOutputDropped.Increment() return nil @@ -437,10 +436,10 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe } } - nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) // iptables filtering. All packets that reach here are locally // generated. 
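The rewritten IGMP check earlier in this hunk leans on the RFC 1071 property that re-summing a correctly checksummed buffer, checksum field included, yields all one bits. A self-contained demonstration of that property over toy data (not using the tcpip checksum helpers):

package main

import "fmt"

// onesComplementSum folds the 16-bit one's-complement sum of buf (RFC 1071).
// An odd trailing byte is padded with zero.
func onesComplementSum(buf []byte) uint16 {
	var sum uint32
	for len(buf) >= 2 {
		sum += uint32(buf[0])<<8 | uint32(buf[1])
		buf = buf[2:]
	}
	if len(buf) == 1 {
		sum += uint32(buf[0]) << 8
	}
	for sum>>16 != 0 {
		sum = (sum >> 16) + (sum & 0xffff)
	}
	return uint16(sum)
}

func main() {
	// A toy 8-byte header with its checksum field (bytes 2-3) zeroed.
	pkt := []byte{0x11, 0x64, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x01}

	// Sender: the checksum is the one's complement of the sum.
	csum := ^onesComplementSum(pkt)
	pkt[2], pkt[3] = byte(csum>>8), byte(csum)

	// Receiver: summing the same bytes, checksum included, must give all
	// ones, which is the 0xFFFF comparison the IGMP code now performs.
	fmt.Printf("%#04x\n", onesComplementSum(pkt)) // 0xffff
}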
- dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, nicName) + dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, "", outNicName) if len(dropped) == 0 && len(natPkts) == 0 { // Fast path: If no packets are to be dropped then we can just invoke the // faster WritePackets API directly. @@ -612,7 +611,8 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // Loopback traffic skips the prerouting chain. if !e.nic.IsLoopback() { - if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, ""); !ok { + inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesPreroutingDropped.Increment() return @@ -699,7 +699,8 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. - if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "", ""); !ok { + inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "" /* preroutingAddr */, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.ip.IPTablesInputDropped.Increment() return @@ -780,17 +781,11 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { // TODO(gvisor.dev/issue/4586): // When we add forwarding support we should use the verified options // rather than just throwing them away. - aux, _, err := e.processIPOptions(pkt, opts, &optionUsageReceive{}) - if err != nil { - switch { - case - errors.Is(err, header.ErrIPv4OptDuplicate), - errors.Is(err, errIPv4RecordRouteOptInvalidPointer), - errors.Is(err, errIPv4RecordRouteOptInvalidLength), - errors.Is(err, errIPv4TimestampOptInvalidLength), - errors.Is(err, errIPv4TimestampOptInvalidPointer), - errors.Is(err, errIPv4TimestampOptOverflow): - _ = e.protocol.returnError(&icmpReasonParamProblem{pointer: aux}, pkt) + if _, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{}); optProblem != nil { + if optProblem.NeedICMP { + _ = e.protocol.returnError(&icmpReasonParamProblem{ + pointer: optProblem.Pointer, + }, pkt) e.protocol.stack.Stats().MalformedRcvdPackets.Increment() stats.ip.MalformedPacketsReceived.Increment() } @@ -1233,16 +1228,9 @@ func (*optionUsageEcho) actions() optionActions { } } -var ( - errIPv4TimestampOptInvalidLength = errors.New("invalid Timestamp length") - errIPv4TimestampOptInvalidPointer = errors.New("invalid Timestamp pointer") - errIPv4TimestampOptOverflow = errors.New("overflow in Timestamp") - errIPv4TimestampOptInvalidFlags = errors.New("invalid Timestamp flags") -) - // handleTimestamp does any required processing on a Timestamp option // in place. 
-func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) (uint8, error) { +func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem { flags := tsOpt.Flags() var entrySize uint8 switch flags { @@ -1253,7 +1241,10 @@ func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Addres header.IPv4OptionTimestampWithPredefinedIPFlag: entrySize = header.IPv4OptionTimestampWithAddrSize default: - return header.IPv4OptTSOFLWAndFLGOffset, errIPv4TimestampOptInvalidFlags + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptTSOFLWAndFLGOffset, + NeedICMP: true, + } } pointer := tsOpt.Pointer() @@ -1261,7 +1252,10 @@ func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Addres // Since the pointer is 1 based, and the header is 4 bytes long the // pointer must point beyond the header therefore 4 or less is bad. if pointer <= header.IPv4OptionTimestampHdrLength { - return header.IPv4OptTSPointerOffset, errIPv4TimestampOptInvalidPointer + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptTSPointerOffset, + NeedICMP: true, + } } // To simplify processing below, base further work on the array of timestamps // beyond the header, rather than on the whole option. Also to aid @@ -1295,14 +1289,17 @@ func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Addres // timestamp, but the overflow count is incremented by one. if flags == header.IPv4OptionTimestampWithPredefinedIPFlag { // By definition we have nothing to do. - return 0, nil + return nil } if tsOpt.IncOverflow() != 0 { - return 0, nil + return nil } // The overflow count is also full. - return header.IPv4OptTSOFLWAndFLGOffset, errIPv4TimestampOptOverflow + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptTSOFLWAndFLGOffset, + NeedICMP: true, + } } if nextSlot+entrySize > dataLength { // The data area isn't full but there isn't room for a new entry. @@ -1321,32 +1318,36 @@ func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Addres if dataLength%entrySize != 0 { // The Data section size should be a multiple of the expected // timestamp entry size. - return header.IPv4OptionLengthOffset, errIPv4TimestampOptInvalidLength + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptionLengthOffset, + NeedICMP: false, + } } // If the size is OK, the pointer must be corrupted. } - return header.IPv4OptTSPointerOffset, errIPv4TimestampOptInvalidPointer + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptTSPointerOffset, + NeedICMP: true, + } } if usage.actions().timestamp == optionProcess { tsOpt.UpdateTimestamp(localAddress, clock) } - return 0, nil + return nil } -var ( - errIPv4RecordRouteOptInvalidLength = errors.New("invalid length in Record Route") - errIPv4RecordRouteOptInvalidPointer = errors.New("invalid pointer in Record Route") -) - // handleRecordRoute checks and processes a Record route option. It is much // like the timestamp type 1 option, but without timestamps. The passed in // address is stored in the option in the correct spot if possible. 
-func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) (uint8, error) { +func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem { optlen := rrOpt.Size() if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength { - return header.IPv4OptionLengthOffset, errIPv4RecordRouteOptInvalidLength + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptionLengthOffset, + NeedICMP: true, + } } pointer := rrOpt.Pointer() @@ -1356,7 +1357,10 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad // Since the pointer is 1 based, and the header is 3 bytes long the // pointer must point beyond the header therefore 3 or less is bad. if pointer <= header.IPv4OptionRecordRouteHdrLength { - return header.IPv4OptRRPointerOffset, errIPv4RecordRouteOptInvalidPointer + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptRRPointerOffset, + NeedICMP: true, + } } // RFC 791 page 21 says @@ -1373,7 +1377,7 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad // of these words is a copy/paste error from the timestamp option where // there are two failure reasons given. if pointer > optlen { - return 0, nil + return nil } // The data area isn't full but there isn't room for a new entry. @@ -1398,17 +1402,23 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad // } if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 { // Length is bad, not on integral number of slots. - return header.IPv4OptionLengthOffset, errIPv4RecordRouteOptInvalidLength + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptionLengthOffset, + NeedICMP: true, + } } // If not length, the fault must be with the pointer. } - return header.IPv4OptRRPointerOffset, errIPv4RecordRouteOptInvalidPointer + return &header.IPv4OptParameterProblem{ + Pointer: header.IPv4OptRRPointerOffset, + NeedICMP: true, + } } if usage.actions().recordRoute == optionVerify { - return 0, nil + return nil } rrOpt.StoreAddress(localAddress) - return 0, nil + return nil } // processIPOptions parses the IPv4 options and produces a new set of options @@ -1419,7 +1429,7 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad // - The location of an error if there was one (or 0 if no error) // - If there is an error, information as to what it was was. // - The replacement option set. -func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Options, usage optionsUsage) (uint8, header.IPv4Options, error) { +func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Options, usage optionsUsage) (header.IPv4Options, *header.IPv4OptParameterProblem) { stats := e.stats.ip opts := header.IPv4Options(orig) optIter := opts.MakeIterator() @@ -1433,21 +1443,23 @@ func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Opt // This will need tweaking when we start really forwarding packets // as we may need to get two addresses, for rx and tx interfaces. // We will also have to take usage into account. 
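The option handlers above now report an offset relative to the option they were handed, together with a NeedICMP flag, instead of a sentinel error; the processIPOptions changes further down shift that offset to the option's position in the packet before an ICMP Parameter Problem is raised. A toy sketch of how the final pointer comes out, assuming a minimal 20-byte header and a Timestamp option placed first in the options area:

package main

import "fmt"

// paramProblem mirrors the shape of the values returned above: an offset to
// the offending byte and whether an ICMP Parameter Problem should be sent.
type paramProblem struct {
	Pointer  uint8
	NeedICMP bool
}

func main() {
	const (
		ipv4HeaderSize  = 20 // assumed: options start right after a minimal IPv4 header
		tsPointerOffset = 2  // the pointer byte within a Timestamp option
		optionStart     = 0  // the option is the first thing in the options area
	)

	// The option handler reports the problem relative to the option it was
	// given...
	fromHandler := paramProblem{Pointer: tsPointerOffset, NeedICMP: true}

	// ...and the caller shifts it to the option's position in the packet, so
	// the Parameter Problem pointer counts from the start of the IPv4 header,
	// which is what the updated test expectations assert.
	icmpPointer := ipv4HeaderSize + optionStart + int(fromHandler.Pointer)
	fmt.Println(fromHandler.NeedICMP, icmpPointer) // true 22
}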
- prefixedAddress, err := e.protocol.stack.GetMainNICAddress(e.nic.ID(), ProtocolNumber) + prefixedAddress, ok := e.protocol.stack.GetMainNICAddress(e.nic.ID(), ProtocolNumber) localAddress := prefixedAddress.Address - if err != nil { + if !ok { h := header.IPv4(pkt.NetworkHeader().View()) dstAddr := h.DestinationAddress() if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) { - return 0 /* errCursor */, nil, header.ErrIPv4OptionAddress + return nil, &header.IPv4OptParameterProblem{ + NeedICMP: false, + } } localAddress = dstAddr } for { - option, done, err := optIter.Next() - if done || err != nil { - return optIter.ErrCursor, optIter.Finalize(), err + option, done, optProblem := optIter.Next() + if done || optProblem != nil { + return optIter.Finalize(), optProblem } optType := option.Type() if optType == header.IPv4OptionNOPType { @@ -1456,12 +1468,15 @@ func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Opt } if optType == header.IPv4OptionListEndType { optIter.PushNOPOrEnd(optType) - return 0 /* errCursor */, optIter.Finalize(), nil /* err */ + return optIter.Finalize(), nil } // check for repeating options (multiple NOPs are OK) if seenOptions[optType] { - return optIter.ErrCursor, nil, header.ErrIPv4OptDuplicate + return nil, &header.IPv4OptParameterProblem{ + Pointer: optIter.ErrCursor, + NeedICMP: true, + } } seenOptions[optType] = true @@ -1473,9 +1488,9 @@ func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Opt clock := e.protocol.stack.Clock() newBuffer := optIter.RemainingBuffer()[:len(*option)] _ = copy(newBuffer, option.Contents()) - offset, err := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage) - if err != nil { - return optIter.ErrCursor + offset, nil, err + if optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage); optProblem != nil { + optProblem.Pointer += optIter.ErrCursor + return nil, optProblem } optIter.ConsumeBuffer(optLen) } @@ -1485,9 +1500,9 @@ func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Opt if usage.actions().recordRoute != optionRemove { newBuffer := optIter.RemainingBuffer()[:len(*option)] _ = copy(newBuffer, option.Contents()) - offset, err := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage) - if err != nil { - return optIter.ErrCursor + offset, nil, err + if optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage); optProblem != nil { + optProblem.Pointer += optIter.ErrCursor + return nil, optProblem } optIter.ConsumeBuffer(optLen) } diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index a9e137c24..dac7cbfd4 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -270,6 +270,11 @@ func TestIPv4Sanity(t *testing.T) { nicID = 1 randomSequence = 123 randomIdent = 42 + // In some cases Linux sets the error pointer to the start of the option + // (offset 0) instead of the actual wrong value, which is the length byte + // (offset 1). For compatibility we must do the same. Use this constant + // to indicate where this happens. 
+ pointerOffsetForInvalidLength = 0 ) var ( ipv4Addr = tcpip.AddressWithPrefix{ @@ -439,6 +444,21 @@ func TestIPv4Sanity(t *testing.T) { replyOptions: header.IPv4Options{}, }, { + name: "bad option - no length", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 1, 1, 1, 68, + // ^-start of timestamp.. but no length.. + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + 3, + }, + { name: "bad option - length 0", maxTotalLength: ipv4.MaxTotalSize, transportProtocol: uint8(header.ICMPv4ProtocolNumber), @@ -448,7 +468,27 @@ func TestIPv4Sanity(t *testing.T) { // ^ 1, 2, 3, 4, }, - shouldFail: true, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + pointerOffsetForInvalidLength, + }, + { + name: "bad option - length 1", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 68, 1, 9, 0, + // ^ + 1, 2, 3, 4, + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + pointerOffsetForInvalidLength, }, { name: "bad option - length big", @@ -462,7 +502,11 @@ func TestIPv4Sanity(t *testing.T) { // space is not possible. (Second byte) 1, 2, 3, 4, }, - shouldFail: true, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + pointerOffsetForInvalidLength, }, { // This tests for some linux compatible behaviour. @@ -484,7 +528,7 @@ func TestIPv4Sanity(t *testing.T) { expectErrorICMP: true, ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, - paramProblemPointer: header.IPv4MinimumSize + 2, + paramProblemPointer: header.IPv4MinimumSize + header.IPv4OptTSPointerOffset, }, { name: "multiple type 0 with room", @@ -589,7 +633,7 @@ func TestIPv4Sanity(t *testing.T) { expectErrorICMP: true, ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, - paramProblemPointer: header.IPv4MinimumSize + 2, + paramProblemPointer: header.IPv4MinimumSize + header.IPv4OptTSPointerOffset, }, { name: "valid timestamp pointer", @@ -624,7 +668,7 @@ func TestIPv4Sanity(t *testing.T) { expectErrorICMP: true, ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, - paramProblemPointer: header.IPv4MinimumSize + 2, + paramProblemPointer: header.IPv4MinimumSize + header.IPv4OptTSPointerOffset, }, // End of option list with illegal option after it, which should be ignored. { @@ -636,24 +680,31 @@ func TestIPv4Sanity(t *testing.T) { 68, 12, 13, 0x11, 192, 168, 1, 12, 1, 2, 3, 4, - 0, 10, 3, 99, + 0, 10, 3, 99, // EOL followed by junk }, replyOptions: header.IPv4Options{ 68, 12, 13, 0x21, 192, 168, 1, 12, 1, 2, 3, 4, - 0, 0, 0, 0, // 3 bytes unknown option - }, // ^ End of options hides following bytes. + 0, // End of Options hides following bytes. + 0, 0, 0, // 3 bytes unknown option removed. + }, }, { - // Timestamp with a size too small. + // Timestamp with a size much too small. 
name: "timestamp truncated", maxTotalLength: ipv4.MaxTotalSize, transportProtocol: uint8(header.ICMPv4ProtocolNumber), TTL: ttl, - options: header.IPv4Options{68, 1, 0, 0}, - // ^ Smallest possible is 8. - shouldFail: true, + options: header.IPv4Options{ + 68, 1, 0, 0, + // ^ Smallest possible is 8. Linux points at the 68. + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + pointerOffsetForInvalidLength, }, { name: "single record route with room", @@ -751,7 +802,7 @@ func TestIPv4Sanity(t *testing.T) { expectErrorICMP: true, ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, - paramProblemPointer: header.IPv4MinimumSize + 2, + paramProblemPointer: header.IPv4MinimumSize + header.IPv4OptRRPointerOffset, }, { // Pointer must be 4 or more as it must point past the 3 byte header @@ -769,7 +820,7 @@ func TestIPv4Sanity(t *testing.T) { expectErrorICMP: true, ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, - paramProblemPointer: header.IPv4MinimumSize + 2, + paramProblemPointer: header.IPv4MinimumSize + header.IPv4OptRRPointerOffset, }, { // Pointer must be 4 or more as it must point past the 3 byte header @@ -808,8 +859,7 @@ func TestIPv4Sanity(t *testing.T) { expectErrorICMP: true, ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, - paramProblemPointer: header.IPv4MinimumSize + 2, - replyOptions: header.IPv4Options{}, + paramProblemPointer: header.IPv4MinimumSize + header.IPv4OptRRPointerOffset, }, { name: "duplicate record route", @@ -828,7 +878,6 @@ func TestIPv4Sanity(t *testing.T) { ICMPType: header.ICMPv4ParamProblem, ICMPCode: header.ICMPv4UnusedCode, paramProblemPointer: header.IPv4MinimumSize + 7, - replyOptions: header.IPv4Options{}, }, } @@ -884,7 +933,6 @@ func TestIPv4Sanity(t *testing.T) { if test.maxTotalLength < totalLen { totalLen = test.maxTotalLength } - ip.Encode(&header.IPv4Fields{ TotalLength: totalLen, Protocol: test.transportProtocol, @@ -2608,7 +2656,7 @@ func (*limitedMatcher) Name() string { } // Match implements Matcher.Match. -func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string, string) (bool, bool) { if lm.limit == 0 { return true, false } diff --git a/pkg/tcpip/network/ipv4/stats.go b/pkg/tcpip/network/ipv4/stats.go index 7620728f9..bee72c649 100644 --- a/pkg/tcpip/network/ipv4/stats.go +++ b/pkg/tcpip/network/ipv4/stats.go @@ -35,7 +35,7 @@ type Stats struct { } // IsNetworkEndpointStats implements stack.NetworkEndpointStats. 
-func (s *Stats) IsNetworkEndpointStats() {} +func (*Stats) IsNetworkEndpointStats() {} // IPStats implements stack.IPNetworkEndointStats func (s *Stats) IPStats() *tcpip.IPStats { diff --git a/pkg/tcpip/network/ipv4/stats_test.go b/pkg/tcpip/network/ipv4/stats_test.go index 84641bcf4..b28e7dcde 100644 --- a/pkg/tcpip/network/ipv4/stats_test.go +++ b/pkg/tcpip/network/ipv4/stats_test.go @@ -34,7 +34,7 @@ func (t *testInterface) ID() tcpip.NICID { return t.nicID } -func getKnownNICIDs(proto *protocol) []tcpip.NICID { +func knownNICIDs(proto *protocol) []tcpip.NICID { var nicIDs []tcpip.NICID for k := range proto.mu.eps { @@ -51,30 +51,28 @@ func TestClearEndpointFromProtocolOnClose(t *testing.T) { proto := s.NetworkProtocolInstance(ProtocolNumber).(*protocol) nic := testInterface{nicID: 1} ep := proto.NewEndpoint(&nic, nil, nil, nil).(*endpoint) - { - proto.mu.Lock() - foundEP, hasEP := proto.mu.eps[nic.ID()] - nicIDs := getKnownNICIDs(proto) - proto.mu.Unlock() + var nicIDs []tcpip.NICID + + proto.mu.Lock() + foundEP, hasEndpointBeforeClose := proto.mu.eps[nic.ID()] + nicIDs = knownNICIDs(proto) + proto.mu.Unlock() - if !hasEP { - t.Fatalf("expected to find the nic id %d in the protocol's endpoint map (%v)", nic.ID(), nicIDs) - } - if foundEP != ep { - t.Fatalf("expected protocol to map endpoint %p to nic id %d, but endpoint %p was found instead", ep, nic.ID(), foundEP) - } + if !hasEndpointBeforeClose { + t.Fatalf("expected to find the nic id %d in the protocol's endpoint map (%v)", nic.ID(), nicIDs) + } + if foundEP != ep { + t.Fatalf("found an incorrect endpoint mapped to nic id %d", nic.ID()) } ep.Close() - { - proto.mu.Lock() - _, hasEP := proto.mu.eps[nic.ID()] - nicIDs := getKnownNICIDs(proto) - proto.mu.Unlock() - if hasEP { - t.Fatalf("unexpectedly found an endpoint mapped to the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) - } + proto.mu.Lock() + _, hasEP := proto.mu.eps[nic.ID()] + nicIDs = knownNICIDs(proto) + proto.mu.Unlock() + if hasEP { + t.Fatalf("unexpectedly found an endpoint mapped to the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) } } diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index ae5179d93..95efada3a 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -688,25 +688,38 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { // LinkAddressRequest implements stack.LinkAddressResolver. 
func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic stack.NetworkInterface) *tcpip.Error { + nicID := nic.ID() + + p.mu.Lock() + netEP, ok := p.mu.eps[nicID] + p.mu.Unlock() + if !ok { + return tcpip.ErrNotConnected + } + remoteAddr := targetAddr if len(remoteLinkAddr) == 0 { remoteAddr = header.SolicitedNodeAddr(targetAddr) remoteLinkAddr = header.EthernetAddressFromMulticastIPv6Address(remoteAddr) } - r, err := p.stack.FindRoute(nic.ID(), localAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */) - if err != nil { - return err + if len(localAddr) == 0 { + addressEndpoint := netEP.AcquireOutgoingPrimaryAddress(remoteAddr, false /* allowExpired */) + if addressEndpoint == nil { + return tcpip.ErrNetworkUnreachable + } + + localAddr = addressEndpoint.AddressWithPrefix().Address + } else if p.stack.CheckLocalAddress(nicID, ProtocolNumber, localAddr) == 0 { + return tcpip.ErrBadLocalAddress } - defer r.Release() - r.ResolveWith(remoteLinkAddr) optsSerializer := header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(nic.LinkAddress()), } neighborSolicitSize := header.ICMPv6NeighborSolicitMinimumSize + optsSerializer.Length() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborSolicitSize, + ReserveHeaderBytes: int(nic.MaxHeaderLength()) + header.IPv6FixedHeaderSize + neighborSolicitSize, }) pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber packet := header.ICMPv6(pkt.TransportHeader().Push(neighborSolicitSize)) @@ -714,20 +727,18 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot ns := header.NDPNeighborSolicit(packet.MessageBody()) ns.SetTargetAddress(targetAddr) ns.Options().Serialize(optsSerializer) - packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + packet.SetChecksum(header.ICMPv6Checksum(packet, localAddr, remoteAddr, buffer.VectorisedView{})) - p.mu.Lock() - netEP, ok := p.mu.eps[nic.ID()] - p.mu.Unlock() - if !ok { - return tcpip.ErrNotConnected + if err := addIPHeader(localAddr, remoteAddr, pkt, stack.NetworkHeaderParams{ + Protocol: header.ICMPv6ProtocolNumber, + TTL: header.NDPHopLimit, + }, header.IPv6ExtHdrSerializer{}); err != nil { + panic(fmt.Sprintf("failed to add IP header: %s", err)) } + stat := netEP.stats.icmp.packetsSent - if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{ - Protocol: header.ICMPv6ProtocolNumber, - TTL: header.NDPHopLimit, - }, pkt); err != nil { + if err := nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt); err != nil { stat.dropped.Increment() return err } @@ -910,11 +921,11 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi // the error message packet exceed the minimum IPv6 MTU // [IPv6]. 
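The clamp just below is the IPv6 counterpart of the IPv4 change: the reply's payload is capped so the whole error message never exceeds the 1280-byte IPv6 minimum MTU. With assumed sizes standing in for the header.* constants (40-byte fixed IPv6 header, 8-byte ICMPv6 error header), the budget works out as:

package main

import "fmt"

func main() {
	const minMTU, ipv6Hdr, icmpv6ErrHdr = 1280, 40, 8

	maxIPv6Data := minMTU - ipv6Hdr         // cap on the reply's IPv6 payload
	available := maxIPv6Data - icmpv6ErrHdr // room left for the quoted packet

	// Up to 1232 bytes of the offending packet fit; the reply is then
	// exactly one minimum-MTU datagram.
	fmt.Println(available, ipv6Hdr+icmpv6ErrHdr+available) // 1232 1280
}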
mtu := int(route.MTU()) - if mtu > header.IPv6MinimumMTU { - mtu = header.IPv6MinimumMTU + const maxIPv6Data = header.IPv6MinimumMTU - header.IPv6FixedHeaderSize + if mtu > maxIPv6Data { + mtu = maxIPv6Data } - headerLen := int(route.MaxHeaderLength()) + header.ICMPv6ErrorHeaderSize - available := int(mtu) - headerLen + available := mtu - header.ICMPv6ErrorHeaderSize if available < header.IPv6MinimumSize { return nil } @@ -928,7 +939,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi payload.CapLength(payloadLen) newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: headerLen, + ReserveHeaderBytes: int(route.MaxHeaderLength()) + header.ICMPv6ErrorHeaderSize, Data: payload, }) newPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index defea46b0..641c60b7c 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -15,6 +15,7 @@ package ipv6 import ( + "bytes" "context" "net" "reflect" @@ -22,6 +23,7 @@ import ( "testing" "time" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" @@ -638,7 +640,6 @@ func TestLinkResolution(t *testing.T) { pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize)) pkt.SetType(header.ICMPv6EchoRequest) pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - payload := tcpip.SlicePayload(hdr.View()) // We can't send our payload directly over the route because that // doesn't provoke NDP discovery. @@ -648,8 +649,12 @@ func TestLinkResolution(t *testing.T) { t.Fatalf("NewEndpoint(_) = (_, %s), want = (_, nil)", err) } - if _, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: nicID, Addr: lladdr1}}); err != nil { - t.Fatalf("ep.Write(_): %s", err) + { + var r bytes.Reader + r.Reset(hdr.View()) + if _, err := ep.Write(&r, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: nicID, Addr: lladdr1}}); err != nil { + t.Fatalf("ep.Write(_): %s", err) + } } for _, args := range []routeArgs{ {src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))}, @@ -1316,13 +1321,13 @@ func TestLinkAddressRequest(t *testing.T) { name: "Unicast with unassigned address", localAddr: lladdr1, remoteLinkAddr: linkAddr1, - expectedErr: tcpip.ErrNetworkUnreachable, + expectedErr: tcpip.ErrBadLocalAddress, }, { name: "Multicast with unassigned address", localAddr: lladdr1, remoteLinkAddr: "", - expectedErr: tcpip.ErrNetworkUnreachable, + expectedErr: tcpip.ErrBadLocalAddress, }, { name: "Unicast with no local address available", @@ -1337,58 +1342,58 @@ func TestLinkAddressRequest(t *testing.T) { } for _, test := range tests { - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, - }) - p := s.NetworkProtocolInstance(ProtocolNumber) - linkRes, ok := p.(stack.LinkAddressResolver) - if !ok { - t.Fatalf("expected IPv6 protocol to implement stack.LinkAddressResolver") - } + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + }) + p := s.NetworkProtocolInstance(ProtocolNumber) + linkRes, ok := p.(stack.LinkAddressResolver) + if !ok { + t.Fatalf("expected IPv6 protocol to implement stack.LinkAddressResolver") + } - linkEP := 
channel.New(defaultChannelSize, defaultMTU, linkAddr0) - if err := s.CreateNIC(nicID, linkEP); err != nil { - t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) - } - if len(test.nicAddr) != 0 { - if err := s.AddAddress(nicID, ProtocolNumber, test.nicAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ProtocolNumber, test.nicAddr, err) + linkEP := channel.New(defaultChannelSize, defaultMTU, linkAddr0) + if err := s.CreateNIC(nicID, linkEP); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + if len(test.nicAddr) != 0 { + if err := s.AddAddress(nicID, ProtocolNumber, test.nicAddr); err != nil { + t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ProtocolNumber, test.nicAddr, err) + } } - } - // We pass a test network interface to LinkAddressRequest with the same NIC - // ID and link endpoint used by the NIC we created earlier so that we can - // mock a link address request and observe the packets sent to the link - // endpoint even though the stack uses the real NIC. - if err := linkRes.LinkAddressRequest(lladdr0, test.localAddr, test.remoteLinkAddr, &testInterface{LinkEndpoint: linkEP, nicID: nicID}); err != test.expectedErr { - t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s, want = %s", lladdr0, test.localAddr, test.remoteLinkAddr, err, test.expectedErr) - } + // We pass a test network interface to LinkAddressRequest with the same NIC + // ID and link endpoint used by the NIC we created earlier so that we can + // mock a link address request and observe the packets sent to the link + // endpoint even though the stack uses the real NIC. + if err := linkRes.LinkAddressRequest(lladdr0, test.localAddr, test.remoteLinkAddr, &testInterface{LinkEndpoint: linkEP, nicID: nicID}); err != test.expectedErr { + t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s, want = %s", lladdr0, test.localAddr, test.remoteLinkAddr, err, test.expectedErr) + } - if test.expectedErr != nil { - return - } + if test.expectedErr != nil { + return + } - pkt, ok := linkEP.Read() - if !ok { - t.Fatal("expected to send a link address request") - } - if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr { - t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr) - } - if pkt.Route.RemoteAddress != test.expectedRemoteAddr { - t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, test.expectedRemoteAddr) - } - if pkt.Route.LocalAddress != lladdr1 { - t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", pkt.Route.LocalAddress, lladdr1) - } - checker.IPv6(t, stack.PayloadSince(pkt.Pkt.NetworkHeader()), - checker.SrcAddr(lladdr1), - checker.DstAddr(test.expectedRemoteAddr), - checker.TTL(header.NDPHopLimit), - checker.NDPNS( - checker.NDPNSTargetAddress(lladdr0), - checker.NDPNSOptions([]header.NDPOption{header.NDPSourceLinkLayerAddressOption(linkAddr0)}), - )) + pkt, ok := linkEP.Read() + if !ok { + t.Fatal("expected to send a link address request") + } + + var want stack.RouteInfo + want.NetProto = ProtocolNumber + want.RemoteLinkAddress = test.expectedRemoteLinkAddr + if diff := cmp.Diff(want, pkt.Route, cmp.AllowUnexported(want)); diff != "" { + t.Errorf("route info mismatch (-want +got):\n%s", diff) + } + checker.IPv6(t, stack.PayloadSince(pkt.Pkt.NetworkHeader()), + checker.SrcAddr(lladdr1), + checker.DstAddr(test.expectedRemoteAddr), + checker.TTL(header.NDPHopLimit), + checker.NDPNS( + checker.NDPNSTargetAddress(lladdr0), + 
checker.NDPNSOptions([]header.NDPOption{header.NDPSourceLinkLayerAddressOption(linkAddr0)}), + )) + }) } } diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 37884505e..d658f9bcb 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -555,7 +555,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { return e.nic.MaxHeaderLength() + header.IPv6MinimumSize } -func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, extensionHeaders header.IPv6ExtHdrSerializer) *tcpip.Error { +func addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, extensionHeaders header.IPv6ExtHdrSerializer) *tcpip.Error { extHdrsLen := extensionHeaders.Length() length := pkt.Size() + extensionHeaders.Length() if length > math.MaxUint16 { @@ -625,14 +625,14 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU ui // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { - if err := e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* extensionHeaders */); err != nil { + if err := addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* extensionHeaders */); err != nil { return err } // iptables filtering. All packets that reach here are locally // generated. - nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) - if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, gso, r, "", nicName); !ok { + outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, gso, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesOutputDropped.Increment() return nil @@ -718,7 +718,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe stats := e.stats.ip linkMTU := e.nic.MTU() for pb := pkts.Front(); pb != nil; pb = pb.Next() { - if err := e.addIPHeader(r.LocalAddress, r.RemoteAddress, pb, params, nil /* extensionHeaders */); err != nil { + if err := addIPHeader(r.LocalAddress, r.RemoteAddress, pb, params, nil /* extensionHeaders */); err != nil { return 0, err } @@ -747,8 +747,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe // iptables filtering. All packets that reach here are locally // generated. - nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) - dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, nicName) + outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, "" /* inNicName */, outNicName) if len(dropped) == 0 && len(natPkts) == 0 { // Fast path: If no packets are to be dropped then we can just invoke the // faster WritePackets API directly. @@ -897,7 +897,8 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // Loopback traffic skips the prerouting chain. 
if !e.nic.IsLoopback() { - if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, ""); !ok { + inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesPreroutingDropped.Increment() return @@ -955,7 +956,8 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { // iptables filtering. All packets that reach here are intended for // this machine and need not be forwarded. - if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "", ""); !ok { + inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "" /* preroutingAddr */, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesInputDropped.Increment() return diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index aa892d043..5276878a0 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -371,12 +371,10 @@ func TestAddIpv6Address(t *testing.T) { t.Fatalf("AddAddress(_, %d, nil) = %s", ProtocolNumber, err) } - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err) - } - if addr.Address != test.addr { - t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr.Address, test.addr) + if addr, ok := s.GetMainNICAddress(1, header.IPv6ProtocolNumber); !ok { + t.Fatalf("got stack.GetMainNICAddress(1, %d) = (_, false), want = (_, true)", header.IPv6ProtocolNumber) + } else if addr.Address != test.addr { + t.Fatalf("got stack.GetMainNICAddress(1_, %d) = (%s, true), want = (%s, true)", header.IPv6ProtocolNumber, addr.Address, test.addr) } }) } @@ -2575,7 +2573,7 @@ func (*limitedMatcher) Name() string { } // Match implements Matcher.Match. 
-func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string, string) (bool, bool) { if lm.limit == 0 { return true, false } @@ -2583,7 +2581,7 @@ func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, return false, false } -func getKnownNICIDs(proto *protocol) []tcpip.NICID { +func knownNICIDs(proto *protocol) []tcpip.NICID { var nicIDs []tcpip.NICID for k := range proto.mu.eps { @@ -2600,29 +2598,27 @@ func TestClearEndpointFromProtocolOnClose(t *testing.T) { proto := s.NetworkProtocolInstance(ProtocolNumber).(*protocol) var nic testInterface ep := proto.NewEndpoint(&nic, nil, nil, nil).(*endpoint) - { - proto.mu.Lock() - foundEP, hasEP := proto.mu.eps[nic.ID()] - nicIDs := getKnownNICIDs(proto) - proto.mu.Unlock() - if !hasEP { - t.Fatalf("expected to find the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) - } - if foundEP != ep { - t.Fatalf("expected protocol to map endpoint %p to nic id %d, but endpoint %p was found instead", ep, nic.ID(), foundEP) - } + var nicIDs []tcpip.NICID + + proto.mu.Lock() + foundEP, hasEndpointBeforeClose := proto.mu.eps[nic.ID()] + nicIDs = knownNICIDs(proto) + proto.mu.Unlock() + if !hasEndpointBeforeClose { + t.Fatalf("expected to find the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) + } + if foundEP != ep { + t.Fatalf("found an incorrect endpoint mapped to nic id %d", nic.ID()) } ep.Close() - { - proto.mu.Lock() - _, hasEP := proto.mu.eps[nic.ID()] - nicIDs := getKnownNICIDs(proto) - proto.mu.Unlock() - if hasEP { - t.Fatalf("unexpectedly found an endpoint mapped to the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) - } + proto.mu.Lock() + _, hasEndpointAfterClose := proto.mu.eps[nic.ID()] + nicIDs = knownNICIDs(proto) + proto.mu.Unlock() + if hasEndpointAfterClose { + t.Fatalf("unexpectedly found an endpoint mapped to the nic id %d in the protocol's known nic ids (%v)", nic.ID(), nicIDs) } } diff --git a/pkg/tcpip/network/ipv6/mld.go b/pkg/tcpip/network/ipv6/mld.go index 78d86e523..c376016e9 100644 --- a/pkg/tcpip/network/ipv6/mld.go +++ b/pkg/tcpip/network/ipv6/mld.go @@ -249,7 +249,7 @@ func (mld *mldState) writePacket(destAddress, groupAddress tcpip.Address, mldTyp Data: buffer.View(icmp).ToVectorisedView(), }) - if err := mld.ep.addIPHeader(localAddress, destAddress, pkt, stack.NetworkHeaderParams{ + if err := addIPHeader(localAddress, destAddress, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.MLDHopLimit, }, extensionHeaders); err != nil { diff --git a/pkg/tcpip/network/ipv6/ndp.go b/pkg/tcpip/network/ipv6/ndp.go index 41112a0c4..ca4ff621d 100644 --- a/pkg/tcpip/network/ipv6/ndp.go +++ b/pkg/tcpip/network/ipv6/ndp.go @@ -732,7 +732,7 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address, addressEndpoint stack.Add }) sent := ndp.ep.stats.icmp.packetsSent - if err := ndp.ep.addIPHeader(header.IPv6Any, snmc, pkt, stack.NetworkHeaderParams{ + if err := addIPHeader(header.IPv6Any, snmc, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, }, nil /* extensionHeaders */); err != nil { @@ -1857,7 +1857,7 @@ func (ndp *ndpState) startSolicitingRouters() { }) sent := ndp.ep.stats.icmp.packetsSent - if err := ndp.ep.addIPHeader(localAddr, header.IPv6AllRoutersMulticastAddress, pkt, stack.NetworkHeaderParams{ + if err := addIPHeader(localAddr, header.IPv6AllRoutersMulticastAddress, pkt, 
stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, }, nil /* extensionHeaders */); err != nil { diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go index b1a5a5510..7a22309e5 100644 --- a/pkg/tcpip/network/ipv6/ndp_test.go +++ b/pkg/tcpip/network/ipv6/ndp_test.go @@ -162,6 +162,11 @@ func TestStackNDPEndpointInvalidateDefaultRouter(t *testing.T) { } } +type linkResolutionResult struct { + linkAddr tcpip.LinkAddress + ok bool +} + // TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving a // valid NDP NS message with the Source Link Layer Address option results in a // new entry in the link address cache for the sender of the message. @@ -231,35 +236,28 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { Data: hdr.View().ToVectorisedView(), })) - linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) - if linkAddr != test.expectedLinkAddr { - t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr) - } - - if test.expectedLinkAddr != "" { - if err != nil { - t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err) - } - if c != nil { - t.Errorf("got unexpected channel") - } + ch := make(chan stack.LinkResolutionResult, 1) + err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, func(r stack.LinkResolutionResult) { + ch <- r + }) - // Invalid count should not have increased. - if got := invalid.Value(); got != 0 { - t.Errorf("got invalid = %d, want = 0", got) - } - } else { - if err != tcpip.ErrWouldBlock { - t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock) - } - if c == nil { - t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber) - } + wantInvalid := uint64(0) + wantErr := (*tcpip.Error)(nil) + wantSucccess := true + if len(test.expectedLinkAddr) == 0 { + wantInvalid = 1 + wantErr = tcpip.ErrWouldBlock + wantSucccess = false + } - // Invalid count should have increased. 
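The reworked NDP tests above replace the old (linkAddr, channel, error) return of GetLinkAddress with a completion callback that receives a stack.LinkResolutionResult. Below is a minimal, self-contained sketch of that callback-plus-buffered-channel testing pattern; the resolver and result types are stand-ins, not the gVisor APIs.

```go
package main

import "fmt"

type resolutionResult struct {
	linkAddr string
	success  bool
}

// resolve reports its outcome through onDone exactly once, mimicking how the
// updated GetLinkAddress delivers a LinkResolutionResult to its callback.
func resolve(addr string, onDone func(resolutionResult)) {
	if addr == "fe80::1" {
		onDone(resolutionResult{linkAddr: "02:03:04:05:06:07", success: true})
		return
	}
	onDone(resolutionResult{success: false})
}

func main() {
	// A buffered channel of size 1 lets the callback complete without
	// blocking and lets the caller assert on the result afterwards.
	ch := make(chan resolutionResult, 1)
	resolve("fe80::1", func(r resolutionResult) { ch <- r })
	fmt.Printf("%+v\n", <-ch)
}
```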
- if got := invalid.Value(); got != 1 { - t.Errorf("got invalid = %d, want = 1", got) - } + if err != wantErr { + t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, _) = %s, want = %s", nicID, lladdr1, lladdr0, ProtocolNumber, err, wantErr) + } + if diff := cmp.Diff(stack.LinkResolutionResult{LinkAddress: test.expectedLinkAddr, Success: wantSucccess}, <-ch); diff != "" { + t.Errorf("linkResolutionResult mismatch (-want +got):\n%s", diff) + } + if got := invalid.Value(); got != wantInvalid { + t.Errorf("got invalid = %d, want = %d", got, wantInvalid) } }) } @@ -640,18 +638,12 @@ func TestNeighorSolicitationResponse(t *testing.T) { t.Fatal("expected an NDP NS response") } - if p.Route.LocalAddress != nicAddr { - t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, nicAddr) - } - if p.Route.LocalLinkAddress != nicLinkAddr { - t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr) - } respNSDst := header.SolicitedNodeAddr(test.nsSrc) - if p.Route.RemoteAddress != respNSDst { - t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst) - } - if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want) + var want stack.RouteInfo + want.NetProto = ProtocolNumber + want.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(respNSDst) + if diff := cmp.Diff(want, p.Route, cmp.AllowUnexported(want)); diff != "" { + t.Errorf("route info mismatch (-want +got):\n%s", diff) } checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), @@ -803,35 +795,28 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { Data: hdr.View().ToVectorisedView(), })) - linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) - if linkAddr != test.expectedLinkAddr { - t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr) - } - - if test.expectedLinkAddr != "" { - if err != nil { - t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err) - } - if c != nil { - t.Errorf("got unexpected channel") - } + ch := make(chan stack.LinkResolutionResult, 1) + err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, func(r stack.LinkResolutionResult) { + ch <- r + }) - // Invalid count should not have increased. - if got := invalid.Value(); got != 0 { - t.Errorf("got invalid = %d, want = 0", got) - } - } else { - if err != tcpip.ErrWouldBlock { - t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock) - } - if c == nil { - t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber) - } + wantInvalid := uint64(0) + wantErr := (*tcpip.Error)(nil) + wantSucccess := true + if len(test.expectedLinkAddr) == 0 { + wantInvalid = 1 + wantErr = tcpip.ErrWouldBlock + wantSucccess = false + } - // Invalid count should have increased. 
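These tests also move from field-by-field assertions to a single cmp.Diff over a whole want/got struct. The sketch below shows the same idiom in isolation; it needs the github.com/google/go-cmp module, and the routeInfo type is an illustrative stand-in for structs with unexported fields such as stack.RouteInfo.

```go
package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
)

type routeInfo struct {
	remoteLinkAddr string
	netProto       int
}

func main() {
	want := routeInfo{remoteLinkAddr: "33:33:ff:00:00:01", netProto: 0x86dd}
	got := routeInfo{remoteLinkAddr: "33:33:ff:00:00:02", netProto: 0x86dd}

	// AllowUnexported is required because the fields are unexported;
	// without it cmp panics instead of comparing them.
	if diff := cmp.Diff(want, got, cmp.AllowUnexported(routeInfo{})); diff != "" {
		fmt.Printf("route info mismatch (-want +got):\n%s", diff)
	}
}
```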
- if got := invalid.Value(); got != 1 { - t.Errorf("got invalid = %d, want = 1", got) - } + if err != wantErr { + t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, _) = %s, want = %s", nicID, lladdr1, lladdr0, ProtocolNumber, err, wantErr) + } + if diff := cmp.Diff(stack.LinkResolutionResult{LinkAddress: test.expectedLinkAddr, Success: wantSucccess}, <-ch); diff != "" { + t.Errorf("linkResolutionResult mismatch (-want +got):\n%s", diff) + } + if got := invalid.Value(); got != wantInvalid { + t.Errorf("got invalid = %d, want = %d", got, wantInvalid) } }) } diff --git a/pkg/tcpip/network/ipv6/stats.go b/pkg/tcpip/network/ipv6/stats.go index a2f2f4f78..0839be3cd 100644 --- a/pkg/tcpip/network/ipv6/stats.go +++ b/pkg/tcpip/network/ipv6/stats.go @@ -32,7 +32,7 @@ type Stats struct { } // IsNetworkEndpointStats implements stack.NetworkEndpointStats. -func (s *Stats) IsNetworkEndpointStats() {} +func (*Stats) IsNetworkEndpointStats() {} // IPStats implements stack.IPNetworkEndointStats func (s *Stats) IPStats() *tcpip.IPStats { diff --git a/pkg/tcpip/network/testutil/BUILD b/pkg/tcpip/network/testutil/BUILD index 652b92a21..bd62c4482 100644 --- a/pkg/tcpip/network/testutil/BUILD +++ b/pkg/tcpip/network/testutil/BUILD @@ -9,6 +9,7 @@ go_library( "testutil_unsafe.go", ], visibility = [ + "//pkg/tcpip/network/arp:__pkg__", "//pkg/tcpip/network/fragmentation:__pkg__", "//pkg/tcpip/network/ipv4:__pkg__", "//pkg/tcpip/network/ipv6:__pkg__", diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index cf0a5fefe..db9b91815 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -8,7 +8,6 @@ go_binary( visibility = ["//:sandbox"], deps = [ "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/rawfile", diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 3b4f900e3..3d9954c84 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -41,7 +41,7 @@ package main import ( - "bufio" + "bytes" "fmt" "log" "math/rand" @@ -51,7 +51,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" @@ -71,24 +70,21 @@ func writer(ch chan struct{}, ep tcpip.Endpoint) { close(ch) }() - r := bufio.NewReader(os.Stdin) - for { - v := buffer.NewView(1024) - n, err := r.Read(v) - if err != nil { - return - } - - v.CapLength(n) - for len(v) > 0 { - n, err := ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) - if err != nil { - fmt.Println("Write failed:", err) - return + var b bytes.Buffer + if err := func() error { + for { + if _, err := b.ReadFrom(os.Stdin); err != nil { + return fmt.Errorf("b.ReadFrom failed: %w", err) } - v.TrimFront(int(n)) + for b.Len() != 0 { + if _, err := ep.Write(&b, tcpip.WriteOptions{Atomic: true}); err != nil { + return fmt.Errorf("ep.Write failed: %s", err) + } + } } + }(); err != nil { + fmt.Println(err) } } diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 3ac562756..ae9cf44e7 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -20,6 +20,7 @@ package main import ( + "bytes" "flag" "io" "log" @@ -58,7 +59,9 @@ func (e *tcpipError) Error() string { } func (e *endpointWriter) Write(p []byte) (int, error) { - n, err := 
e.ep.Write(tcpip.SlicePayload(p), tcpip.WriteOptions{}) + var r bytes.Reader + r.Reset(p) + n, err := e.ep.Write(&r, tcpip.WriteOptions{}) if err != nil { return int(n), &tcpipError{ inner: err, diff --git a/pkg/tcpip/socketops.go b/pkg/tcpip/socketops.go index f3ad40fdf..7eabbc599 100644 --- a/pkg/tcpip/socketops.go +++ b/pkg/tcpip/socketops.go @@ -15,11 +15,16 @@ package tcpip import ( + "math" "sync/atomic" "gvisor.dev/gvisor/pkg/sync" ) +// PacketOverheadFactor is used to multiply the value provided by the user on a +// SetSockOpt for setting the send/receive buffer sizes sockets. +const PacketOverheadFactor = 2 + // SocketOptionsHandler holds methods that help define endpoint specific // behavior for socket level socket options. These must be implemented by // endpoints to get notified when socket level options are set. @@ -48,6 +53,12 @@ type SocketOptionsHandler interface { // HasNIC is invoked to check if the NIC is valid for SO_BINDTODEVICE. HasNIC(v int32) bool + + // GetSendBufferSize is invoked to get the SO_SNDBUFSIZE. + GetSendBufferSize() (int64, *Error) + + // IsUnixSocket is invoked to check if the socket is of unix domain. + IsUnixSocket() bool } // DefaultSocketOptionsHandler is an embeddable type that implements no-op @@ -84,6 +95,27 @@ func (*DefaultSocketOptionsHandler) HasNIC(int32) bool { return false } +// GetSendBufferSize implements SocketOptionsHandler.GetSendBufferSize. +func (*DefaultSocketOptionsHandler) GetSendBufferSize() (int64, *Error) { + return 0, nil +} + +// IsUnixSocket implements SocketOptionsHandler.IsUnixSocket. +func (*DefaultSocketOptionsHandler) IsUnixSocket() bool { + return false +} + +// StackHandler holds methods to access the stack options. These must be +// implemented by the stack. +type StackHandler interface { + // Option allows retrieving stack wide options. + Option(option interface{}) *Error + + // TransportProtocolOption allows retrieving individual protocol level + // option values. + TransportProtocolOption(proto TransportProtocolNumber, option GettableTransportProtocolOption) *Error +} + // SocketOptions contains all the variables which store values for SOL_SOCKET, // SOL_IP, SOL_IPV6 and SOL_TCP level options. // @@ -91,6 +123,9 @@ func (*DefaultSocketOptionsHandler) HasNIC(int32) bool { type SocketOptions struct { handler SocketOptionsHandler + // StackHandler is initialized at the creation time and will not change. + stackHandler StackHandler `state:"manual"` + // These fields are accessed and modified using atomic operations. // broadcastEnabled determines whether datagram sockets are allowed to @@ -170,6 +205,14 @@ type SocketOptions struct { // bindToDevice determines the device to which the socket is bound. bindToDevice int32 + // getSendBufferLimits provides the handler to get the min, default and + // max size for send buffer. It is initialized at the creation time and + // will not change. + getSendBufferLimits GetSendBufferLimits `state:"manual"` + + // sendBufferSize determines the send buffer size for this socket. + sendBufferSize int64 + // mu protects the access to the below fields. mu sync.Mutex `state:"nosave"` @@ -180,8 +223,10 @@ type SocketOptions struct { // InitHandler initializes the handler. This must be called before using the // socket options utility. 
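The socketops changes above extend SocketOptionsHandler and give DefaultSocketOptionsHandler no-op implementations, so existing endpoints keep compiling while only the ones that care override the new hooks. A small, self-contained sketch of that embeddable-default pattern, with illustrative names rather than the real gVisor types:

```go
package main

import "fmt"

// handler mirrors the shape of SocketOptionsHandler; the method set here is
// trimmed to the two hooks added in this change.
type handler interface {
	IsUnixSocket() bool
	GetSendBufferSize() (int64, error)
}

// defaultHandler plays the role of DefaultSocketOptionsHandler: a no-op
// implementation meant to be embedded so endpoints only override what they use.
type defaultHandler struct{}

func (defaultHandler) IsUnixSocket() bool                { return false }
func (defaultHandler) GetSendBufferSize() (int64, error) { return 0, nil }

// unixEndpoint overrides just the hooks that matter to it.
type unixEndpoint struct {
	defaultHandler
	sndBuf int64
}

func (*unixEndpoint) IsUnixSocket() bool                  { return true }
func (e *unixEndpoint) GetSendBufferSize() (int64, error) { return e.sndBuf, nil }

func main() {
	var h handler = &unixEndpoint{sndBuf: 4096}
	sz, _ := h.GetSendBufferSize()
	fmt.Println(h.IsUnixSocket(), sz)
}
```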
-func (so *SocketOptions) InitHandler(handler SocketOptionsHandler) { +func (so *SocketOptions) InitHandler(handler SocketOptionsHandler, stack StackHandler, getSendBufferLimits GetSendBufferLimits) { so.handler = handler + so.stackHandler = stack + so.getSendBufferLimits = getSendBufferLimits } func storeAtomicBool(addr *uint32, v bool) { @@ -518,3 +563,44 @@ func (so *SocketOptions) SetBindToDevice(bindToDevice int32) *Error { atomic.StoreInt32(&so.bindToDevice, bindToDevice) return nil } + +// GetSendBufferSize gets value for SO_SNDBUF option. +func (so *SocketOptions) GetSendBufferSize() (int64, *Error) { + if so.handler.IsUnixSocket() { + return so.handler.GetSendBufferSize() + } + return atomic.LoadInt64(&so.sendBufferSize), nil +} + +// SetSendBufferSize sets value for SO_SNDBUF option. notify indicates if the +// stack handler should be invoked to set the send buffer size. +func (so *SocketOptions) SetSendBufferSize(sendBufferSize int64, notify bool) { + if so.handler.IsUnixSocket() { + return + } + + v := sendBufferSize + if notify { + // TODO(b/176170271): Notify waiters after size has grown. + // Make sure the send buffer size is within the min and max + // allowed. + ss := so.getSendBufferLimits(so.stackHandler) + min := int64(ss.Min) + max := int64(ss.Max) + // Validate the send buffer size with min and max values. + // Multiply it by factor of 2. + if v > max { + v = max + } + + if v < math.MaxInt32/PacketOverheadFactor { + v *= PacketOverheadFactor + if v < min { + v = min + } + } else { + v = math.MaxInt32 + } + } + atomic.StoreInt64(&so.sendBufferSize, v) +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 09c7811fa..04af933a6 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -267,11 +267,11 @@ const ( // dropped. // // TODO(gvisor.dev/issue/170): PacketBuffer should hold the GSO and route, from -// which address and nicName can be gathered. Currently, address is only -// needed for prerouting and nicName is only needed for output. +// which address can be gathered. Currently, address is only needed for +// prerouting. // // Precondition: pkt.NetworkHeader is set. -func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) bool { +func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) bool { if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { return true } @@ -302,7 +302,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, prer table = it.v4Tables[tableID] } ruleIdx := table.BuiltinChains[hook] - switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, inNicName, outNicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: continue @@ -385,10 +385,10 @@ func (it *IPTables) startReaper(interval time.Duration) { // // NOTE: unlike the Check API the returned map contains packets that should be // dropped. 
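SetSendBufferSize above clamps the requested value against the stack limits and multiplies it by PacketOverheadFactor, guarding against int32 overflow. The standalone sketch below mirrors that arithmetic; the limit values in main are made up for the demonstration.

```go
package main

import (
	"fmt"
	"math"
)

const packetOverheadFactor = 2

type sendBufferLimits struct{ min, max int64 }

// clampSendBufferSize mirrors the arithmetic in SetSendBufferSize: cap the
// request at the stack maximum, double it to cover per-packet overhead, floor
// it at the stack minimum, and never let the doubling overflow an int32.
func clampSendBufferSize(v int64, limits sendBufferLimits) int64 {
	if v > limits.max {
		v = limits.max
	}
	if v < math.MaxInt32/packetOverheadFactor {
		v *= packetOverheadFactor
		if v < limits.min {
			v = limits.min
		}
		return v
	}
	return math.MaxInt32
}

func main() {
	limits := sendBufferLimits{min: 4 << 10, max: 4 << 20}
	for _, req := range []int64{1024, 256 << 10, 1 << 40} {
		fmt.Println(req, "->", clampSendBufferSize(req, limits))
	}
}
```

Doubling the requested value follows the same convention Linux uses for SO_SNDBUF, where the kernel doubles the setsockopt value to account for bookkeeping overhead.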
-func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *Route, nicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { +func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *Route, inNicName, outNicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { if !pkt.NatDone { - if ok := it.Check(hook, pkt, gso, r, "", nicName); !ok { + if ok := it.Check(hook, pkt, gso, r, "", inNicName, outNicName); !ok { if drop == nil { drop = make(map[*PacketBuffer]struct{}) } @@ -408,11 +408,11 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r * // Preconditions: // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. -func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) chainVerdict { +func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { - switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { + switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, inNicName, outNicName); verdict { case RuleAccept: return chainAccept @@ -429,7 +429,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId ruleIdx++ continue } - switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, inNicName, outNicName); verdict { case chainAccept: return chainAccept case chainDrop: @@ -455,11 +455,11 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId // Preconditions: // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. -func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) (RuleVerdict, int) { +func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // Check whether the packet matches the IP header filter. - if !rule.Filter.match(pkt, hook, nicName) { + if !rule.Filter.match(pkt, hook, inNicName, outNicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } @@ -467,7 +467,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx // Go through each rule matcher. If they all match, run // the rule target. for _, matcher := range rule.Matchers { - matches, hotdrop := matcher.Match(hook, pkt, "") + matches, hotdrop := matcher.Match(hook, pkt, inNicName, outNicName) if hotdrop { return RuleDrop, 0 } diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index 56a3e7861..fd9d61e39 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -210,8 +210,19 @@ type IPHeaderFilter struct { // filter will match packets that fail the source comparison. 
SrcInvert bool - // OutputInterface matches the name of the outgoing interface for the - // packet. + // InputInterface matches the name of the incoming interface for the packet. + InputInterface string + + // InputInterfaceMask masks the characters of the interface name when + // comparing with InputInterface. + InputInterfaceMask string + + // InputInterfaceInvert inverts the meaning of incoming interface check, + // i.e. when true the filter will match packets that fail the incoming + // interface comparison. + InputInterfaceInvert bool + + // OutputInterface matches the name of the outgoing interface for the packet. OutputInterface string // OutputInterfaceMask masks the characters of the interface name when @@ -228,7 +239,7 @@ type IPHeaderFilter struct { // // Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4 // or IPv6 header length. -func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) bool { +func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, inNicName, outNicName string) bool { // Extract header fields. var ( // TODO(gvisor.dev/issue/170): Support other filter fields. @@ -264,26 +275,35 @@ func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) boo return false } - // Check the output interface. - // TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING - // hooks after supported. - if hook == Output { - n := len(fl.OutputInterface) - if n == 0 { - return true - } - - // If the interface name ends with '+', any interface which - // begins with the name should be matched. - ifName := fl.OutputInterface - matches := nicName == ifName - if strings.HasSuffix(ifName, "+") { - matches = strings.HasPrefix(nicName, ifName[:n-1]) - } - return fl.OutputInterfaceInvert != matches + switch hook { + case Prerouting, Input: + return matchIfName(inNicName, fl.InputInterface, fl.InputInterfaceInvert) + case Output: + return matchIfName(outNicName, fl.OutputInterface, fl.OutputInterfaceInvert) + case Forward, Postrouting: + // TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING + // hooks after supported. + return true + default: + panic(fmt.Sprintf("unknown hook: %d", hook)) } +} - return true +func matchIfName(nicName string, ifName string, invert bool) bool { + n := len(ifName) + if n == 0 { + // If the interface name is omitted in the filter, any interface will match. + return true + } + // If the interface name ends with '+', any interface which begins with the + // name should be matched. + var matches bool + if strings.HasSuffix(ifName, "+") { + matches = strings.HasPrefix(nicName, ifName[:n-1]) + } else { + matches = nicName == ifName + } + return matches != invert } // NetworkProtocol returns the protocol (IPv4 or IPv6) on to which the header @@ -320,7 +340,7 @@ type Matcher interface { // used for suspicious packets. // // Precondition: packet.NetworkHeader is set. - Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool) + Match(hook Hook, packet *PacketBuffer, inputInterfaceName, outputInterfaceName string) (matches bool, hotdrop bool) } // A Target is the interface for taking an action for a packet. diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 3c4fa341e..ba6d56a7d 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -32,6 +32,8 @@ var _ LinkAddressCache = (*linkAddrCache)(nil) // // This struct is safe for concurrent use. 
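matchIfName above is the heart of the new per-interface filtering: an empty filter matches everything, a trailing '+' turns the name into a prefix wildcard, and the invert flag flips the result. A runnable copy of that rule with a few illustrative inputs:

```go
package main

import (
	"fmt"
	"strings"
)

// matchIfName reproduces the interface-name matching rule from the hunk above.
func matchIfName(nicName, ifName string, invert bool) bool {
	if len(ifName) == 0 {
		// An omitted interface name matches any interface.
		return true
	}
	var matches bool
	if strings.HasSuffix(ifName, "+") {
		// A trailing '+' makes the rest of the name a prefix wildcard.
		matches = strings.HasPrefix(nicName, ifName[:len(ifName)-1])
	} else {
		matches = nicName == ifName
	}
	return matches != invert
}

func main() {
	fmt.Println(matchIfName("eth0", "eth+", false)) // true: prefix wildcard
	fmt.Println(matchIfName("lo", "eth+", false))   // false
	fmt.Println(matchIfName("lo", "eth+", true))    // true: inverted
	fmt.Println(matchIfName("eth0", "", false))     // true: empty filter
}
```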
type linkAddrCache struct { + nic *NIC + // ageLimit is how long a cache entry is valid for. ageLimit time.Duration @@ -79,6 +81,8 @@ type linkAddrEntry struct { // linkAddrEntryEntry access is synchronized by the linkAddrCache lock. linkAddrEntryEntry + cache *linkAddrCache + // TODO(gvisor.dev/issue/5150): move these fields under mu. // mu protects the fields below. mu sync.RWMutex @@ -93,17 +97,26 @@ type linkAddrEntry struct { done chan struct{} // onResolve is called with the result of address resolution. - onResolve []func(tcpip.LinkAddress, bool) + onResolve []func(LinkResolutionResult) } func (e *linkAddrEntry) notifyCompletionLocked(linkAddr tcpip.LinkAddress) { + res := LinkResolutionResult{LinkAddress: linkAddr, Success: len(linkAddr) != 0} for _, callback := range e.onResolve { - callback(linkAddr, len(linkAddr) != 0) + callback(res) } e.onResolve = nil if ch := e.done; ch != nil { close(ch) e.done = nil + // Dequeue the pending packets in a new goroutine to not hold up the current + // goroutine as writing packets may be a costly operation. + // + // At the time of writing, when writing packets, a neighbor's link address + // is resolved (which ends up obtaining the entry's lock) while holding the + // link resolution queue's lock. Dequeuing packets in a new goroutine avoids + // a lock ordering violation. + go e.cache.nic.linkResQueue.dequeue(ch, linkAddr, len(linkAddr) != 0) } } @@ -174,8 +187,9 @@ func (c *linkAddrCache) getOrCreateEntryLocked(k tcpip.Address) *linkAddrEntry { } *entry = linkAddrEntry{ - addr: k, - s: incomplete, + cache: c, + addr: k, + s: incomplete, } c.cache.table[k] = entry c.cache.lru.PushFront(entry) @@ -183,7 +197,7 @@ func (c *linkAddrCache) getOrCreateEntryLocked(k tcpip.Address) *linkAddrEntry { } // get reports any known link address for k. -func (c *linkAddrCache) get(k tcpip.Address, linkRes LinkAddressResolver, localAddr tcpip.Address, nic NetworkInterface, onResolve func(tcpip.LinkAddress, bool)) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { +func (c *linkAddrCache) get(k tcpip.Address, linkRes LinkAddressResolver, localAddr tcpip.Address, nic NetworkInterface, onResolve func(LinkResolutionResult)) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { c.cache.Lock() defer c.cache.Unlock() entry := c.getOrCreateEntryLocked(k) @@ -195,7 +209,7 @@ func (c *linkAddrCache) get(k tcpip.Address, linkRes LinkAddressResolver, localA if !time.Now().After(entry.expiration) { // Not expired. 
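notifyCompletionLocked above fans the resolution result out to every registered callback, closes the done channel exactly once, and pushes the packet-queue drain into a new goroutine so the entry lock is never held while the queue lock is taken. A simplified, self-contained sketch of that shape; names and types are stand-ins, not the gVisor code.

```go
package main

import (
	"fmt"
	"sync"
)

// entry stands in for a link-resolution cache entry: waiters register
// callbacks, and completion notifies them exactly once.
type entry struct {
	mu        sync.Mutex
	done      chan struct{}
	onResolve []func(linkAddr string, ok bool)
}

// notifyCompletion fans the result out to every callback, closes done once,
// and hands the packet-queue drain to a new goroutine so the entry lock is
// never held while the queue lock is acquired (the lock-ordering concern
// noted in the hunk above).
func (e *entry) notifyCompletion(linkAddr string, drainQueue func()) {
	e.mu.Lock()
	defer e.mu.Unlock()
	for _, cb := range e.onResolve {
		cb(linkAddr, linkAddr != "")
	}
	e.onResolve = nil
	if ch := e.done; ch != nil {
		close(ch)
		e.done = nil
		go drainQueue()
	}
}

func main() {
	e := &entry{done: make(chan struct{})}
	e.onResolve = append(e.onResolve, func(addr string, ok bool) {
		fmt.Println("callback:", addr, ok)
	})

	drained := make(chan struct{})
	e.notifyCompletion("02:03:04:05:06:07", func() {
		fmt.Println("dequeuing pending packets")
		close(drained)
	})
	<-drained
}
```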
if onResolve != nil { - onResolve(entry.linkAddr, true) + onResolve(LinkResolutionResult{LinkAddress: entry.linkAddr, Success: true}) } return entry.linkAddr, nil, nil } @@ -264,8 +278,9 @@ func (c *linkAddrCache) checkLinkRequest(now time.Time, k tcpip.Address, attempt return true } -func newLinkAddrCache(ageLimit, resolutionTimeout time.Duration, resolutionAttempts int) *linkAddrCache { +func newLinkAddrCache(nic *NIC, ageLimit, resolutionTimeout time.Duration, resolutionAttempts int) *linkAddrCache { c := &linkAddrCache{ + nic: nic, ageLimit: ageLimit, resolutionTimeout: resolutionTimeout, resolutionAttempts: resolutionAttempts, diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 8c35067c6..88fbbf3fe 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -93,8 +93,14 @@ func getBlocking(c *linkAddrCache, addr tcpip.Address, linkRes LinkAddressResolv } } +func newEmptyNIC() *NIC { + n := &NIC{} + n.linkResQueue.init(n) + return n +} + func TestCacheOverflow(t *testing.T) { - c := newLinkAddrCache(1<<63-1, 1*time.Second, 3) + c := newLinkAddrCache(newEmptyNIC(), 1<<63-1, 1*time.Second, 3) for i := len(testAddrs) - 1; i >= 0; i-- { e := testAddrs[i] c.AddLinkAddress(e.addr, e.linkAddr) @@ -129,7 +135,7 @@ func TestCacheOverflow(t *testing.T) { } func TestCacheConcurrent(t *testing.T) { - c := newLinkAddrCache(1<<63-1, 1*time.Second, 3) + c := newLinkAddrCache(newEmptyNIC(), 1<<63-1, 1*time.Second, 3) linkRes := &testLinkAddressResolver{cache: c} var wg sync.WaitGroup @@ -165,7 +171,7 @@ func TestCacheConcurrent(t *testing.T) { } func TestCacheAgeLimit(t *testing.T) { - c := newLinkAddrCache(1*time.Millisecond, 1*time.Second, 3) + c := newLinkAddrCache(newEmptyNIC(), 1*time.Millisecond, 1*time.Second, 3) linkRes := &testLinkAddressResolver{cache: c} e := testAddrs[0] @@ -177,7 +183,7 @@ func TestCacheAgeLimit(t *testing.T) { } func TestCacheReplace(t *testing.T) { - c := newLinkAddrCache(1<<63-1, 1*time.Second, 3) + c := newLinkAddrCache(newEmptyNIC(), 1<<63-1, 1*time.Second, 3) e := testAddrs[0] l2 := e.linkAddr + "2" c.AddLinkAddress(e.addr, e.linkAddr) @@ -206,7 +212,7 @@ func TestCacheResolution(t *testing.T) { // // Using a large resolution timeout decreases the probability of experiencing // this race condition and does not affect how long this test takes to run. 
- c := newLinkAddrCache(1<<63-1, math.MaxInt64, 1) + c := newLinkAddrCache(newEmptyNIC(), 1<<63-1, math.MaxInt64, 1) linkRes := &testLinkAddressResolver{cache: c} for i, ta := range testAddrs { got, err := getBlocking(c, ta.addr, linkRes) @@ -232,7 +238,7 @@ func TestCacheResolution(t *testing.T) { } func TestCacheResolutionFailed(t *testing.T) { - c := newLinkAddrCache(1<<63-1, 10*time.Millisecond, 5) + c := newLinkAddrCache(newEmptyNIC(), 1<<63-1, 10*time.Millisecond, 5) linkRes := &testLinkAddressResolver{cache: c} var requestCount uint32 @@ -265,7 +271,7 @@ func TestCacheResolutionFailed(t *testing.T) { func TestCacheResolutionTimeout(t *testing.T) { resolverDelay := 500 * time.Millisecond expiration := resolverDelay / 10 - c := newLinkAddrCache(expiration, 1*time.Millisecond, 3) + c := newLinkAddrCache(newEmptyNIC(), expiration, 1*time.Millisecond, 3) linkRes := &testLinkAddressResolver{cache: c, delay: resolverDelay} e := testAddrs[0] diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 270f5fb1a..d7bbb25ea 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -45,6 +45,8 @@ const ( linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08") linkAddr4 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09") + defaultPrefixLen = 128 + // Extra time to use when waiting for an async event to occur. defaultAsyncPositiveEventTimeout = 10 * time.Second @@ -330,8 +332,12 @@ func TestDADDisabled(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) + addrWithPrefix := tcpip.AddressWithPrefix{ + Address: addr1, + PrefixLen: defaultPrefixLen, + } + if err := s.AddAddressWithPrefix(nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { + t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addrWithPrefix, err) } // Should get the address immediately since we should not have performed @@ -344,12 +350,8 @@ func TestDADDisabled(t *testing.T) { default: t.Fatal("expected DAD event") } - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(%d, %d) err = %s", nicID, header.IPv6ProtocolNumber, err) - } - if addr.Address != addr1 { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { + t.Fatal(err) } // We should not have sent any NDP NS messages. @@ -440,24 +442,24 @@ func TestDADResolve(t *testing.T) { NIC: nicID, }}) - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) + addrWithPrefix := tcpip.AddressWithPrefix{ + Address: addr1, + PrefixLen: defaultPrefixLen, + } + if err := s.AddAddressWithPrefix(nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { + t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addrWithPrefix, err) } // Address should not be considered bound to the NIC yet (DAD ongoing). 
- if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } else if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Make sure the address does not resolve before the resolution time has // passed. time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - defaultAsyncNegativeEventTimeout) - if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } else if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Error(err) } // Should not get a route even if we specify the local address as the // tentative address. @@ -493,10 +495,8 @@ func TestDADResolve(t *testing.T) { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } else if addr.Address != addr1 { - t.Errorf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { + t.Error(err) } // Should get a route using the address now that it is resolved. { @@ -662,12 +662,8 @@ func TestDADFail(t *testing.T) { // Address should not be considered bound to the NIC yet // (DAD ongoing). - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Receive a packet to simulate an address conflict. @@ -691,12 +687,8 @@ func TestDADFail(t *testing.T) { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Attempting to add the address again should not fail if the address's @@ -777,12 +769,8 @@ func TestDADStop(t *testing.T) { } // Address should not be considered bound to the NIC yet (DAD ongoing). 
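The DAD tests above now funnel every address assertion through a checkGetMainNICAddress helper whose definition falls outside this hunk. Below is one plausible shape for it, assuming only the new (tcpip.AddressWithPrefix, bool) return of GetMainNICAddress seen earlier in this diff; the package placement and the wording of the errors are illustrative, not the elided code.

```go
package stack_test

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// checkGetMainNICAddress returns an error instead of failing the test
// directly, so callers can choose between t.Fatal and t.Error.
func checkGetMainNICAddress(s *stack.Stack, nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber, want tcpip.AddressWithPrefix) error {
	if got, ok := s.GetMainNICAddress(nicID, proto); !ok {
		return fmt.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, false), want = (_, true)", nicID, proto)
	} else if got != want {
		return fmt.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, true), want = (%s, true)", nicID, proto, got, want)
	}
	return nil
}
```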
- addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } test.stopFn(t, s) @@ -800,12 +788,8 @@ func TestDADStop(t *testing.T) { } if !test.skipFinalAddrCheck { - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } } @@ -901,26 +885,25 @@ func TestSetNDPConfigurations(t *testing.T) { } // Add addresses for each NIC. - if err := s.AddAddress(nicID1, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addr1, err) + addrWithPrefix1 := tcpip.AddressWithPrefix{Address: addr1, PrefixLen: defaultPrefixLen} + if err := s.AddAddressWithPrefix(nicID1, header.IPv6ProtocolNumber, addrWithPrefix1); err != nil { + t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addrWithPrefix1, err) } - if err := s.AddAddress(nicID2, header.IPv6ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addr2, err) + addrWithPrefix2 := tcpip.AddressWithPrefix{Address: addr2, PrefixLen: defaultPrefixLen} + if err := s.AddAddressWithPrefix(nicID2, header.IPv6ProtocolNumber, addrWithPrefix2); err != nil { + t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addrWithPrefix2, err) } expectDADEvent(nicID2, addr2) - if err := s.AddAddress(nicID3, header.IPv6ProtocolNumber, addr3); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addr3, err) + addrWithPrefix3 := tcpip.AddressWithPrefix{Address: addr3, PrefixLen: defaultPrefixLen} + if err := s.AddAddressWithPrefix(nicID3, header.IPv6ProtocolNumber, addrWithPrefix3); err != nil { + t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addrWithPrefix3, err) } expectDADEvent(nicID3, addr3) // Address should not be considered bound to NIC(1) yet // (DAD ongoing). - addr, err := s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID1, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Should get the address on NIC(2) and NIC(3) @@ -928,31 +911,19 @@ func TestSetNDPConfigurations(t *testing.T) { // it as the stack was configured to not do DAD by // default and we only updated the NDP configurations on // NIC(1). 
- addr, err = s.GetMainNICAddress(nicID2, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID2, header.IPv6ProtocolNumber, err) - } - if addr.Address != addr2 { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID2, header.IPv6ProtocolNumber, addr, addr2) + if err := checkGetMainNICAddress(s, nicID2, header.IPv6ProtocolNumber, addrWithPrefix2); err != nil { + t.Fatal(err) } - addr, err = s.GetMainNICAddress(nicID3, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID3, header.IPv6ProtocolNumber, err) - } - if addr.Address != addr3 { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID3, header.IPv6ProtocolNumber, addr, addr3) + if err := checkGetMainNICAddress(s, nicID3, header.IPv6ProtocolNumber, addrWithPrefix3); err != nil { + t.Fatal(err) } // Sleep until right (500ms before) before resolution to // make sure the address didn't resolve on NIC(1) yet. const delta = 500 * time.Millisecond time.Sleep(time.Duration(test.dupAddrDetectTransmits)*test.expectedRetransmitTimer - delta) - addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID1, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Wait for DAD to resolve. @@ -970,12 +941,8 @@ func TestSetNDPConfigurations(t *testing.T) { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err) - } - if addr.Address != addr1 { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID1, header.IPv6ProtocolNumber, addr, addr1) + if err := checkGetMainNICAddress(s, nicID1, header.IPv6ProtocolNumber, addrWithPrefix1); err != nil { + t.Fatal(err) } }) } @@ -2946,10 +2913,8 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { t.Helper() - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if got != addr { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addr); err != nil { + t.Fatal(err) } if got := addrForNewConnection(t, s); got != addr.Address { @@ -3094,10 +3059,8 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { t.Helper() - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if got != addr { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addr); err != nil { + t.Fatal(err) } if got := addrForNewConnection(t, s); got != 
addr.Address { @@ -3244,10 +3207,8 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { t.Fatalf("should not have %s in the list of addresses", addr2) } // Should not have any primary endpoints. - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if want := (tcpip.AddressWithPrefix{}); got != want { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } wq := waiter.Queue{} we, ch := waiter.NewChannelEntry(nil) @@ -3621,10 +3582,8 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) { expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { t.Helper() - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if got != addr { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addr); err != nil { + t.Fatal(err) } if got := addrForNewConnection(t, s); got != addr.Address { diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go index acee72572..204196d00 100644 --- a/pkg/tcpip/stack/neighbor_cache.go +++ b/pkg/tcpip/stack/neighbor_cache.go @@ -126,7 +126,7 @@ func (n *neighborCache) getOrCreateEntry(remoteAddr tcpip.Address, linkRes LinkA // packet prompting NUD/link address resolution. // // TODO(gvisor.dev/issue/5151): Don't return the neighbor entry. -func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver, onResolve func(tcpip.LinkAddress, bool)) (NeighborEntry, <-chan struct{}, *tcpip.Error) { +func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver, onResolve func(LinkResolutionResult)) (NeighborEntry, <-chan struct{}, *tcpip.Error) { entry := n.getOrCreateEntry(remoteAddr, linkRes) entry.mu.Lock() defer entry.mu.Unlock() @@ -142,7 +142,7 @@ func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkA // a node continues sending packets to that neighbor using the cached // link-layer address." 
if onResolve != nil { - onResolve(entry.neigh.LinkAddr, true) + onResolve(LinkResolutionResult{LinkAddress: entry.neigh.LinkAddr, Success: true}) } return entry.neigh, nil, nil case Unknown, Incomplete, Failed: diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go index db27cbc73..dbdb51bb4 100644 --- a/pkg/tcpip/stack/neighbor_cache_test.go +++ b/pkg/tcpip/stack/neighbor_cache_test.go @@ -1188,12 +1188,9 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { if !ok { t.Fatalf("store.entry(%d) not found", i) } - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if !ok { - t.Fatal("expected successful address resolution") - } - if linkAddr != entry.LinkAddr { - t.Fatalf("got linkAddr = %s, want = %s", linkAddr, entry.LinkAddr) + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{LinkAddress: entry.LinkAddr, Success: true}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1247,12 +1244,9 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { t.Fatalf("store.entry(%d) not found", i) } - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if !ok { - t.Fatal("expected successful address resolution") - } - if linkAddr != entry.LinkAddr { - t.Fatalf("got linkAddr = %s, want = %s", linkAddr, entry.LinkAddr) + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{LinkAddress: entry.LinkAddr, Success: true}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1423,12 +1417,9 @@ func TestNeighborCacheReplace(t *testing.T) { t.Fatal("store.entry(0) not found") } - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if !ok { - t.Fatal("expected successful address resolution") - } - if linkAddr != entry.LinkAddr { - t.Fatalf("got linkAddr = %s, want = %s", linkAddr, entry.LinkAddr) + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{LinkAddress: entry.LinkAddr, Success: true}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1539,12 +1530,9 @@ func TestNeighborCacheResolutionFailed(t *testing.T) { // First, sanity check that resolution is working { - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if !ok { - t.Fatal("expected successful address resolution") - } - if linkAddr != entry.LinkAddr { - t.Fatalf("got linkAddr = %s, want = %s", linkAddr, entry.LinkAddr) + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{LinkAddress: entry.LinkAddr, Success: true}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1576,15 +1564,9 @@ func TestNeighborCacheResolutionFailed(t *testing.T) { entry.Addr += "2" { - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if ok { - t.Error("expected unsuccessful address resolution") - } - if len(linkAddr) != 0 { - t.Fatalf("got linkAddr = %s, want = \"\"", linkAddr) - } - if t.Failed() { 
- t.FailNow() + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{Success: false}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1627,15 +1609,9 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) { t.Fatal("store.entry(0) not found") } - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if ok { - t.Error("expected unsuccessful address resolution") - } - if len(linkAddr) != 0 { - t.Fatalf("got linkAddr = %s, want = \"\"", linkAddr) - } - if t.Failed() { - t.FailNow() + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{Success: false}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1674,15 +1650,9 @@ func TestNeighborCacheRetryResolution(t *testing.T) { // Perform address resolution with a faulty link, which will fail. { - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if ok { - t.Error("expected unsuccessful address resolution") - } - if len(linkAddr) != 0 { - t.Fatalf("got linkAddr = %s, want = \"\"", linkAddr) - } - if t.Failed() { - t.FailNow() + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{Success: false}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1713,9 +1683,9 @@ func TestNeighborCacheRetryResolution(t *testing.T) { // Retry address resolution with a working link. linkRes.dropReplies = false { - incompleteEntry, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if linkAddr != entry.LinkAddr { - t.Fatalf("got linkAddr = %s, want = %s", linkAddr, entry.LinkAddr) + incompleteEntry, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{LinkAddress: entry.LinkAddr, Success: true}, r); diff != "" { + t.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { @@ -1772,12 +1742,9 @@ func BenchmarkCacheClear(b *testing.B) { b.Fatalf("store.entry(%d) not found", i) } - _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(linkAddr tcpip.LinkAddress, ok bool) { - if !ok { - b.Fatal("expected successful address resolution") - } - if linkAddr != entry.LinkAddr { - b.Fatalf("got linkAddr = %s, want = %s", linkAddr, entry.LinkAddr) + _, ch, err := neigh.entry(entry.Addr, "", linkRes, func(r LinkResolutionResult) { + if diff := cmp.Diff(LinkResolutionResult{LinkAddress: entry.LinkAddr, Success: true}, r); diff != "" { + b.Fatalf("got link resolution result mismatch (-want +got):\n%s", diff) } }) if err != tcpip.ErrWouldBlock { diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go index 75afb3001..53ac9bb6e 100644 --- a/pkg/tcpip/stack/neighbor_entry.go +++ b/pkg/tcpip/stack/neighbor_entry.go @@ -96,7 +96,7 @@ type neighborEntry struct { done chan struct{} // onResolve is called with the result of address resolution. 
- onResolve []func(tcpip.LinkAddress, bool) + onResolve []func(LinkResolutionResult) isRouter bool job *tcpip.Job @@ -143,13 +143,22 @@ func newStaticNeighborEntry(nic *NIC, addr tcpip.Address, linkAddr tcpip.LinkAdd // // Precondition: e.mu MUST be locked. func (e *neighborEntry) notifyCompletionLocked(succeeded bool) { + res := LinkResolutionResult{LinkAddress: e.neigh.LinkAddr, Success: succeeded} for _, callback := range e.onResolve { - callback(e.neigh.LinkAddr, succeeded) + callback(res) } e.onResolve = nil if ch := e.done; ch != nil { close(ch) e.done = nil + // Dequeue the pending packets in a new goroutine to not hold up the current + // goroutine as writing packets may be a costly operation. + // + // At the time of writing, when writing packets, a neighbor's link address + // is resolved (which ends up obtaining the entry's lock) while holding the + // link resolution queue's lock. Dequeuing packets in a new goroutine avoids + // a lock ordering violation. + go e.nic.linkResQueue.dequeue(ch, e.neigh.LinkAddr, succeeded) } } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index f2bca93d3..1bbfe6213 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -139,9 +139,9 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC context: ctx, stats: makeNICStats(), networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint), - linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts), } - nic.linkResQueue.init() + nic.linkResQueue.init(nic) + nic.linkAddrCache = newLinkAddrCache(nic, ageLimit, resolutionTimeout, resolutionAttempts) nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber]*packetEndpointList) // Check for Neighbor Unreachability Detection support. @@ -303,6 +303,10 @@ func (n *NIC) IsLoopback() bool { // WritePacket implements NetworkLinkEndpoint. func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { + _, err := n.enqueuePacketBuffer(r, gso, protocol, pkt) + return err +} +func (n *NIC) enqueuePacketBuffer(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) (int, *tcpip.Error) { // As per relevant RFCs, we should queue packets while we wait for link // resolution to complete. // @@ -320,16 +324,7 @@ func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumb // be limited to some small value. When a queue overflows, the new arrival // SHOULD replace the oldest entry. Once address resolution completes, the // node transmits any queued packets. - if ch, err := r.Resolve(nil); err != nil { - if err == tcpip.ErrWouldBlock { - r.Acquire() - n.linkResQueue.enqueue(ch, r, protocol, pkt) - return nil - } - return err - } - - return n.writePacket(r.Fields(), gso, protocol, pkt) + return n.linkResQueue.enqueue(r, gso, protocol, pkt) } // WritePacketToRemote implements NetworkInterface. @@ -344,6 +339,9 @@ func (n *NIC) writePacket(r RouteInfo, gso *GSO, protocol tcpip.NetworkProtocolN // WritePacket takes ownership of pkt, calculate numBytes first. numBytes := pkt.Size() + pkt.EgressRoute = r + pkt.GSOOptions = gso + pkt.NetworkProtocolNumber = protocol if err := n.LinkEndpoint.WritePacket(r, gso, protocol, pkt); err != nil { return err } @@ -355,9 +353,17 @@ func (n *NIC) writePacket(r RouteInfo, gso *GSO, protocol tcpip.NetworkProtocolN // WritePackets implements NetworkLinkEndpoint. 
func (n *NIC) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - // TODO(gvisor.dev/issue/4458): Queue packets whie link address resolution - // is being peformed like WritePacket. - writtenPackets, err := n.LinkEndpoint.WritePackets(r.Fields(), gso, pkts, protocol) + return n.enqueuePacketBuffer(r, gso, protocol, &pkts) +} + +func (n *NIC) writePackets(r RouteInfo, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkts PacketBufferList) (int, *tcpip.Error) { + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + pkt.EgressRoute = r + pkt.GSOOptions = gso + pkt.NetworkProtocolNumber = protocol + } + + writtenPackets, err := n.LinkEndpoint.WritePackets(r, gso, pkts, protocol) n.stats.Tx.Packets.IncrementBy(uint64(writtenPackets)) writtenBytes := 0 for i, pb := 0, pkts.Front(); i < writtenPackets && pb != nil; i, pb = i+1, pb.Next() { @@ -555,7 +561,7 @@ func (n *NIC) removeAddress(addr tcpip.Address) *tcpip.Error { return tcpip.ErrBadLocalAddress } -func (n *NIC) getNeighborLinkAddress(addr, localAddr tcpip.Address, linkRes LinkAddressResolver, onResolve func(tcpip.LinkAddress, bool)) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { +func (n *NIC) getNeighborLinkAddress(addr, localAddr tcpip.Address, linkRes LinkAddressResolver, onResolve func(LinkResolutionResult)) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { if n.neigh != nil { entry, ch, err := n.neigh.entry(addr, localAddr, linkRes, onResolve) return entry.LinkAddr, ch, err diff --git a/pkg/tcpip/stack/pending_packets.go b/pkg/tcpip/stack/pending_packets.go index 81d8ff6e8..c4769b17e 100644 --- a/pkg/tcpip/stack/pending_packets.go +++ b/pkg/tcpip/stack/pending_packets.go @@ -28,119 +28,219 @@ const ( maxPendingPacketsPerResolution = 256 ) +// pendingPacketBuffer is a pending packet buffer. +// +// TODO(gvisor.dev/issue/5331): Drop this when we drop WritePacket and only use +// WritePackets so we can use a PacketBufferList everywhere. +type pendingPacketBuffer interface { + len() int +} + +func (*PacketBuffer) len() int { + return 1 +} + +func (p *PacketBufferList) len() int { + return p.Len() +} + type pendingPacket struct { - route *Route - proto tcpip.NetworkProtocolNumber - pkt *PacketBuffer + routeInfo RouteInfo + gso *GSO + proto tcpip.NetworkProtocolNumber + pkt pendingPacketBuffer } // packetsPendingLinkResolution is a queue of packets pending link resolution. // // Once link resolution completes successfully, the packets will be written. type packetsPendingLinkResolution struct { - sync.Mutex + nic *NIC - // The packets to send once the resolver completes. - packets map[<-chan struct{}][]pendingPacket + mu struct { + sync.Mutex - // FIFO of channels used to cancel the oldest goroutine waiting for - // link-address resolution. - cancelChans []chan struct{} -} + // The packets to send once the resolver completes. + // + // The link resolution channel is used as the key for this map. + packets map[<-chan struct{}][]pendingPacket -func (f *packetsPendingLinkResolution) init() { - f.Lock() - defer f.Unlock() - f.packets = make(map[<-chan struct{}][]pendingPacket) + // FIFO of channels used to cancel the oldest goroutine waiting for + // link-address resolution. + // + // cancelChans holds the same channels that are used as keys to packets. 
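pendingPacketBuffer above lets the pending-resolution queue hold either a single PacketBuffer or a whole PacketBufferList behind one length method, until the TODO to converge on batch writes lands. A self-contained sketch of that single-or-batch interface with stand-in types:

```go
package main

import "fmt"

// pending is a stand-in for pendingPacketBuffer: a queue entry that is either
// one packet or a whole batch, unified behind a length method.
type pending interface {
	len() int
}

type packet struct{ payload []byte }

func (*packet) len() int { return 1 }

type packetList []*packet

func (l *packetList) len() int { return len(*l) }

func main() {
	queue := []pending{
		&packet{payload: []byte("one")},
		&packetList{{payload: []byte("a")}, {payload: []byte("b")}},
	}
	total := 0
	for _, p := range queue {
		total += p.len()
	}
	fmt.Println("queued packets:", total)
}
```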
+ cancelChans []<-chan struct{} + } } -func incrementOutgoingPacketErrors(r *Route, proto tcpip.NetworkProtocolNumber) { - r.Stats().IP.OutgoingPacketErrors.Increment() +func (f *packetsPendingLinkResolution) incrementOutgoingPacketErrors(proto tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) { + n := uint64(pkt.len()) + f.nic.stack.stats.IP.OutgoingPacketErrors.IncrementBy(n) - // ok may be false if the endpoint's stats do not collect IP-related data. - if ipEndpointStats, ok := r.outgoingNIC.getNetworkEndpoint(proto).Stats().(IPNetworkEndpointStats); ok { - ipEndpointStats.IPStats().OutgoingPacketErrors.Increment() + if ipEndpointStats, ok := f.nic.getNetworkEndpoint(proto).Stats().(IPNetworkEndpointStats); ok { + ipEndpointStats.IPStats().OutgoingPacketErrors.IncrementBy(n) } } -func (f *packetsPendingLinkResolution) enqueue(ch <-chan struct{}, r *Route, proto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - f.Lock() - defer f.Unlock() +func (f *packetsPendingLinkResolution) init(nic *NIC) { + f.mu.Lock() + defer f.mu.Unlock() + f.nic = nic + f.mu.packets = make(map[<-chan struct{}][]pendingPacket) +} - packets, ok := f.packets[ch] - if len(packets) == maxPendingPacketsPerResolution { - p := packets[0] - packets[0] = pendingPacket{} - packets = packets[1:] +// dequeue any pending packets associated with ch. +// +// If success is true, packets will be written and sent to the given remote link +// address. +func (f *packetsPendingLinkResolution) dequeue(ch <-chan struct{}, linkAddr tcpip.LinkAddress, success bool) { + f.mu.Lock() + packets, ok := f.mu.packets[ch] + delete(f.mu.packets, ch) - incrementOutgoingPacketErrors(r, proto) + if ok { + for i, cancelChan := range f.mu.cancelChans { + if cancelChan == ch { + f.mu.cancelChans = append(f.mu.cancelChans[:i], f.mu.cancelChans[i+1:]...) + break + } + } + } + + f.mu.Unlock() + + if ok { + f.dequeuePackets(packets, linkAddr, success) + } +} - p.route.Release() +func (f *packetsPendingLinkResolution) writePacketBuffer(r RouteInfo, gso *GSO, proto tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) (int, *tcpip.Error) { + switch pkt := pkt.(type) { + case *PacketBuffer: + if err := f.nic.writePacket(r, gso, proto, pkt); err != nil { + return 0, err + } + return 1, nil + case *PacketBufferList: + return f.nic.writePackets(r, gso, proto, *pkt) + default: + panic(fmt.Sprintf("unrecognized pending packet buffer type = %T", pkt)) } +} - if l := len(packets); l >= maxPendingPacketsPerResolution { - panic(fmt.Sprintf("max pending packets for resolution reached; got %d packets, max = %d", l, maxPendingPacketsPerResolution)) +// enqueue a packet to be sent once link resolution completes. +// +// If the maximum number of pending resolutions is reached, the packets +// associated with the oldest link resolution will be dequeued as if they failed +// link resolution. +func (f *packetsPendingLinkResolution) enqueue(r *Route, gso *GSO, proto tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) (int, *tcpip.Error) { + f.mu.Lock() + // Make sure we attempt resolution while holding f's lock so that we avoid + // a race where link resolution completes before we enqueue the packets. + // + // A @ T1: Call ResolvedFields (get link resolution channel) + // B @ T2: Complete link resolution, dequeue pending packets + // C @ T1: Enqueue packet that already completed link resolution (which will + // never dequeue) + // + // To make sure B does not interleave with A and C, we make sure A and C are + // done while holding the lock. 
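The interleaving spelled out above is why the resolution attempt and the enqueue share one critical section. A minimal, self-contained sketch of that pattern, with all names invented for illustration; the real enqueue body continues below:

package main

import (
	"fmt"
	"sync"
)

// resolutionQueue is a toy version of packetsPendingLinkResolution: packets
// are either written immediately or parked under the same lock that the
// dequeue path must take, so a completion cannot race past the enqueue.
type resolutionQueue struct {
	mu      sync.Mutex
	pending map[<-chan struct{}][]string
}

// enqueue checks resolution state and appends under one critical section.
func (q *resolutionQueue) enqueue(pkt string, resolve func() (<-chan struct{}, bool)) {
	q.mu.Lock()
	defer q.mu.Unlock()

	ch, resolved := resolve()
	if resolved {
		fmt.Println("written immediately:", pkt)
		return
	}
	// Still holding q.mu: a dequeue triggered by resolution completing must
	// wait for the lock, so it will observe this packet.
	q.pending[ch] = append(q.pending[ch], pkt)
}

func main() {
	q := &resolutionQueue{pending: make(map[<-chan struct{}][]string)}
	ch := make(chan struct{})
	q.enqueue("pkt-1", func() (<-chan struct{}, bool) { return ch, false })
	q.enqueue("pkt-2", func() (<-chan struct{}, bool) { return nil, true })
	fmt.Println("parked packets:", q.pending[ch])
}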
+ routeInfo, ch, err := r.resolvedFields(nil) + switch err { + case nil: + // The route resolved immediately, so we don't need to wait for link + // resolution to send the packet. + f.mu.Unlock() + return f.writePacketBuffer(routeInfo, gso, proto, pkt) + case tcpip.ErrWouldBlock: + // We need to wait for link resolution to complete. + default: + f.mu.Unlock() + return 0, err } - f.packets[ch] = append(packets, pendingPacket{ - route: r, - proto: proto, - pkt: pkt, + defer f.mu.Unlock() + + packets, ok := f.mu.packets[ch] + packets = append(packets, pendingPacket{ + routeInfo: routeInfo, + gso: gso, + proto: proto, + pkt: pkt, }) - if ok { - return - } + if len(packets) > maxPendingPacketsPerResolution { + f.incrementOutgoingPacketErrors(packets[0].proto, packets[0].pkt) + packets[0] = pendingPacket{} + packets = packets[1:] - // Wait for the link-address resolution to complete. - cancel := f.newCancelChannelLocked() - go func() { - cancelled := false - select { - case <-ch: - case <-cancel: - cancelled = true + if numPackets := len(packets); numPackets != maxPendingPacketsPerResolution { + panic(fmt.Sprintf("holding more queued packets than expected; got = %d, want <= %d", numPackets, maxPendingPacketsPerResolution)) } + } - f.Lock() - packets, ok := f.packets[ch] - delete(f.packets, ch) - f.Unlock() + f.mu.packets[ch] = packets - if !ok { - panic(fmt.Sprintf("link-resolution goroutine woke up but no entry exists in the queue of packets")) - } + if ok { + return pkt.len(), nil + } - for _, p := range packets { - if cancelled || p.route.IsResolutionRequired() { - incrementOutgoingPacketErrors(r, proto) + cancelledPackets := f.newCancelChannelLocked(ch) - if linkResolvableEP, ok := p.route.outgoingNIC.getNetworkEndpoint(p.route.NetProto).(LinkResolvableNetworkEndpoint); ok { - linkResolvableEP.HandleLinkResolutionFailure(pkt) - } - } else { - p.route.outgoingNIC.writePacket(p.route.Fields(), nil /* gso */, p.proto, p.pkt) - } - p.route.Release() - } - }() + if len(cancelledPackets) != 0 { + // Dequeue the pending packets in a new goroutine to not hold up the current + // goroutine as handing link resolution failures may be a costly operation. + go f.dequeuePackets(cancelledPackets, "" /* linkAddr */, false /* success */) + } + + return pkt.len(), nil } -// newCancelChannel creates a channel that can cancel a pending forwarding -// activity. The oldest channel is closed if the number of open channels would -// exceed maxPendingResolutions. -func (f *packetsPendingLinkResolution) newCancelChannelLocked() chan struct{} { - if len(f.cancelChans) == maxPendingResolutions { - ch := f.cancelChans[0] - f.cancelChans[0] = nil - f.cancelChans = f.cancelChans[1:] - close(ch) +// newCancelChannelLocked appends the link resolution channel to a FIFO. If the +// maximum number of pending resolutions is reached, the oldest channel will be +// removed and its associated pending packets will be returned. 
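Both limits described above follow the same bounded-FIFO shape: append the newcomer and, once the bound is exceeded, evict the oldest entry and hand it back to the caller to fail explicitly. A generic sketch of that shape under invented names; the real implementation follows:

package main

import "fmt"

// appendBounded appends v to fifo and, when the bound is exceeded, returns
// the evicted oldest element so the caller can fail it explicitly.
func appendBounded(fifo []string, v string, max int) (updated []string, evicted string, didEvict bool) {
	fifo = append(fifo, v)
	if len(fifo) <= max {
		return fifo, "", false
	}
	evicted = fifo[0]
	fifo[0] = "" // drop the reference, mirroring cancelChans[0] = nil
	return fifo[1:], evicted, true
}

func main() {
	var resolutions []string
	const maxPendingResolutions = 2
	for _, ch := range []string{"resolve-A", "resolve-B", "resolve-C"} {
		var evicted string
		var ok bool
		resolutions, evicted, ok = appendBounded(resolutions, ch, maxPendingResolutions)
		if ok {
			fmt.Println("failing packets queued behind", evicted)
		}
	}
	fmt.Println("still pending:", resolutions) // [resolve-B resolve-C]
}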
+func (f *packetsPendingLinkResolution) newCancelChannelLocked(newCH <-chan struct{}) []pendingPacket { + f.mu.cancelChans = append(f.mu.cancelChans, newCH) + if len(f.mu.cancelChans) <= maxPendingResolutions { + return nil } - if l := len(f.cancelChans); l >= maxPendingResolutions { + + ch := f.mu.cancelChans[0] + f.mu.cancelChans[0] = nil + f.mu.cancelChans = f.mu.cancelChans[1:] + if l := len(f.mu.cancelChans); l > maxPendingResolutions { panic(fmt.Sprintf("max pending resolutions reached; got %d active resolutions, max = %d", l, maxPendingResolutions)) } - ch := make(chan struct{}) - f.cancelChans = append(f.cancelChans, ch) - return ch + packets, ok := f.mu.packets[ch] + if !ok { + panic("must have a packet queue for an uncancelled channel") + } + delete(f.mu.packets, ch) + + return packets +} + +func (f *packetsPendingLinkResolution) dequeuePackets(packets []pendingPacket, linkAddr tcpip.LinkAddress, success bool) { + for _, p := range packets { + if success { + p.routeInfo.RemoteLinkAddress = linkAddr + _, _ = f.writePacketBuffer(p.routeInfo, p.gso, p.proto, p.pkt) + } else { + f.incrementOutgoingPacketErrors(p.proto, p.pkt) + + if linkResolvableEP, ok := f.nic.getNetworkEndpoint(p.proto).(LinkResolvableNetworkEndpoint); ok { + switch pkt := p.pkt.(type) { + case *PacketBuffer: + linkResolvableEP.HandleLinkResolutionFailure(pkt) + case *PacketBufferList: + for pb := pkt.Front(); pb != nil; pb = pb.Next() { + linkResolvableEP.HandleLinkResolutionFailure(pb) + } + default: + panic(fmt.Sprintf("unrecognized pending packet buffer type = %T", p.pkt)) + } + } + } + } } diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 1ff7b3a37..d9a8554e2 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -86,12 +86,21 @@ type RouteInfo struct { RemoteLinkAddress tcpip.LinkAddress } -// Fields returns a RouteInfo with all of r's exported fields. This allows -// callers to store the route's fields without retaining a reference to it. +// Fields returns a RouteInfo with all of the known values for the route's +// fields. +// +// If any fields are unknown (e.g. remote link address when it is waiting for +// link address resolution), they will be unset. func (r *Route) Fields() RouteInfo { + r.mu.RLock() + defer r.mu.RUnlock() + return r.fieldsLocked() +} + +func (r *Route) fieldsLocked() RouteInfo { return RouteInfo{ routeInfo: r.routeInfo, - RemoteLinkAddress: r.RemoteLinkAddress(), + RemoteLinkAddress: r.mu.remoteLinkAddress, } } @@ -306,29 +315,45 @@ func (r *Route) ResolveWith(addr tcpip.LinkAddress) { r.mu.remoteLinkAddress = addr } -// Resolve attempts to resolve the link address if necessary. +// ResolvedFieldsResult is the result of a route resolution attempt. +type ResolvedFieldsResult struct { + RouteInfo RouteInfo + Success bool +} + +// ResolvedFields attempts to resolve the remote link address if it is not +// known. // -// Returns tcpip.ErrWouldBlock if address resolution requires blocking (e.g. -// waiting for ARP reply). If address resolution is required, a notification -// channel is also returned for the caller to block on. The channel is closed -// once address resolution is complete (successful or not). If a callback is -// provided, it will be called when address resolution is complete, regardless +// If a callback is provided, it will be called before ResolvedFields returns +// when address resolution is not required. 
If address resolution is required, +// the callback will be called once address resolution is complete, regardless // of success or failure. -func (r *Route) Resolve(afterResolve func()) (<-chan struct{}, *tcpip.Error) { - r.mu.Lock() +// +// Note, the route will not cache the remote link address when address +// resolution completes. +func (r *Route) ResolvedFields(afterResolve func(ResolvedFieldsResult)) *tcpip.Error { + _, _, err := r.resolvedFields(afterResolve) + return err +} - if !r.isResolutionRequiredRLocked() { - // Nothing to do if there is no cache (which does the resolution on cache miss) or - // link address is already known. - r.mu.Unlock() - return nil, nil +// resolvedFields is like ResolvedFields but also returns a notification channel +// when address resolution is required. This channel will become readable once +// address resolution is complete. +// +// The route's fields will also be returned, regardless of whether address +// resolution is required or not. +func (r *Route) resolvedFields(afterResolve func(ResolvedFieldsResult)) (RouteInfo, <-chan struct{}, *tcpip.Error) { + r.mu.RLock() + fields := r.fieldsLocked() + resolutionRequired := r.isResolutionRequiredRLocked() + r.mu.RUnlock() + if !resolutionRequired { + if afterResolve != nil { + afterResolve(ResolvedFieldsResult{RouteInfo: fields, Success: true}) + } + return fields, nil, nil } - // Increment the route's reference count because finishResolution retains a - // reference to the route and releases it when called. - r.acquireLocked() - r.mu.Unlock() - nextAddr := r.NextHop if nextAddr == "" { nextAddr = r.RemoteAddress @@ -341,18 +366,20 @@ func (r *Route) Resolve(afterResolve func()) (<-chan struct{}, *tcpip.Error) { linkAddressResolutionRequestLocalAddr = r.LocalAddress } - finishResolution := func(linkAddress tcpip.LinkAddress, ok bool) { - if ok { - r.ResolveWith(linkAddress) - } + afterResolveFields := fields + linkAddr, ch, err := r.outgoingNIC.getNeighborLinkAddress(nextAddr, linkAddressResolutionRequestLocalAddr, r.linkRes, func(r LinkResolutionResult) { if afterResolve != nil { - afterResolve() + if r.Success { + afterResolveFields.RemoteLinkAddress = r.LinkAddress + } + + afterResolve(ResolvedFieldsResult{RouteInfo: afterResolveFields, Success: r.Success}) } - r.Release() + }) + if err == nil { + fields.RemoteLinkAddress = linkAddr } - - _, ch, err := r.outgoingNIC.getNeighborLinkAddress(nextAddr, linkAddressResolutionRequestLocalAddr, r.linkRes, finishResolution) - return ch, err + return fields, ch, err } // local returns true if the route is a local route. @@ -371,11 +398,7 @@ func (r *Route) IsResolutionRequired() bool { } func (r *Route) isResolutionRequiredRLocked() bool { - if !r.isValidForOutgoingRLocked() || r.mu.remoteLinkAddress != "" || r.local() { - return false - } - - return r.linkRes != nil + return len(r.mu.remoteLinkAddress) == 0 && r.linkRes != nil && r.isValidForOutgoingRLocked() && !r.local() } func (r *Route) isValidForOutgoing() bool { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 4685fa4cf..e9c5db4c3 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -444,7 +444,7 @@ type Stack struct { // sendBufferSize holds the min/default/max send buffer sizes for // endpoints other than TCP. - sendBufferSize SendBufferSizeOption + sendBufferSize tcpip.SendBufferSizeOption // receiveBufferSize holds the min/default/max receive buffer sizes for // endpoints other than TCP. 
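Resolve's replacement is callback driven. A sketch of how a caller might use the exported ResolvedFields, assuming a *stack.Route obtained elsewhere; the helper name and log messages are illustrative, and the ErrWouldBlock handling mirrors the convention used by enqueue above:

package example

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// logRemoteLinkAddr asks the route for its fields, waiting on the callback
// when link resolution is still in progress.
func logRemoteLinkAddr(r *stack.Route) {
	err := r.ResolvedFields(func(res stack.ResolvedFieldsResult) {
		if !res.Success {
			log.Printf("link resolution failed")
			return
		}
		log.Printf("remote link address: %s", res.RouteInfo.RemoteLinkAddress)
	})
	// ErrWouldBlock means the callback will run later, once resolution
	// completes; any other error means resolution could not be started.
	if err != nil && err != tcpip.ErrWouldBlock {
		log.Printf("ResolvedFields: %s", err)
	}
}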
@@ -646,7 +646,7 @@ func New(opts Options) *Stack { uniqueIDGenerator: opts.UniqueID, nudDisp: opts.NUDDisp, randomGenerator: mathrand.New(randSrc), - sendBufferSize: SendBufferSizeOption{ + sendBufferSize: tcpip.SendBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, Max: DefaultMaxBufferSize, @@ -1196,19 +1196,19 @@ func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress { // GetMainNICAddress returns the first non-deprecated primary address and prefix // for the given NIC and protocol. If no non-deprecated primary address exists, -// a deprecated primary address and prefix will be returned. Returns an error if +// a deprecated primary address and prefix will be returned. Returns false if // the NIC doesn't exist and an empty value if the NIC doesn't have a primary // address for the given protocol. -func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) { +func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, bool) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { - return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID + return tcpip.AddressWithPrefix{}, false } - return nic.primaryAddress(protocol), nil + return nic.primaryAddress(protocol), true } func (s *Stack) getAddressEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) AssignableAddressEndpoint { @@ -1527,9 +1527,13 @@ func (s *Stack) AddLinkAddress(nicID tcpip.NICID, neighbor tcpip.Address, linkAd return nil } -// GetLinkAddress finds the link address corresponding to a neighbor's address. -// -// Returns a link address for the remote address, if readily available. +// LinkResolutionResult is the result of a link address resolution attempt. +type LinkResolutionResult struct { + LinkAddress tcpip.LinkAddress + Success bool +} + +// GetLinkAddress finds the link address corresponding to a network address. // // Returns ErrNotSupported if the stack is not configured with a link address // resolver for the specified network protocol. @@ -1538,30 +1542,33 @@ func (s *Stack) AddLinkAddress(nicID tcpip.NICID, neighbor tcpip.Address, linkAd // with a notification channel for the caller to block on. Triggers address // resolution asynchronously. // -// If onResolve is provided, it will be called either immediately, if -// resolution is not required, or when address resolution is complete, with -// the resolved link address and whether resolution succeeded. After any -// callbacks have been called, the returned notification channel is closed. +// onResolve will be called either immediately, if resolution is not required, +// or when address resolution is complete, with the resolved link address and +// whether resolution succeeded. // // If specified, the local address must be an address local to the interface // the neighbor cache belongs to. The local address is the source address of // a packet prompting NUD/link address resolution. -// -// TODO(gvisor.dev/issue/5151): Don't return the link address. 
-func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, onResolve func(tcpip.LinkAddress, bool)) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { +func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, onResolve func(LinkResolutionResult)) *tcpip.Error { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { - return "", nil, tcpip.ErrUnknownNICID + return tcpip.ErrUnknownNICID } linkRes, ok := s.linkAddrResolvers[protocol] if !ok { - return "", nil, tcpip.ErrNotSupported + return tcpip.ErrNotSupported + } + + if linkAddr, ok := linkRes.ResolveStaticAddress(addr); ok { + onResolve(LinkResolutionResult{LinkAddress: linkAddr, Success: true}) + return nil } - return nic.getNeighborLinkAddress(addr, localAddr, linkRes, onResolve) + _, _, err := nic.getNeighborLinkAddress(addr, localAddr, linkRes, onResolve) + return err } // Neighbors returns all IP to MAC address associations. @@ -1622,25 +1629,25 @@ func (s *Stack) ClearNeighbors(nicID tcpip.NICID) *tcpip.Error { // transport dispatcher. Received packets that match the provided id will be // delivered to the given endpoint; specifying a nic is optional, but // nic-specific IDs have precedence over global ones. -func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { +func (s *Stack) RegisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { return s.demux.registerEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // CheckRegisterTransportEndpoint checks if an endpoint can be registered with // the stack transport dispatcher. -func (s *Stack) CheckRegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { +func (s *Stack) CheckRegisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { return s.demux.checkEndpoint(netProtos, protocol, id, flags, bindToDevice) } // UnregisterTransportEndpoint removes the endpoint with the given id from the // stack transport dispatcher. -func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { +func (s *Stack) UnregisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // StartTransportEndpointCleanup removes the endpoint with the given id from // the stack transport dispatcher. It also transitions it to the cleanup stage. 
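With this change GetLinkAddress reports results only through the callback. A sketch of resolving a neighbor's link address, assuming a stack that already has NIC 1 configured with ARP and IPv4; the helper name and log messages are illustrative:

package example

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// resolveNeighbor kicks off (or immediately satisfies) link address
// resolution for addr on NIC 1 and reports the outcome via the callback.
func resolveNeighbor(s *stack.Stack, addr tcpip.Address) {
	err := s.GetLinkAddress(1, addr, "", ipv4.ProtocolNumber, func(res stack.LinkResolutionResult) {
		if res.Success {
			log.Printf("%s is at %s", addr, res.LinkAddress)
		} else {
			log.Printf("failed to resolve %s", addr)
		}
	})
	if err != nil {
		// e.g. tcpip.ErrUnknownNICID or tcpip.ErrNotSupported.
		log.Printf("GetLinkAddress: %s", err)
	}
}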
-func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { +func (s *Stack) StartTransportEndpointCleanup(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { s.cleanupEndpointsMu.Lock() s.cleanupEndpoints[ep] = struct{}{} s.cleanupEndpointsMu.Unlock() @@ -1665,13 +1672,13 @@ func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, tran // RegisterRawTransportEndpoint registers the given endpoint with the stack // transport dispatcher. Received packets that match the provided transport // protocol will be delivered to the given endpoint. -func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { +func (s *Stack) RegisterRawTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { return s.demux.registerRawEndpoint(netProto, transProto, ep) } // UnregisterRawTransportEndpoint removes the endpoint for the transport // protocol from the stack transport dispatcher. -func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { +func (s *Stack) UnregisterRawTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { s.demux.unregisterRawEndpoint(netProto, transProto, ep) } diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go index 0b093e6c5..92e70f94e 100644 --- a/pkg/tcpip/stack/stack_options.go +++ b/pkg/tcpip/stack/stack_options.go @@ -14,7 +14,9 @@ package stack -import "gvisor.dev/gvisor/pkg/tcpip" +import ( + "gvisor.dev/gvisor/pkg/tcpip" +) const ( // MinBufferSize is the smallest size of a receive or send buffer. @@ -29,14 +31,6 @@ const ( DefaultMaxBufferSize = 4 << 20 // 4 MiB ) -// SendBufferSizeOption is used by stack.(Stack*).Option/SetOption to -// get/set the default, min and max send buffer sizes. -type SendBufferSizeOption struct { - Min int - Default int - Max int -} - // ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to // get/set the default, min and max receive buffer sizes. type ReceiveBufferSizeOption struct { @@ -48,7 +42,7 @@ type ReceiveBufferSizeOption struct { // SetOption allows setting stack wide options. func (s *Stack) SetOption(option interface{}) *tcpip.Error { switch v := option.(type) { - case SendBufferSizeOption: + case tcpip.SendBufferSizeOption: // Make sure we don't allow lowering the buffer below minimum // required for stack to work. if v.Min < MinBufferSize { @@ -88,7 +82,7 @@ func (s *Stack) SetOption(option interface{}) *tcpip.Error { // Option allows retrieving stack wide options. 
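With SendBufferSizeOption now living in the tcpip package, stack-wide send buffer limits are set and read back roughly as follows; the helper name and the sizes are illustrative:

package example

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// configureSendBuffers sets the stack-wide limits and reads them back.
func configureSendBuffers(s *stack.Stack) {
	opt := tcpip.SendBufferSizeOption{
		Min:     4 << 10,  // 4 KiB
		Default: 32 << 10, // 32 KiB
		Max:     4 << 20,  // 4 MiB
	}
	if err := s.SetOption(opt); err != nil {
		log.Fatalf("SetOption(%#v): %s", opt, err)
	}

	var got tcpip.SendBufferSizeOption
	if err := s.Option(&got); err != nil {
		log.Fatalf("Option(): %s", err)
	}
	log.Printf("send buffer limits: %+v", got)
}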
func (s *Stack) Option(option interface{}) *tcpip.Error { switch v := option.(type) { - case *SendBufferSizeOption: + case *tcpip.SendBufferSizeOption: s.mu.RLock() *v = s.sendBufferSize s.mu.RUnlock() diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b9ef455e5..0f02f1d53 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -60,6 +60,15 @@ const ( protocolNumberOffset = 2 ) +func checkGetMainNICAddress(s *stack.Stack, nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber, want tcpip.AddressWithPrefix) error { + if addr, ok := s.GetMainNICAddress(nicID, proto); !ok { + return fmt.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, false), want = (_, true)", nicID, proto) + } else if addr != want { + return fmt.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, true), want = (%s, true)", nicID, proto, addr, want) + } + return nil +} + // fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and // received packets; the counts of all endpoints are aggregated in the protocol // descriptor. @@ -1873,20 +1882,20 @@ func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) { // Check that GetMainNICAddress returns an address if at least // one primary address was added. In that case make sure the // address/prefixLen matches what we added. - gotAddr, err := s.GetMainNICAddress(1, fakeNetNumber) - if err != nil { - t.Fatal("GetMainNICAddress failed:", err) + gotAddr, ok := s.GetMainNICAddress(1, fakeNetNumber) + if !ok { + t.Fatalf("got GetMainNICAddress(1, %d) = (_, false), want = (_, true)", fakeNetNumber) } if len(primaryAddrAdded) == 0 { // No primary addresses present. if wantAddr := (tcpip.AddressWithPrefix{}); gotAddr != wantAddr { - t.Fatalf("GetMainNICAddress: got addr = %s, want = %s", gotAddr, wantAddr) + t.Fatalf("got GetMainNICAddress(1, %d) = (%s, true), want = (%s, true)", fakeNetNumber, gotAddr, wantAddr) } } else { // At least one primary address was added, verify the returned // address is in the list of primary addresses we added. if _, ok := primaryAddrAdded[gotAddr]; !ok { - t.Fatalf("GetMainNICAddress: got = %s, want any in {%v}", gotAddr, primaryAddrAdded) + t.Fatalf("got GetMainNICAddress(1, %d) = (%s, true), want = (%s, true)", fakeNetNumber, gotAddr, primaryAddrAdded) } } }) @@ -1927,12 +1936,8 @@ func TestGetMainNICAddressAddRemove(t *testing.T) { } // Check that we get the right initial address and prefix length. - gotAddr, err := s.GetMainNICAddress(1, fakeNetNumber) - if err != nil { - t.Fatal("GetMainNICAddress failed:", err) - } - if wantAddr := protocolAddress.AddressWithPrefix; gotAddr != wantAddr { - t.Fatalf("got s.GetMainNICAddress(...) = %s, want = %s", gotAddr, wantAddr) + if err := checkGetMainNICAddress(s, 1, fakeNetNumber, protocolAddress.AddressWithPrefix); err != nil { + t.Fatal(err) } if err := s.RemoveAddress(1, protocolAddress.AddressWithPrefix.Address); err != nil { @@ -1940,12 +1945,8 @@ func TestGetMainNICAddressAddRemove(t *testing.T) { } // Check that we get no address after removal. - gotAddr, err = s.GetMainNICAddress(1, fakeNetNumber) - if err != nil { - t.Fatal("GetMainNICAddress failed:", err) - } - if wantAddr := (tcpip.AddressWithPrefix{}); gotAddr != wantAddr { - t.Fatalf("got GetMainNICAddress(...) 
= %s, want = %s", gotAddr, wantAddr) + if err := checkGetMainNICAddress(s, 1, fakeNetNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } }) } @@ -2486,12 +2487,12 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { } } - gotMainAddr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err) + // Check that we get no address after removal. + if err := checkGetMainNICAddress(s, 1, fakeNetNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } - if gotMainAddr != expectedMainAddr { - t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", gotMainAddr, expectedMainAddr) + if err := checkGetMainNICAddress(s, 1, header.IPv6ProtocolNumber, expectedMainAddr); err != nil { + t.Fatal(err) } }) } @@ -2537,12 +2538,8 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) { t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err) } - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Errorf("got stack.GetMainNICAddress(%d, _) = %s, want = %s", nicID, addr, want) + if err := checkGetMainNICAddress(s, 1, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } }) } @@ -2573,12 +2570,8 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { // Address should not be considered bound to the // NIC yet (DAD ongoing). - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } linkLocalAddr := header.LinkLocalAddr(linkAddr1) @@ -2596,12 +2589,8 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); err != nil { + t.Fatal(err) } } @@ -2633,17 +2622,17 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", pi); err != nil { t.Fatal("AddAddressWithOptions failed:", err) } - addr, err := s.GetMainNICAddress(1, fakeNetNumber) - if err != nil { - t.Fatal("s.GetMainNICAddress failed:", err) + addr, ok := s.GetMainNICAddress(1, fakeNetNumber) + if !ok { + t.Fatalf("GetMainNICAddress(1, %d) = (_, false), want = (_, true)", fakeNetNumber) } if pi == stack.NeverPrimaryEndpoint { if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want) + t.Fatalf("got GetMainNICAddress(1, 
%d) = (%s, true), want = (%s, true)", fakeNetNumber, addr, want) } } else if addr.Address != "\x01" { - t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address) + t.Fatalf("got GetMainNICAddress(1, %d) = (%s, true), want = (1, true)", fakeNetNumber, addr.Address) } { @@ -2722,18 +2711,17 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { if err := s.RemoveAddress(1, "\x03"); err != nil { t.Fatalf("RemoveAddress failed: %v", err) } - addr, err = s.GetMainNICAddress(1, fakeNetNumber) - if err != nil { - t.Fatalf("s.GetMainNICAddress failed: %v", err) + addr, ok = s.GetMainNICAddress(1, fakeNetNumber) + if !ok { + t.Fatalf("got GetMainNICAddress(1, %d) = (_, false), want = (_, true)", fakeNetNumber) } if ps == stack.NeverPrimaryEndpoint { if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want) - + t.Fatalf("got GetMainNICAddress(1, %d) = (%s, true), want = (%s, true)", fakeNetNumber, addr, want) } } else { if addr.Address != "\x01" { - t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address) + t.Fatalf("got GetMainNICAddress(1, %d) = (%s, true), want = (1, true)", fakeNetNumber, addr.Address) } } }) @@ -3259,12 +3247,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) { } // Address should be tentative so it should not be a main address. - got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); got != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Enabling the NIC should start DAD for the address. @@ -3276,12 +3260,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) { } // Address should not be considered bound to the NIC yet (DAD ongoing). - got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); got != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, tcpip.AddressWithPrefix{}); err != nil { + t.Fatal(err) } // Wait for DAD to resolve. @@ -3296,12 +3276,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) { if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr) } - got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if got != addr.AddressWithPrefix { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addr.AddressWithPrefix); err != nil { + t.Fatal(err) } // Enabling the NIC again should be a no-op. 
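Outside of tests, the new (address, ok) return reads naturally with the comma-ok idiom. A small self-contained sketch, with an arbitrary NIC ID and address:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func main() {
	s := stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
	})
	if err := s.CreateNIC(1, channel.New(0, 1500, "")); err != nil {
		panic(fmt.Sprintf("CreateNIC: %s", err))
	}
	if err := s.AddAddress(1, ipv4.ProtocolNumber, "\x0a\x00\x00\x01"); err != nil {
		panic(fmt.Sprintf("AddAddress: %s", err))
	}

	// ok is false only when the NIC does not exist; a NIC without a primary
	// address reports the zero AddressWithPrefix with ok set to true.
	if addr, ok := s.GetMainNICAddress(1, ipv4.ProtocolNumber); ok {
		fmt.Printf("main address: %s\n", addr)
	}
}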
@@ -3311,12 +3287,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) { if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr) } - got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if got != addr.AddressWithPrefix { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix) + if err := checkGetMainNICAddress(s, nicID, header.IPv6ProtocolNumber, addr.AddressWithPrefix); err != nil { + t.Fatal(err) } } @@ -3364,21 +3336,21 @@ func TestStackSendBufferSizeOption(t *testing.T) { const sMin = stack.MinBufferSize testCases := []struct { name string - ss stack.SendBufferSizeOption + ss tcpip.SendBufferSizeOption err *tcpip.Error }{ // Invalid configurations. - {"min_below_zero", stack.SendBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, - {"min_zero", stack.SendBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, - {"default_below_min", stack.SendBufferSizeOption{Min: 0, Default: sMin - 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, - {"default_above_max", stack.SendBufferSizeOption{Min: 0, Default: sMin + 1, Max: sMin}, tcpip.ErrInvalidOptionValue}, - {"max_below_min", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, + {"min_below_zero", tcpip.SendBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"min_zero", tcpip.SendBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"default_below_min", tcpip.SendBufferSizeOption{Min: 0, Default: sMin - 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, + {"default_above_max", tcpip.SendBufferSizeOption{Min: 0, Default: sMin + 1, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"max_below_min", tcpip.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, // Valid Configurations - {"in_ascending_order", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil}, - {"all_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil}, - {"min_default_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil}, - {"default_max_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil}, + {"in_ascending_order", tcpip.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil}, + {"all_equal", tcpip.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil}, + {"min_default_equal", tcpip.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil}, + {"default_max_equal", tcpip.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil}, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { @@ -3387,7 +3359,7 @@ func TestStackSendBufferSizeOption(t *testing.T) { if err := s.SetOption(tc.ss); err != tc.err { t.Fatalf("s.SetOption(%+v) = %v, want: %v", tc.ss, err, tc.err) } - var ss stack.SendBufferSizeOption + var ss tcpip.SendBufferSizeOption if tc.err == nil { if err := s.Option(&ss); err != nil { t.Fatalf("s.Option(%+v) = %v, want: nil", ss, err) @@ -3790,20 +3762,16 @@ func TestGetMainNICAddressWhenNICDisabled(t *testing.T) { } // Check that we get the right initial address and 
prefix length. - if gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber); err != nil { - t.Fatalf("GetMainNICAddress(%d, %d): %s", nicID, fakeNetNumber, err) - } else if gotAddr != protocolAddress.AddressWithPrefix { - t.Fatalf("got GetMainNICAddress(%d, %d) = %s, want = %s", nicID, fakeNetNumber, gotAddr, protocolAddress.AddressWithPrefix) + if err := checkGetMainNICAddress(s, nicID, fakeNetNumber, protocolAddress.AddressWithPrefix); err != nil { + t.Fatal(err) } // Should still get the address when the NIC is diabled. if err := s.DisableNIC(nicID); err != nil { t.Fatalf("DisableNIC(%d): %s", nicID, err) } - if gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber); err != nil { - t.Fatalf("GetMainNICAddress(%d, %d): %s", nicID, fakeNetNumber, err) - } else if gotAddr != protocolAddress.AddressWithPrefix { - t.Fatalf("got GetMainNICAddress(%d, %d) = %s, want = %s", nicID, fakeNetNumber, gotAddr, protocolAddress.AddressWithPrefix) + if err := checkGetMainNICAddress(s, nicID, fakeNetNumber, protocolAddress.AddressWithPrefix); err != nil { + t.Fatal(err) } } @@ -4384,10 +4352,58 @@ func TestGetLinkAddressErrors(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if addr, _, err := s.GetLinkAddress(unknownNICID, "", "", ipv4.ProtocolNumber, nil); err != tcpip.ErrUnknownNICID { - t.Errorf("got s.GetLinkAddress(%d, '', '', %d, nil) = (%s, _, %s), want = (_, _, %s)", unknownNICID, ipv4.ProtocolNumber, addr, err, tcpip.ErrUnknownNICID) + if err := s.GetLinkAddress(unknownNICID, "", "", ipv4.ProtocolNumber, nil); err != tcpip.ErrUnknownNICID { + t.Errorf("got s.GetLinkAddress(%d, '', '', %d, nil) = %s, want = %s", unknownNICID, ipv4.ProtocolNumber, err, tcpip.ErrUnknownNICID) + } + if err := s.GetLinkAddress(nicID, "", "", ipv4.ProtocolNumber, nil); err != tcpip.ErrNotSupported { + t.Errorf("got s.GetLinkAddress(%d, '', '', %d, nil) = %s, want = %s", unknownNICID, ipv4.ProtocolNumber, err, tcpip.ErrNotSupported) + } +} + +func TestStaticGetLinkAddress(t *testing.T) { + const ( + nicID = 1 + ) + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, + }) + if err := s.CreateNIC(nicID, channel.New(0, 0, "")); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + + tests := []struct { + name string + proto tcpip.NetworkProtocolNumber + addr tcpip.Address + expectedLinkAddr tcpip.LinkAddress + }{ + { + name: "IPv4", + proto: ipv4.ProtocolNumber, + addr: header.IPv4Broadcast, + expectedLinkAddr: header.EthernetBroadcastAddress, + }, + { + name: "IPv6", + proto: ipv6.ProtocolNumber, + addr: header.IPv6AllNodesMulticastAddress, + expectedLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllNodesMulticastAddress), + }, } - if addr, _, err := s.GetLinkAddress(nicID, "", "", ipv4.ProtocolNumber, nil); err != tcpip.ErrNotSupported { - t.Errorf("got s.GetLinkAddress(%d, '', '', %d, nil) = (%s, _, %s), want = (_, _, %s)", unknownNICID, ipv4.ProtocolNumber, addr, err, tcpip.ErrNotSupported) + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ch := make(chan stack.LinkResolutionResult, 1) + if err := s.GetLinkAddress(nicID, test.addr, "", test.proto, func(r stack.LinkResolutionResult) { + ch <- r + }); err != nil { + t.Fatalf("s.GetLinkAddress(%d, %s, '', %d, _): %s", nicID, test.addr, test.proto, err) + } + + if diff := cmp.Diff(stack.LinkResolutionResult{LinkAddress: test.expectedLinkAddr, Success: true}, <-ch); diff != "" { + t.Fatalf("link resolution 
result mismatch (-want +got):\n%s", diff) + } + }) } } diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go index 57e1f8354..de4b5fbdc 100644 --- a/pkg/tcpip/stack/transport_demuxer_test.go +++ b/pkg/tcpip/stack/transport_demuxer_test.go @@ -194,7 +194,7 @@ func TestTransportDemuxerRegister(t *testing.T) { if !ok { t.Fatalf("%T does not implement stack.TransportEndpoint", ep) } - if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, ports.Flags{}, 0), test.want; got != want { + if got, want := s.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, ports.Flags{}, 0), test.want; got != want { t.Fatalf("s.RegisterTransportEndpoint(...) = %s, want %s", got, want) } }) diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 9d39533a1..c49427c4c 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -15,6 +15,7 @@ package stack_test import ( + "bytes" "io" "testing" @@ -67,9 +68,9 @@ func (f *fakeTransportEndpoint) SocketOptions() *tcpip.SocketOptions { return &f.ops } -func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint { - ep := &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} - ep.ops.InitHandler(ep) +func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, s *stack.Stack) tcpip.Endpoint { + ep := &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: s.UniqueID()} + ep.ops.InitHandler(ep, s, tcpip.GetStackSendBufferLimits) return ep } @@ -95,10 +96,11 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions return 0, tcpip.ErrNoRoute } - v, err := p.FullPayload() - if err != nil { - return 0, err + v := make([]byte, p.Len()) + if _, err := io.ReadFull(p, v); err != nil { + return 0, tcpip.ErrBadBuffer } + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(f.route.MaxHeaderLength()) + fakeTransHeaderLen, Data: buffer.View(v).ToVectorisedView(), @@ -147,7 +149,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Try to register so that we can start receiving packets. 
f.ID.RemoteAddress = addr.Addr - err = f.proto.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */) + err = f.proto.stack.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */) if err != nil { r.Release() return err @@ -188,7 +190,6 @@ func (f *fakeTransportEndpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *wai func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error { if err := f.proto.stack.RegisterTransportEndpoint( - a.NIC, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, stack.TransportEndpointID{LocalAddress: a.Addr}, @@ -232,7 +233,7 @@ func (f *fakeTransportEndpoint) HandlePacket(id stack.TransportEndpointID, pkt * peerAddr: route.RemoteAddress, route: route, } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, f.proto.stack, tcpip.GetStackSendBufferLimits) f.acceptQueue = append(f.acceptQueue, ep) } @@ -280,7 +281,7 @@ func (*fakeTransportProtocol) Number() tcpip.TransportProtocolNumber { } func (f *fakeTransportProtocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return newFakeTransportEndpoint(f, netProto, f.stack.UniqueID()), nil + return newFakeTransportEndpoint(f, netProto, f.stack), nil } func (*fakeTransportProtocol) NewRawEndpoint(tcpip.NetworkProtocolNumber, *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { @@ -520,8 +521,10 @@ func TestTransportSend(t *testing.T) { } // Create buffer that will hold the payload. - view := buffer.NewView(30) - if _, err := ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + b := make([]byte, 30) + var r bytes.Reader + r.Reset(b) + if _, err := ep.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("write failed: %v", err) } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 4f59e4ff7..812ee36ed 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -29,6 +29,7 @@ package tcpip import ( + "bytes" "errors" "fmt" "io" @@ -194,7 +195,7 @@ type ErrSaveRejection struct { } // Error returns a sensible description of the save rejection error. -func (e ErrSaveRejection) Error() string { +func (e *ErrSaveRejection) Error() string { return "save rejected due to unsupported networking state: " + e.Err.Error() } @@ -471,30 +472,15 @@ type FullAddress struct { // This interface allows the endpoint to request the amount of data it needs // based on internal buffers without exposing them. type Payloader interface { - // FullPayload returns all available bytes. - FullPayload() ([]byte, *Error) + io.Reader - // Payload returns a slice containing at most size bytes. - Payload(size int) ([]byte, *Error) + // Len returns the number of bytes of the unread portion of the + // Reader. + Len() int } -// SlicePayload implements Payloader for slices. -// -// This is typically used for tests. -type SlicePayload []byte - -// FullPayload implements Payloader.FullPayload. -func (s SlicePayload) FullPayload() ([]byte, *Error) { - return s, nil -} - -// Payload implements Payloader.Payload. -func (s SlicePayload) Payload(size int) ([]byte, *Error) { - if size > len(s) { - size = len(s) - } - return s[:size], nil -} +var _ Payloader = (*bytes.Buffer)(nil) +var _ Payloader = (*bytes.Reader)(nil) var _ io.Writer = (*SliceWriter)(nil) @@ -840,10 +826,6 @@ const ( // number of unread bytes in the input buffer should be returned. 
ReceiveQueueSizeOption - // SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to - // specify the send buffer size option. - SendBufferSizeOption - // ReceiveBufferSizeOption is used by SetSockOptInt/GetSockOptInt to // specify the receive buffer size option. ReceiveBufferSizeOption @@ -1248,6 +1230,31 @@ type IPPacketInfo struct { DestinationAddr Address } +// SendBufferSizeOption is used by stack.(Stack*).Option/SetOption to +// get/set the default, min and max send buffer sizes. +type SendBufferSizeOption struct { + // Min is the minimum size for send buffer. + Min int + + // Default is the default size for send buffer. + Default int + + // Max is the maximum size for send buffer. + Max int +} + +// GetSendBufferLimits is used to get the send buffer size limits. +type GetSendBufferLimits func(StackHandler) SendBufferSizeOption + +// GetStackSendBufferLimits is used to get default, min and max send buffer size. +func GetStackSendBufferLimits(so StackHandler) SendBufferSizeOption { + var ss SendBufferSizeOption + if err := so.Option(&ss); err != nil { + panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err)) + } + return ss +} + // Route is a row in the routing table. It specifies through which NIC (and // gateway) sets of packets should be routed. A row is considered viable if the // masked target address matches the destination address in the row. @@ -1681,6 +1688,8 @@ type IPStats struct { // ARPStats collects ARP-specific stats. type ARPStats struct { + // LINT.IfChange(ARPStats) + // PacketsReceived is the number of ARP packets received from the link layer. PacketsReceived *StatCounter @@ -1708,10 +1717,6 @@ type ARPStats struct { // ARP request with a bad local address. OutgoingRequestBadLocalAddressErrors *StatCounter - // OutgoingRequestNetworkUnreachableErrors is the number of failures to send - // an ARP request with a network unreachable error. - OutgoingRequestNetworkUnreachableErrors *StatCounter - // OutgoingRequestsDropped is the number of ARP requests which failed to write // to a link-layer endpoint. OutgoingRequestsDropped *StatCounter @@ -1730,6 +1735,8 @@ type ARPStats struct { // OutgoingRepliesSent is the number of ARP replies successfully written to a // link-layer endpoint. OutgoingRepliesSent *StatCounter + + // LINT.ThenChange(network/arp/stats.go:multiCounterARPStats) } // TCPStats collects TCP-specific stats. 
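Two of the tcpip changes above show up directly in caller code: Payloader is now an io.Reader with Len, so a bytes.Reader replaces tcpip.SlicePayload, and stack-wide send buffer limits are read through GetStackSendBufferLimits. A hedged sketch of both; the helper names are illustrative, and passing *stack.Stack as the StackHandler follows the InitHandler call sites in this change:

package example

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// writePayload writes data to an endpoint using a bytes.Reader, which
// satisfies the new io.Reader-based Payloader interface.
func writePayload(ep tcpip.Endpoint, data []byte) (int64, *tcpip.Error) {
	var r bytes.Reader
	r.Reset(data)
	return ep.Write(&r, tcpip.WriteOptions{})
}

// printSendBufferLimits reads the stack-wide limits through the helper that
// endpoints now receive via SocketOptions.InitHandler.
func printSendBufferLimits(s *stack.Stack) {
	limits := tcpip.GetStackSendBufferLimits(s)
	fmt.Printf("send buffer: min=%d default=%d max=%d\n", limits.Min, limits.Default, limits.Max)
}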
diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD index 1742a178d..218b218e7 100644 --- a/pkg/tcpip/tests/integration/BUILD +++ b/pkg/tcpip/tests/integration/BUILD @@ -7,6 +7,7 @@ go_test( size = "small", srcs = [ "forward_test.go", + "iptables_test.go", "link_resolution_test.go", "loopback_test.go", "multicast_broadcast_test.go", diff --git a/pkg/tcpip/tests/integration/forward_test.go b/pkg/tcpip/tests/integration/forward_test.go index ac9670f9a..aedf1845e 100644 --- a/pkg/tcpip/tests/integration/forward_test.go +++ b/pkg/tcpip/tests/integration/forward_test.go @@ -436,9 +436,10 @@ func TestForwarding(t *testing.T) { write := func(ep tcpip.Endpoint, data []byte) { t.Helper() - dataPayload := tcpip.SlicePayload(data) + var r bytes.Reader + r.Reset(data) var wOpts tcpip.WriteOptions - n, err := ep.Write(dataPayload, wOpts) + n, err := ep.Write(&r, wOpts) if err != nil { t.Fatalf("ep.Write(_, %#v): %s", wOpts, err) } @@ -486,7 +487,7 @@ func TestForwarding(t *testing.T) { read(serverCH, serverEP, data, clientAddr) - data = tcpip.SlicePayload([]byte{5, 6, 7, 8, 9, 10, 11, 12}) + data = []byte{5, 6, 7, 8, 9, 10, 11, 12} write(serverEP, data) read(epsAndAddrs.clientReadableCH, epsAndAddrs.clientEP, data, serverAddr) }) diff --git a/pkg/tcpip/tests/integration/iptables_test.go b/pkg/tcpip/tests/integration/iptables_test.go new file mode 100644 index 000000000..21a8dd291 --- /dev/null +++ b/pkg/tcpip/tests/integration/iptables_test.go @@ -0,0 +1,336 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package integration_test + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +type inputIfNameMatcher struct { + name string +} + +var _ stack.Matcher = (*inputIfNameMatcher)(nil) + +func (*inputIfNameMatcher) Name() string { + return "inputIfNameMatcher" +} + +func (im *inputIfNameMatcher) Match(hook stack.Hook, _ *stack.PacketBuffer, inNicName, _ string) (bool, bool) { + return (hook == stack.Input && im.name != "" && im.name == inNicName), false +} + +const ( + nicID = 1 + nicName = "nic1" + anotherNicName = "nic2" + linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") + srcAddrV4 = "\x0a\x00\x00\x01" + dstAddrV4 = "\x0a\x00\x00\x02" + srcAddrV6 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + dstAddrV6 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + payloadSize = 20 +) + +func genStackV6(t *testing.T) (*stack.Stack, *channel.Endpoint) { + t.Helper() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, + }) + e := channel.New(0, header.IPv6MinimumMTU, linkAddr) + nicOpts := stack.NICOptions{Name: nicName} + if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { + t.Fatalf("CreateNICWithOptions(%d, _, %#v) = %s", nicID, nicOpts, err) + } + if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, dstAddrV6); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, dstAddrV6, err) + } + return s, e +} + +func genStackV4(t *testing.T) (*stack.Stack, *channel.Endpoint) { + t.Helper() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + }) + e := channel.New(0, header.IPv4MinimumMTU, linkAddr) + nicOpts := stack.NICOptions{Name: nicName} + if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { + t.Fatalf("CreateNICWithOptions(%d, _, %#v) = %s", nicID, nicOpts, err) + } + if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, dstAddrV4); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, dstAddrV4, err) + } + return s, e +} + +func genPacketV6() *stack.PacketBuffer { + pktSize := header.IPv6MinimumSize + payloadSize + hdr := buffer.NewPrependable(pktSize) + ip := header.IPv6(hdr.Prepend(pktSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: payloadSize, + TransportProtocol: 99, + HopLimit: 255, + SrcAddr: srcAddrV6, + DstAddr: dstAddrV6, + }) + vv := hdr.View().ToVectorisedView() + return stack.NewPacketBuffer(stack.PacketBufferOptions{Data: vv}) +} + +func genPacketV4() *stack.PacketBuffer { + pktSize := header.IPv4MinimumSize + payloadSize + hdr := buffer.NewPrependable(pktSize) + ip := header.IPv4(hdr.Prepend(pktSize)) + ip.Encode(&header.IPv4Fields{ + TOS: 0, + TotalLength: uint16(pktSize), + ID: 1, + Flags: 0, + FragmentOffset: 16, + TTL: 48, + Protocol: 99, + SrcAddr: srcAddrV4, + DstAddr: dstAddrV4, + }) + ip.SetChecksum(0) + ip.SetChecksum(^ip.CalculateChecksum()) + vv := hdr.View().ToVectorisedView() + return stack.NewPacketBuffer(stack.PacketBufferOptions{Data: vv}) +} + +func TestIPTablesStatsForInput(t *testing.T) { + tests := []struct { + name string + setupStack func(*testing.T) (*stack.Stack, *channel.Endpoint) + setupFilter func(*testing.T, *stack.Stack) + genPacket func() 
*stack.PacketBuffer + proto tcpip.NetworkProtocolNumber + expectReceived int + expectInputDropped int + }{ + { + name: "IPv6 Accept", + setupStack: genStackV6, + setupFilter: func(*testing.T, *stack.Stack) { /* no filter */ }, + genPacket: genPacketV6, + proto: header.IPv6ProtocolNumber, + expectReceived: 1, + expectInputDropped: 0, + }, + { + name: "IPv4 Accept", + setupStack: genStackV4, + setupFilter: func(*testing.T, *stack.Stack) { /* no filter */ }, + genPacket: genPacketV4, + proto: header.IPv4ProtocolNumber, + expectReceived: 1, + expectInputDropped: 0, + }, + { + name: "IPv6 Drop (input interface matches)", + setupStack: genStackV6, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, true /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{InputInterface: nicName} + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&inputIfNameMatcher{nicName}} + // Make sure the packet is not dropped by the next rule. + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, true /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, true, err) + } + }, + genPacket: genPacketV6, + proto: header.IPv6ProtocolNumber, + expectReceived: 1, + expectInputDropped: 1, + }, + { + name: "IPv4 Drop (input interface matches)", + setupStack: genStackV4, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, false /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{InputInterface: nicName} + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&inputIfNameMatcher{nicName}} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, false /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, false, err) + } + }, + genPacket: genPacketV4, + proto: header.IPv4ProtocolNumber, + expectReceived: 1, + expectInputDropped: 1, + }, + { + name: "IPv6 Accept (input interface does not match)", + setupStack: genStackV6, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, true /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{InputInterface: anotherNicName} + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, true /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, true, err) + } + }, + genPacket: genPacketV6, + proto: header.IPv6ProtocolNumber, + expectReceived: 1, + expectInputDropped: 0, + }, + { + name: "IPv4 Accept (input interface does not match)", + setupStack: genStackV4, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, false /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{InputInterface: anotherNicName} + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, false 
/* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, false, err) + } + }, + genPacket: genPacketV4, + proto: header.IPv4ProtocolNumber, + expectReceived: 1, + expectInputDropped: 0, + }, + { + name: "IPv6 Drop (input interface does not match but invert is true)", + setupStack: genStackV6, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, true /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{ + InputInterface: anotherNicName, + InputInterfaceInvert: true, + } + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, true /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, true, err) + } + }, + genPacket: genPacketV6, + proto: header.IPv6ProtocolNumber, + expectReceived: 1, + expectInputDropped: 1, + }, + { + name: "IPv4 Drop (input interface does not match but invert is true)", + setupStack: genStackV4, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, false /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{ + InputInterface: anotherNicName, + InputInterfaceInvert: true, + } + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, false /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, false, err) + } + }, + genPacket: genPacketV4, + proto: header.IPv4ProtocolNumber, + expectReceived: 1, + expectInputDropped: 1, + }, + { + name: "IPv6 Accept (input interface does not match using a matcher)", + setupStack: genStackV6, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, true /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&inputIfNameMatcher{anotherNicName}} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, true /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, true, err) + } + }, + genPacket: genPacketV6, + proto: header.IPv6ProtocolNumber, + expectReceived: 1, + expectInputDropped: 0, + }, + { + name: "IPv4 Accept (input interface does not match using a matcher)", + setupStack: genStackV4, + setupFilter: func(t *testing.T, s *stack.Stack) { + t.Helper() + ipt := s.IPTables() + filter := ipt.GetTable(stack.FilterID, false /* ipv6 */) + ruleIdx := filter.BuiltinChains[stack.Input] + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&inputIfNameMatcher{anotherNicName}} + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterID, filter, false /* ipv6 */); err != nil { + t.Fatalf("ipt.RelaceTable(%d, _, %t): %s", stack.FilterID, false, err) + } + }, + genPacket: genPacketV4, + proto: header.IPv4ProtocolNumber, + expectReceived: 1, + expectInputDropped: 0, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s, e := test.setupStack(t) + test.setupFilter(t, s) + e.InjectInbound(test.proto, 
test.genPacket()) + + if got := int(s.Stats().IP.PacketsReceived.Value()); got != test.expectReceived { + t.Errorf("got PacketReceived = %d, want = %d", got, test.expectReceived) + } + if got := int(s.Stats().IP.IPTablesInputDropped.Value()); got != test.expectInputDropped { + t.Errorf("got IPTablesInputDropped = %d, want = %d", got, test.expectInputDropped) + } + }) + } +} diff --git a/pkg/tcpip/tests/integration/link_resolution_test.go b/pkg/tcpip/tests/integration/link_resolution_test.go index af32d3009..f85164c5b 100644 --- a/pkg/tcpip/tests/integration/link_resolution_test.go +++ b/pkg/tcpip/tests/integration/link_resolution_test.go @@ -23,6 +23,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/pipe" @@ -32,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) @@ -207,8 +209,10 @@ func TestPing(t *testing.T) { defer ep.Close() icmpBuf := test.icmpBuf(t) + var r bytes.Reader + r.Reset(icmpBuf) wOpts := tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: test.remoteAddr}} - if n, err := ep.Write(tcpip.SlicePayload(icmpBuf), wOpts); err != nil { + if n, err := ep.Write(&r, wOpts); err != nil { t.Fatalf("ep.Write(_, _): %s", err) } else if want := int64(len(icmpBuf)); n != want { t.Fatalf("got ep.Write(_, _) = (%d, _), want = (%d, _)", n, want) @@ -358,9 +362,11 @@ func TestTCPLinkResolutionFailure(t *testing.T) { // Wait for an error due to link resolution failing, or the endpoint to be // writable. 
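Editor's note: the hunks above and throughout the rest of this diff migrate `Endpoint.Write` callers from `tcpip.SlicePayload` to the reader-based `tcpip.Payloader`, wrapping the payload in a `bytes.Reader`. The sketch below shows the new calling convention; the helper name `writeAll` and the assumption of an already connected or bound `tcpip.Endpoint` are illustrative, not part of this change.

```go
package example

import (
	"bytes"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// writeAll is a hypothetical helper showing the reader-based Write call
// introduced by this change: the payload is wrapped in a bytes.Reader, which
// satisfies tcpip.Payloader (io.Reader plus Len). The returned count may be
// short for non-atomic writes; callers decide how to handle that.
func writeAll(ep tcpip.Endpoint, payload []byte, opts tcpip.WriteOptions) (int64, *tcpip.Error) {
	var r bytes.Reader
	r.Reset(payload)
	return ep.Write(&r, opts)
}
```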
<-ch + var r bytes.Reader + r.Reset([]byte{0}) var wOpts tcpip.WriteOptions - if n, err := clientEP.Write(tcpip.SlicePayload(nil), wOpts); err != test.expectedWriteErr { - t.Errorf("got clientEP.Write(nil, %#v) = (%d, %s), want = (_, %s)", wOpts, n, err, test.expectedWriteErr) + if n, err := clientEP.Write(&r, wOpts); err != test.expectedWriteErr { + t.Errorf("got clientEP.Write(_, %#v) = (%d, %s), want = (_, %s)", wOpts, n, err, test.expectedWriteErr) } if test.expectedWriteErr == nil { @@ -404,20 +410,34 @@ func TestGetLinkAddress(t *testing.T) { ) tests := []struct { - name string - netProto tcpip.NetworkProtocolNumber - remoteAddr tcpip.Address - expectedLinkAddr bool + name string + netProto tcpip.NetworkProtocolNumber + remoteAddr tcpip.Address + expectedOk bool }{ { - name: "IPv4", + name: "IPv4 resolvable", netProto: ipv4.ProtocolNumber, remoteAddr: ipv4Addr2.AddressWithPrefix.Address, + expectedOk: true, }, { - name: "IPv6", + name: "IPv6 resolvable", netProto: ipv6.ProtocolNumber, remoteAddr: ipv6Addr2.AddressWithPrefix.Address, + expectedOk: true, + }, + { + name: "IPv4 not resolvable", + netProto: ipv4.ProtocolNumber, + remoteAddr: ipv4Addr3.AddressWithPrefix.Address, + expectedOk: false, + }, + { + name: "IPv6 not resolvable", + netProto: ipv6.ProtocolNumber, + remoteAddr: ipv6Addr3.AddressWithPrefix.Address, + expectedOk: false, }, } @@ -432,27 +452,279 @@ func TestGetLinkAddress(t *testing.T) { host1Stack, _ := setupStack(t, stackOpts, host1NICID, host2NICID) - for i := 0; i < 2; i++ { - addr, ch, err := host1Stack.GetLinkAddress(host1NICID, test.remoteAddr, "", test.netProto, func(tcpip.LinkAddress, bool) {}) - var want *tcpip.Error - if i == 0 { - want = tcpip.ErrWouldBlock + ch := make(chan stack.LinkResolutionResult, 1) + if err := host1Stack.GetLinkAddress(host1NICID, test.remoteAddr, "", test.netProto, func(r stack.LinkResolutionResult) { + ch <- r + }); err != tcpip.ErrWouldBlock { + t.Fatalf("got host1Stack.GetLinkAddress(%d, %s, '', %d, _) = %s, want = %s", host1NICID, test.remoteAddr, test.netProto, err, tcpip.ErrWouldBlock) + } + wantRes := stack.LinkResolutionResult{Success: test.expectedOk} + if test.expectedOk { + wantRes.LinkAddress = linkAddr2 + } + if diff := cmp.Diff(wantRes, <-ch); diff != "" { + t.Fatalf("link resolution result mismatch (-want +got):\n%s", diff) + } + }) + } + }) + } +} + +func TestRouteResolvedFields(t *testing.T) { + const ( + host1NICID = 1 + host2NICID = 4 + ) + + tests := []struct { + name string + netProto tcpip.NetworkProtocolNumber + localAddr tcpip.Address + remoteAddr tcpip.Address + immediatelyResolvable bool + expectedSuccess bool + expectedLinkAddr tcpip.LinkAddress + }{ + { + name: "IPv4 immediately resolvable", + netProto: ipv4.ProtocolNumber, + localAddr: ipv4Addr1.AddressWithPrefix.Address, + remoteAddr: header.IPv4AllSystems, + immediatelyResolvable: true, + expectedSuccess: true, + expectedLinkAddr: header.EthernetAddressFromMulticastIPv4Address(header.IPv4AllSystems), + }, + { + name: "IPv6 immediately resolvable", + netProto: ipv6.ProtocolNumber, + localAddr: ipv6Addr1.AddressWithPrefix.Address, + remoteAddr: header.IPv6AllNodesMulticastAddress, + immediatelyResolvable: true, + expectedSuccess: true, + expectedLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllNodesMulticastAddress), + }, + { + name: "IPv4 resolvable", + netProto: ipv4.ProtocolNumber, + localAddr: ipv4Addr1.AddressWithPrefix.Address, + remoteAddr: ipv4Addr2.AddressWithPrefix.Address, + immediatelyResolvable: false, + 
expectedSuccess: true, + expectedLinkAddr: linkAddr2, + }, + { + name: "IPv6 resolvable", + netProto: ipv6.ProtocolNumber, + localAddr: ipv6Addr1.AddressWithPrefix.Address, + remoteAddr: ipv6Addr2.AddressWithPrefix.Address, + immediatelyResolvable: false, + expectedSuccess: true, + expectedLinkAddr: linkAddr2, + }, + { + name: "IPv4 not resolvable", + netProto: ipv4.ProtocolNumber, + localAddr: ipv4Addr1.AddressWithPrefix.Address, + remoteAddr: ipv4Addr3.AddressWithPrefix.Address, + immediatelyResolvable: false, + expectedSuccess: false, + }, + { + name: "IPv6 not resolvable", + netProto: ipv6.ProtocolNumber, + localAddr: ipv6Addr1.AddressWithPrefix.Address, + remoteAddr: ipv6Addr3.AddressWithPrefix.Address, + immediatelyResolvable: false, + expectedSuccess: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for _, useNeighborCache := range []bool{true, false} { + t.Run(fmt.Sprintf("UseNeighborCache=%t", useNeighborCache), func(t *testing.T) { + stackOpts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, + UseNeighborCache: useNeighborCache, + } + + host1Stack, _ := setupStack(t, stackOpts, host1NICID, host2NICID) + r, err := host1Stack.FindRoute(host1NICID, "", test.remoteAddr, test.netProto, false /* multicastLoop */) + if err != nil { + t.Fatalf("host1Stack.FindRoute(%d, '', %s, %d, false): %s", host1NICID, test.remoteAddr, test.netProto, err) + } + defer r.Release() + + var wantRouteInfo stack.RouteInfo + wantRouteInfo.LocalLinkAddress = linkAddr1 + wantRouteInfo.LocalAddress = test.localAddr + wantRouteInfo.RemoteAddress = test.remoteAddr + wantRouteInfo.NetProto = test.netProto + wantRouteInfo.Loop = stack.PacketOut + wantRouteInfo.RemoteLinkAddress = test.expectedLinkAddr + + ch := make(chan stack.ResolvedFieldsResult, 1) + + if !test.immediatelyResolvable { + wantUnresolvedRouteInfo := wantRouteInfo + wantUnresolvedRouteInfo.RemoteLinkAddress = "" + + if err := r.ResolvedFields(func(r stack.ResolvedFieldsResult) { + ch <- r + }); err != tcpip.ErrWouldBlock { + t.Errorf("got r.ResolvedFields(_) = %s, want = %s", err, tcpip.ErrWouldBlock) } - if err != want { - t.Fatalf("got host1Stack.GetLinkAddress(%d, %s, '', %d, _) = (%s, _, %s), want = (_, _, %s)", host1NICID, test.remoteAddr, test.netProto, addr, err, want) + if diff := cmp.Diff(stack.ResolvedFieldsResult{RouteInfo: wantRouteInfo, Success: test.expectedSuccess}, <-ch, cmp.AllowUnexported(stack.RouteInfo{})); diff != "" { + t.Errorf("route resolve result mismatch (-want +got):\n%s", diff) } - if i == 0 { - <-ch - continue + if !test.expectedSuccess { + return } - if addr != linkAddr2 { - t.Fatalf("got addr = %s, want = %s", addr, linkAddr2) + // At this point the neighbor table should be populated so the route + // should be immediately resolvable. 
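Editor's note: `Stack.GetLinkAddress` (and `Route.ResolvedFields`, exercised above) now reports its result through a callback instead of returning an address plus a completion channel. A minimal sketch of blocking on that callback follows; the helper name is hypothetical, and it assumes the callback also fires synchronously when resolution completes immediately, as the `ResolvedFields` test above relies on.

```go
package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// resolveLinkAddress turns the callback-based GetLinkAddress API into a
// blocking call. ErrWouldBlock only means resolution is in flight and the
// callback will fire later; any other error is treated as terminal here.
func resolveLinkAddress(s *stack.Stack, nic tcpip.NICID, addr tcpip.Address, proto tcpip.NetworkProtocolNumber) (tcpip.LinkAddress, bool) {
	ch := make(chan stack.LinkResolutionResult, 1)
	err := s.GetLinkAddress(nic, addr, "" /* localAddr */, proto, func(r stack.LinkResolutionResult) {
		ch <- r
	})
	switch err {
	case nil, tcpip.ErrWouldBlock:
		// Assumption: when err is nil the address was already known and the
		// callback has been invoked synchronously, mirroring how the
		// ResolvedFields test above treats immediately resolvable routes.
		res := <-ch
		return res.LinkAddress, res.Success
	default:
		return "", false
	}
}
```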
+ } + + if err := r.ResolvedFields(func(r stack.ResolvedFieldsResult) { + ch <- r + }); err != nil { + t.Errorf("r.ResolvedFields(_): %s", err) + } + select { + case routeResolveRes := <-ch: + if diff := cmp.Diff(stack.ResolvedFieldsResult{RouteInfo: wantRouteInfo, Success: true}, routeResolveRes, cmp.AllowUnexported(stack.RouteInfo{})); diff != "" { + t.Errorf("route resolve result from resolved route mismatch (-want +got):\n%s", diff) } + default: + t.Fatal("expected route to be immediately resolvable") } }) } }) } } + +func TestWritePacketsLinkResolution(t *testing.T) { + const ( + host1NICID = 1 + host2NICID = 4 + ) + + tests := []struct { + name string + netProto tcpip.NetworkProtocolNumber + remoteAddr tcpip.Address + expectedWriteErr *tcpip.Error + }{ + { + name: "IPv4", + netProto: ipv4.ProtocolNumber, + remoteAddr: ipv4Addr2.AddressWithPrefix.Address, + expectedWriteErr: nil, + }, + { + name: "IPv6", + netProto: ipv6.ProtocolNumber, + remoteAddr: ipv6Addr2.AddressWithPrefix.Address, + expectedWriteErr: nil, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + stackOpts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + } + + host1Stack, host2Stack := setupStack(t, stackOpts, host1NICID, host2NICID) + + var serverWQ waiter.Queue + serverWE, serverCH := waiter.NewChannelEntry(nil) + serverWQ.EventRegister(&serverWE, waiter.EventIn) + serverEP, err := host2Stack.NewEndpoint(udp.ProtocolNumber, test.netProto, &serverWQ) + if err != nil { + t.Fatalf("host2Stack.NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.netProto, err) + } + defer serverEP.Close() + + serverAddr := tcpip.FullAddress{Port: 1234} + if err := serverEP.Bind(serverAddr); err != nil { + t.Fatalf("serverEP.Bind(%#v): %s", serverAddr, err) + } + + r, err := host1Stack.FindRoute(host1NICID, "", test.remoteAddr, test.netProto, false /* multicastLoop */) + if err != nil { + t.Fatalf("host1Stack.FindRoute(%d, '', %s, %d, false): %s", host1NICID, test.remoteAddr, test.netProto, err) + } + defer r.Release() + + data := []byte{1, 2} + var pkts stack.PacketBufferList + for _, d := range data { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(r.MaxHeaderLength()), + Data: buffer.View([]byte{d}).ToVectorisedView(), + }) + pkt.TransportProtocolNumber = udp.ProtocolNumber + length := uint16(pkt.Size()) + udpHdr := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) + udpHdr.Encode(&header.UDPFields{ + SrcPort: 5555, + DstPort: serverAddr.Port, + Length: length, + }) + xsum := r.PseudoHeaderChecksum(udp.ProtocolNumber, length) + for _, v := range pkt.Data.Views() { + xsum = header.Checksum(v, xsum) + } + udpHdr.SetChecksum(^udpHdr.CalculateChecksum(xsum)) + + pkts.PushBack(pkt) + } + + params := stack.NetworkHeaderParams{ + Protocol: udp.ProtocolNumber, + TTL: 64, + TOS: stack.DefaultTOS, + } + + if n, err := r.WritePackets(nil /* gso */, pkts, params); err != nil { + t.Fatalf("r.WritePackets(nil, %#v, _): %s", params, err) + } else if want := pkts.Len(); want != n { + t.Fatalf("got r.WritePackets(nil, %#v, _) = %d, want = %d", n, params, want) + } + + var writer bytes.Buffer + count := 0 + for { + var rOpts tcpip.ReadOptions + res, err := serverEP.Read(&writer, rOpts) + if err != nil { + if err == tcpip.ErrWouldBlock { + // Should not have anymore bytes to read after we read the sent + 
// number of bytes. + if count == len(data) { + break + } + + <-serverCH + continue + } + + t.Fatalf("serverEP.Read(_, %#v): %s", rOpts, err) + } + count += res.Count + } + + if got, want := host2Stack.Stats().UDP.PacketsReceived.Value(), uint64(len(data)); got != want { + t.Errorf("got host2Stack.Stats().UDP.PacketsReceived.Value() = %d, want = %d", got, want) + } + if diff := cmp.Diff(data, writer.Bytes()); diff != "" { + t.Errorf("read bytes mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go index 3b13ba04d..761283b66 100644 --- a/pkg/tcpip/tests/integration/loopback_test.go +++ b/pkg/tcpip/tests/integration/loopback_test.go @@ -232,7 +232,9 @@ func TestLoopbackAcceptAllInSubnetUDP(t *testing.T) { Port: localPort, }, } - n, err := sep.Write(tcpip.SlicePayload(data), wopts) + var r bytes.Reader + r.Reset(data) + n, err := sep.Write(&r, wopts) if err != nil { t.Fatalf("sep.Write(_, _): %s", err) } diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go index ce7c16bd1..9cc12fa58 100644 --- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go +++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go @@ -586,8 +586,10 @@ func TestReuseAddrAndBroadcast(t *testing.T) { Port: localPort, }, } - data := tcpip.SlicePayload([]byte{byte(i), 2, 3, 4}) - if n, err := wep.ep.Write(data, writeOpts); err != nil { + data := []byte{byte(i), 2, 3, 4} + var r bytes.Reader + r.Reset(data) + if n, err := wep.ep.Write(&r, writeOpts); err != nil { t.Fatalf("eps[%d].Write(_, _): %s", i, err) } else if want := int64(len(data)); n != want { t.Fatalf("got eps[%d].Write(_, _) = (%d, nil), want = (%d, nil)", i, n, want) diff --git a/pkg/tcpip/tests/integration/route_test.go b/pkg/tcpip/tests/integration/route_test.go index b222d2b05..35ee7437a 100644 --- a/pkg/tcpip/tests/integration/route_test.go +++ b/pkg/tcpip/tests/integration/route_test.go @@ -194,9 +194,11 @@ func TestLocalPing(t *testing.T) { return } - payload := tcpip.SlicePayload(test.icmpBuf(t)) + payload := test.icmpBuf(t) + var r bytes.Reader + r.Reset(payload) var wOpts tcpip.WriteOptions - if n, err := ep.Write(payload, wOpts); err != nil { + if n, err := ep.Write(&r, wOpts); err != nil { t.Fatalf("ep.Write(%#v, %#v): %s", payload, wOpts, err) } else if n != int64(len(payload)) { t.Fatalf("got ep.Write(%#v, %#v) = (%d, nil), want = (%d, nil)", payload, wOpts, n, len(payload)) @@ -329,12 +331,14 @@ func TestLocalUDP(t *testing.T) { Port: 80, } - clientPayload := tcpip.SlicePayload([]byte{1, 2, 3, 4}) + clientPayload := []byte{1, 2, 3, 4} { + var r bytes.Reader + r.Reset(clientPayload) wOpts := tcpip.WriteOptions{ To: &serverAddr, } - if n, err := client.Write(clientPayload, wOpts); err != subTest.expectedWriteErr { + if n, err := client.Write(&r, wOpts); err != subTest.expectedWriteErr { t.Fatalf("got client.Write(%#v, %#v) = (%d, %s), want = (_, %s)", clientPayload, wOpts, n, err, subTest.expectedWriteErr) } else if subTest.expectedWriteErr != nil { // Nothing else to test if we expected not to be able to send the @@ -376,12 +380,14 @@ func TestLocalUDP(t *testing.T) { } } - serverPayload := tcpip.SlicePayload([]byte{1, 2, 3, 4}) + serverPayload := []byte{1, 2, 3, 4} { + var r bytes.Reader + r.Reset(serverPayload) wOpts := tcpip.WriteOptions{ To: &clientAddr, } - if n, err := server.Write(serverPayload, wOpts); err != nil { + if n, err := server.Write(&r, wOpts); err != 
nil { t.Fatalf("server.Write(%#v, %#v): %s", serverPayload, wOpts, err) } else if n != int64(len(serverPayload)) { t.Fatalf("got server.Write(%#v, %#v) = (%d, nil), want = (%d, nil)", serverPayload, wOpts, n, len(serverPayload)) diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 256e19296..e4bcd3120 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -69,8 +69,7 @@ type endpoint struct { rcvClosed bool // The following fields are protected by the mu mutex. - mu sync.RWMutex `state:"nosave"` - sndBufSize int + mu sync.RWMutex `state:"nosave"` // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags state endpointState @@ -94,11 +93,17 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt }, waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024, state: stateInitial, uniqueID: s.UniqueID(), } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits) + ep.ops.SetSendBufferSize(32*1024, false /* notify */) + + // Override with stack defaults. + var ss tcpip.SendBufferSizeOption + if err := s.Option(&ss); err == nil { + ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) + } return ep, nil } @@ -119,7 +124,7 @@ func (e *endpoint) Close() { e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite switch e.state { case stateBound, stateConnected: - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, ports.Flags{}, 0 /* bindToDevice */) + e.stack.UnregisterTransportEndpoint([]tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, ports.Flags{}, 0 /* bindToDevice */) } // Close the receive list and drain it. @@ -313,11 +318,12 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, *tc route = r } - v, err := p.FullPayload() - if err != nil { - return 0, err + v := make([]byte, p.Len()) + if _, err := io.ReadFull(p, v); err != nil { + return 0, tcpip.ErrBadBuffer } + var err *tcpip.Error switch e.NetProto { case header.IPv4ProtocolNumber: err = send4(route, e.ID.LocalPort, v, e.ttl, e.owner) @@ -362,11 +368,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } e.rcvMu.Unlock() return v, nil - case tcpip.SendBufferSizeOption: - e.mu.Lock() - v := e.sndBufSize - e.mu.Unlock() - return v, nil case tcpip.ReceiveBufferSizeOption: e.rcvMu.Lock() @@ -578,14 +579,14 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindToDevice */) + err := e.stack.RegisterTransportEndpoint(netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindToDevice */) return id, err } // We need to find a port for the endpoint. 
_, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { id.LocalPort = p - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindtodevice */) + err := e.stack.RegisterTransportEndpoint(netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindtodevice */) switch err { case nil: return true, nil diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go index 9d263c0ec..afe96998a 100644 --- a/pkg/tcpip/transport/icmp/endpoint_state.go +++ b/pkg/tcpip/transport/icmp/endpoint_state.go @@ -69,6 +69,7 @@ func (e *endpoint) afterLoad() { // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume(s *stack.Stack) { e.stack = s + e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits) if e.state != stateBound && e.state != stateConnected { return diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index c0d6fb442..d48877677 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -79,13 +79,11 @@ type endpoint struct { rcvClosed bool // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - sndBufSizeMax int - closed bool - stats tcpip.TransportEndpointStats `state:"nosave"` - bound bool - boundNIC tcpip.NICID + mu sync.RWMutex `state:"nosave"` + closed bool + stats tcpip.TransportEndpointStats `state:"nosave"` + bound bool + boundNIC tcpip.NICID // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` @@ -106,14 +104,13 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb netProto: netProto, waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024, } - ep.ops.InitHandler(ep) + ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits) // Override with stack defaults. - var ss stack.SendBufferSizeOption + var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { - ep.sndBufSizeMax = ss.Default + ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs stack.ReceiveBufferSizeOption @@ -207,7 +204,7 @@ func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResul return res, nil } -func (*endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, *tcpip.Error) { +func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, *tcpip.Error) { // TODO(gvisor.dev/issue/173): Implement. return 0, tcpip.ErrInvalidOptionValue } @@ -320,24 +317,6 @@ func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { - case tcpip.SendBufferSizeOption: - // Make sure the send buffer size is within the min and max - // allowed. - var ss stack.SendBufferSizeOption - if err := ep.stack.Option(&ss); err != nil { - panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err)) - } - if v > ss.Max { - v = ss.Max - } - if v < ss.Min { - v = ss.Min - } - ep.mu.Lock() - ep.sndBufSizeMax = v - ep.mu.Unlock() - return nil - case tcpip.ReceiveBufferSizeOption: // Make sure the receive buffer size is within the min and max // allowed. 
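Editor's note: the removed `tcpip.SendBufferSizeOption` cases here and in the other endpoints mean SO_SNDBUF is no longer endpoint-private state; it lives in `tcpip.SocketOptions`, which clamps values using the limits handler registered via `InitHandler`. A hedged sketch of how a caller now sets and reads the value, using the getter/setter signatures as they appear in the TCP hunks later in this diff:

```go
package example

import "gvisor.dev/gvisor/pkg/tcpip"

// setAndGetSendBufferSize shows where SO_SNDBUF now lives: values are stored
// and clamped by tcpip.SocketOptions rather than by per-endpoint
// {Set,Get}SockOptInt cases. The notify flag matches the diff's usage (false
// only when applying stack defaults at construction/restore time).
func setAndGetSendBufferSize(ep tcpip.Endpoint, v int64) int {
	ops := ep.SocketOptions()
	ops.SetSendBufferSize(v, true /* notify */)
	got, err := ops.GetSendBufferSize()
	if err != nil {
		// Limits come from the handler registered via InitHandler, so a
		// failure here indicates a programming error (the TCP hunk panics).
		panic(err)
	}
	return int(got)
}
```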
@@ -395,12 +374,6 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { ep.rcvMu.Unlock() return v, nil - case tcpip.SendBufferSizeOption: - ep.mu.Lock() - v := ep.sndBufSizeMax - ep.mu.Unlock() - return v, nil - case tcpip.ReceiveBufferSizeOption: ep.rcvMu.Lock() v := ep.rcvBufSizeMax diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go index e2fa96d17..4d98fb051 100644 --- a/pkg/tcpip/transport/packet/endpoint_state.go +++ b/pkg/tcpip/transport/packet/endpoint_state.go @@ -63,8 +63,8 @@ func (ep *endpoint) loadRcvBufSizeMax(max int) { // afterLoad is invoked by stateify. func (ep *endpoint) afterLoad() { - // StackFromEnv is a stack used specifically for save/restore. ep.stack = stack.StackFromEnv + ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits) // TODO(gvisor.dev/173): Once bind is supported, choose the right NIC. if err := ep.stack.RegisterPacketEndpoint(0, ep.netProto, ep); err != nil { diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index ae743f75e..6c6d45188 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -76,12 +76,10 @@ type endpoint struct { rcvClosed bool // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - sndBufSizeMax int - closed bool - connected bool - bound bool + mu sync.RWMutex `state:"nosave"` + closed bool + connected bool + bound bool // route is the route to a remote network endpoint. It is set via // Connect(), and is valid only when conneted is true. route *stack.Route `state:"manual"` @@ -112,16 +110,16 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt }, waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, - sndBufSizeMax: 32 * 1024, associated: associated, } - e.ops.InitHandler(e) + e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits) e.ops.SetHeaderIncluded(!associated) + e.ops.SetSendBufferSize(32*1024, false /* notify */) // Override with stack defaults. - var ss stack.SendBufferSizeOption + var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { - e.sndBufSizeMax = ss.Default + e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs stack.ReceiveBufferSizeOption @@ -138,7 +136,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt return e, nil } - if err := e.stack.RegisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e); err != nil { + if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { return nil, err } @@ -159,7 +157,7 @@ func (e *endpoint) Close() { return } - e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e) + e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) e.rcvMu.Lock() defer e.rcvMu.Unlock() @@ -280,9 +278,9 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, *tc return 0, tcpip.ErrInvalidEndpointState } - payloadBytes, err := p.FullPayload() - if err != nil { - return 0, err + payloadBytes := make([]byte, p.Len()) + if _, err := io.ReadFull(p, payloadBytes); err != nil { + return 0, tcpip.ErrBadBuffer } // If this is an unassociated socket and callee provided a nonzero @@ -407,15 +405,18 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { if e.associated { // Re-register the endpoint with the appropriate NIC. 
- if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil { + if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { route.Release() return err } - e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e) + e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) e.RegisterNICID = nic } - // Save the route we've connected via. + if e.route != nil { + // If the endpoint was previously connected then release any previous route. + e.route.Release() + } e.route = route e.connected = true @@ -449,16 +450,16 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { defer e.mu.Unlock() // If a local address was specified, verify that it's valid. - if len(addr.Addr) != 0 && e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 { + if len(addr.Addr) != 0 && e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, addr.Addr) == 0 { return tcpip.ErrBadLocalAddress } if e.associated { // Re-register the endpoint with the appropriate NIC. - if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil { + if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { return err } - e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e) + e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) e.RegisterNICID = addr.NIC e.BindNICID = addr.NIC } @@ -511,24 +512,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { - case tcpip.SendBufferSizeOption: - // Make sure the send buffer size is within the min and max - // allowed. - var ss stack.SendBufferSizeOption - if err := e.stack.Option(&ss); err != nil { - panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err)) - } - if v > ss.Max { - v = ss.Max - } - if v < ss.Min { - v = ss.Min - } - e.mu.Lock() - e.sndBufSizeMax = v - e.mu.Unlock() - return nil - case tcpip.ReceiveBufferSizeOption: // Make sure the receive buffer size is within the min and max // allowed. @@ -570,12 +553,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.rcvMu.Unlock() return v, nil - case tcpip.SendBufferSizeOption: - e.mu.Lock() - v := e.sndBufSizeMax - e.mu.Unlock() - return v, nil - case tcpip.ReceiveBufferSizeOption: e.rcvMu.Lock() v := e.rcvBufSizeMax diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go index 4a7e1c039..65c64d99f 100644 --- a/pkg/tcpip/transport/raw/endpoint_state.go +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -69,6 +69,7 @@ func (e *endpoint) afterLoad() { // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume(s *stack.Stack) { e.stack = s + e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits) // If the endpoint is connected, re-connect. 
if e.connected { @@ -93,7 +94,7 @@ func (e *endpoint) Resume(s *stack.Stack) { } if e.associated { - if err := e.stack.RegisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e); err != nil { + if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { panic(err) } } diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 7e81203ba..fcdd032c5 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -99,7 +99,6 @@ go_test( "//pkg/rand", "//pkg/sync", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/checker", "//pkg/tcpip/header", "//pkg/tcpip/link/loopback", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 6921de0f1..e475c36f3 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -288,7 +288,7 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q } // Register new endpoint so that packets are routed to it. - if err := ep.stack.RegisterTransportEndpoint(ep.boundNICID, ep.effectiveNetProtos, ProtocolNumber, ep.ID, ep, ep.boundPortFlags, ep.boundBindToDevice); err != nil { + if err := ep.stack.RegisterTransportEndpoint(ep.effectiveNetProtos, ProtocolNumber, ep.ID, ep, ep.boundPortFlags, ep.boundBindToDevice); err != nil { ep.mu.Unlock() ep.Close() @@ -692,7 +692,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er } // Register new endpoint so that packets are routed to it. - if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.boundPortFlags, n.boundBindToDevice); err != nil { + if err := n.stack.RegisterTransportEndpoint(n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.boundPortFlags, n.boundBindToDevice); err != nil { n.mu.Unlock() n.Close() diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index f711cd4df..62954d7e4 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -895,43 +895,46 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn return err } -func (e *endpoint) handleWrite() *tcpip.Error { - // Move packets from send queue to send list. The queue is accessible - // from other goroutines and protected by the send mutex, while the send - // list is only accessible from the handler goroutine, so it needs no - // mutexes. +func (e *endpoint) handleWrite() { e.sndBufMu.Lock() + next := e.drainSendQueueLocked() + e.sndBufMu.Unlock() + + e.sendData(next) +} +// Move packets from send queue to send list. +// +// Precondition: e.sndBufMu must be locked. +func (e *endpoint) drainSendQueueLocked() *segment { first := e.sndQueue.Front() if first != nil { e.snd.writeList.PushBackList(&e.sndQueue) e.sndBufInQueue = 0 } + return first +} - e.sndBufMu.Unlock() - +// Precondition: e.mu must be locked. +func (e *endpoint) sendData(next *segment) { // Initialize the next segment to write if it's currently nil. if e.snd.writeNext == nil { - e.snd.writeNext = first + e.snd.writeNext = next } // Push out any new packets. e.snd.sendData() - - return nil } -func (e *endpoint) handleClose() *tcpip.Error { +func (e *endpoint) handleClose() { if !e.EndpointState().connected() { - return nil + return } // Drain the send queue. e.handleWrite() // Mark send side as closed. 
e.snd.closed = true - - return nil } // resetConnectionLocked puts the endpoint in an error state with the given @@ -1348,11 +1351,17 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ }{ { w: &e.sndWaker, - f: e.handleWrite, + f: func() *tcpip.Error { + e.handleWrite() + return nil + }, }, { w: &e.sndCloseWaker, - f: e.handleClose, + f: func() *tcpip.Error { + e.handleClose() + return nil + }, }, { w: &closeWaker, diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index 1d1b01a6c..809c88732 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -15,11 +15,11 @@ package tcp_test import ( + "strings" "testing" "time" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -415,8 +415,10 @@ func testV4Accept(t *testing.T, c *context.Context) { t.Fatalf("Unexpected remote address: got %v, want %v", addr.Addr, context.TestAddr) } + var r strings.Reader data := "Don't panic" - nep.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}) + r.Reset(data) + nep.Write(&r, tcpip.WriteOptions{}) b = c.GetPacket() tcp = header.TCP(header.IPv4(b).Payload()) if string(tcp.Payload()) != data { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index ea509ac73..b6bd6d455 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -557,7 +557,6 @@ type endpoint struct { // When the send side is closed, the protocol goroutine is notified via // sndCloseWaker, and sndClosed is set to true. sndBufMu sync.Mutex `state:"nosave"` - sndBufSize int sndBufUsed int sndClosed bool sndBufInQueue seqnum.Size @@ -869,7 +868,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue waiterQueue: waiterQueue, state: StateInitial, rcvBufSize: DefaultReceiveBufferSize, - sndBufSize: DefaultSendBufferSize, sndMTU: int(math.MaxInt32), keepalive: keepalive{ // Linux defaults. @@ -882,13 +880,14 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue windowClamp: DefaultReceiveBufferSize, maxSynRetries: DefaultSynRetries, } - e.ops.InitHandler(e) + e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetQuickAck(true) + e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { - e.sndBufSize = ss.Default + e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.TCPReceiveBufferSizeRangeOption @@ -967,7 +966,8 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // Determine if the endpoint is writable if requested. if (mask & waiter.EventOut) != 0 { e.sndBufMu.Lock() - if e.sndClosed || e.sndBufUsed < e.sndBufSize { + sndBufSize := e.getSendBufferSize() + if e.sndClosed || e.sndBufUsed < sndBufSize { result |= waiter.EventOut } e.sndBufMu.Unlock() @@ -1087,7 +1087,7 @@ func (e *endpoint) closeNoShutdownLocked() { // in Listen() when trying to register. 
if e.EndpointState() == StateListen && e.isPortReserved { if e.isRegistered { - e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) + e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } @@ -1161,7 +1161,7 @@ func (e *endpoint) cleanupLocked() { e.workerCleanup = false if e.isRegistered { - e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) + e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } @@ -1499,7 +1499,8 @@ func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { return 0, tcpip.ErrClosedForSend } - avail := e.sndBufSize - e.sndBufUsed + sndBufSize := e.getSendBufferSize() + avail := sndBufSize - e.sndBufUsed if avail <= 0 { return 0, tcpip.ErrWouldBlock } @@ -1513,69 +1514,79 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, *tc // and opts.EndOfRecord are also ignored. e.LockUser() - e.sndBufMu.Lock() - - avail, err := e.isEndpointWritableLocked() - if err != nil { - e.sndBufMu.Unlock() - e.UnlockUser() - e.stats.WriteErrors.WriteClosed.Increment() - return 0, err - } - - // We can release locks while copying data. - // - // This is not possible if atomic is set, because we can't allow the - // available buffer space to be consumed by some other caller while we - // are copying data in. - if !opts.Atomic { - e.sndBufMu.Unlock() - e.UnlockUser() - } - - // Fetch data. - v, perr := p.Payload(avail) - if perr != nil || len(v) == 0 { - // Note that perr may be nil if len(v) == 0. - if opts.Atomic { - e.sndBufMu.Unlock() - e.UnlockUser() - } - return 0, perr - } + defer e.UnlockUser() - if !opts.Atomic { - // Since we released locks in between it's possible that the - // endpoint transitioned to a CLOSED/ERROR states so make - // sure endpoint is still writable before trying to write. - e.LockUser() + nextSeg, n, err := func() (*segment, int, *tcpip.Error) { e.sndBufMu.Lock() + defer e.sndBufMu.Unlock() + avail, err := e.isEndpointWritableLocked() if err != nil { - e.sndBufMu.Unlock() - e.UnlockUser() e.stats.WriteErrors.WriteClosed.Increment() - return 0, err + return nil, 0, err } - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] + v, err := func() ([]byte, *tcpip.Error) { + // We can release locks while copying data. + // + // This is not possible if atomic is set, because we can't allow the + // available buffer space to be consumed by some other caller while we + // are copying data in. + if !opts.Atomic { + e.sndBufMu.Unlock() + defer e.sndBufMu.Lock() + + e.UnlockUser() + defer e.LockUser() + } + + // Fetch data. + if l := p.Len(); l < avail { + avail = l + } + if avail == 0 { + return nil, nil + } + v := make([]byte, avail) + if _, err := io.ReadFull(p, v); err != nil { + return nil, tcpip.ErrBadBuffer + } + return v, nil + }() + if len(v) == 0 || err != nil { + return nil, 0, err + } + + if !opts.Atomic { + // Since we released locks in between it's possible that the + // endpoint transitioned to a CLOSED/ERROR states so make + // sure endpoint is still writable before trying to write. 
+ avail, err := e.isEndpointWritableLocked() + if err != nil { + e.stats.WriteErrors.WriteClosed.Increment() + return nil, 0, err + } + + // Discard any excess data copied in due to avail being reduced due + // to a simultaneous write call to the socket. + if avail < len(v) { + v = v[:avail] + } } - } - // Add data to the send queue. - s := newOutgoingSegment(e.ID, v) - e.sndBufUsed += len(v) - e.sndBufInQueue += seqnum.Size(len(v)) - e.sndQueue.PushBack(s) - e.sndBufMu.Unlock() + // Add data to the send queue. + s := newOutgoingSegment(e.ID, v) + e.sndBufUsed += len(v) + e.sndBufInQueue += seqnum.Size(len(v)) + e.sndQueue.PushBack(s) - // Do the work inline. - e.handleWrite() - e.UnlockUser() - return int64(len(v)), nil + return e.drainSendQueueLocked(), len(v), nil + }() + if err != nil { + return 0, err + } + e.sendData(nextSeg) + return int64(n), nil } // selectWindowLocked returns the new window without checking for shrinking or scaling @@ -1682,6 +1693,14 @@ func (e *endpoint) OnCorkOptionSet(v bool) { } } +func (e *endpoint) getSendBufferSize() int { + sndBufSize, err := e.ops.GetSendBufferSize() + if err != nil { + panic(fmt.Sprintf("e.ops.GetSendBufferSize() = %s", err)) + } + return int(sndBufSize) +} + // SetSockOptInt sets a socket option. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 @@ -1775,31 +1794,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { e.rcvListMu.Unlock() e.UnlockUser() - case tcpip.SendBufferSizeOption: - // Make sure the send buffer size is within the min and max - // allowed. - var ss tcpip.TCPSendBufferSizeRangeOption - if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err != nil { - panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &ss, err)) - } - - if v > ss.Max { - v = ss.Max - } - - if v < math.MaxInt32/SegOverheadFactor { - v *= SegOverheadFactor - if v < ss.Min { - v = ss.Min - } - } else { - v = math.MaxInt32 - } - - e.sndBufMu.Lock() - e.sndBufSize = v - e.sndBufMu.Unlock() - case tcpip.TTLOption: e.LockUser() e.ttl = uint8(v) @@ -1985,12 +1979,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { case tcpip.ReceiveQueueSizeOption: return e.readyReceiveSize() - case tcpip.SendBufferSizeOption: - e.sndBufMu.Lock() - v := e.sndBufSize - e.sndBufMu.Unlock() - return v, nil - case tcpip.ReceiveBufferSizeOption: e.rcvListMu.Lock() v := e.rcvBufSize @@ -2190,7 +2178,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc if e.ID.LocalPort != 0 { // The endpoint is bound to a port, attempt to register it. 
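Editor's note: transport demuxer registration is now stack-wide, so `RegisterTransportEndpoint`, `CheckRegisterTransportEndpoint`, `StartTransportEndpointCleanup`, and the raw variants all lose their leading NICID argument throughout this diff. A thin, hypothetical wrapper showing the new call shape:

```go
package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// registerEndpoint illustrates the post-change, NIC-less registration
// signature used throughout this diff: the demuxer entry is keyed by protocol
// and TransportEndpointID only, with bindToDevice still passed for
// SO_BINDTODEVICE semantics.
func registerEndpoint(
	s *stack.Stack,
	netProtos []tcpip.NetworkProtocolNumber,
	transProto tcpip.TransportProtocolNumber,
	id stack.TransportEndpointID,
	ep stack.TransportEndpoint,
	flags ports.Flags,
	bindToDevice tcpip.NICID,
) *tcpip.Error {
	return s.RegisterTransportEndpoint(netProtos, transProto, id, ep, flags, bindToDevice)
}
```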
- err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) + err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) if err != nil { return err } @@ -2278,7 +2266,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc id := e.ID id.LocalPort = p - if err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil { + if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil { e.stack.ReleasePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, bindToDevice, addr) if err == tcpip.ErrPortInUse { return false, nil @@ -2482,7 +2470,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { } // Register the endpoint. - if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil { + if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil { return err } @@ -2604,7 +2592,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { // demuxer. Further connected endpoints always have a remote // address/port. Hence this will only return an error if there is a matching // listening endpoint. - if err := e.stack.CheckRegisterTransportEndpoint(nic, netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil { + if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil { return false } return true @@ -2739,13 +2727,14 @@ func (e *endpoint) HandleControlPacket(typ stack.ControlType, extra uint32, pkt // updateSndBufferUsage is called by the protocol goroutine when room opens up // in the send buffer. The number of newly available bytes is v. func (e *endpoint) updateSndBufferUsage(v int) { + sendBufferSize := e.getSendBufferSize() e.sndBufMu.Lock() - notify := e.sndBufUsed >= e.sndBufSize>>1 + notify := e.sndBufUsed >= sendBufferSize>>1 e.sndBufUsed -= v - // We only notify when there is half the sndBufSize available after + // We only notify when there is half the sendBufferSize available after // a full buffer event occurs. This ensures that we don't wake up // writers to queue just 1-2 segments and go back to sleep. - notify = notify && e.sndBufUsed < e.sndBufSize>>1 + notify = notify && e.sndBufUsed < int(sendBufferSize)>>1 e.sndBufMu.Unlock() if notify { @@ -2957,8 +2946,9 @@ func (e *endpoint) completeState() stack.TCPEndpointState { s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() // Copy endpoint send state. + sndBufSize := e.getSendBufferSize() e.sndBufMu.Lock() - s.SndBufSize = e.sndBufSize + s.SndBufSize = sndBufSize s.SndBufUsed = e.sndBufUsed s.SndClosed = e.sndClosed s.SndBufInQueue = e.sndBufInQueue @@ -3103,3 +3093,17 @@ func (e *endpoint) Wait() { func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } + +// GetTCPSendBufferLimits is used to get send buffer size limits for TCP. 
+func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { + var ss tcpip.TCPSendBufferSizeRangeOption + if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { + panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) + } + + return tcpip.SendBufferSizeOption{ + Min: ss.Min, + Default: ss.Default, + Max: ss.Max, + } +} diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index ba67176b5..4a01c81b4 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -55,7 +55,9 @@ func (e *endpoint) beforeSave() { case epState.connected() || epState.handshake(): if !e.route.HasSaveRestoreCapability() { if !e.route.HasDisconncetOkCapability() { - panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.ID.LocalAddress, e.ID.LocalPort, e.ID.RemoteAddress, e.ID.RemotePort)}) + panic(&tcpip.ErrSaveRejection{ + Err: fmt.Errorf("endpoint cannot be saved in connected state: local %s:%d, remote %s:%d", e.ID.LocalAddress, e.ID.LocalPort, e.ID.RemoteAddress, e.ID.RemotePort), + }) } e.resetConnectionLocked(tcpip.ErrConnectionAborted) e.mu.Unlock() @@ -179,14 +181,16 @@ func (e *endpoint) afterLoad() { // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume(s *stack.Stack) { e.stack = s + e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits) e.segmentQueue.thaw() epState := e.origEndpointState switch epState { case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished: var ss tcpip.TCPSendBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { - if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max { - panic(fmt.Sprintf("endpoint.sndBufSize %d is outside the min and max allowed [%d, %d]", e.sndBufSize, ss.Min, ss.Max)) + sendBufferSize := e.getSendBufferSize() + if sendBufferSize < ss.Min || sendBufferSize > ss.Max { + panic(fmt.Sprintf("endpoint sendBufferSize %d is outside the min and max allowed [%d, %d]", sendBufferSize, ss.Min, ss.Max)) } } diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go index f7aaee23f..ced3a9c58 100644 --- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go +++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go @@ -21,13 +21,13 @@ package tcp_test import ( + "bytes" "fmt" "math" "testing" "time" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context" @@ -42,14 +42,16 @@ func TestFastRecovery(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) const iterations = 3 - data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1))) + data := make([]byte, 2*maxPayload*(tcp.InitialCwnd<<(iterations+1))) for i := range data { data[i] = byte(i) } // Write all the data in one shot. Packets will only be written at the // MTU size though. 
- if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -207,14 +209,16 @@ func TestExponentialIncreaseDuringSlowStart(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) const iterations = 3 - data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1))) + data := make([]byte, maxPayload*(tcp.InitialCwnd<<(iterations+1))) for i := range data { data[i] = byte(i) } // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -249,14 +253,16 @@ func TestCongestionAvoidance(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) const iterations = 3 - data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1))) + data := make([]byte, 2*maxPayload*(tcp.InitialCwnd<<(iterations+1))) for i := range data { data[i] = byte(i) } // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -353,15 +359,16 @@ func TestCubicCongestionAvoidance(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) const iterations = 3 - data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1))) - + data := make([]byte, 2*maxPayload*(tcp.InitialCwnd<<(iterations+1))) for i := range data { data[i] = byte(i) } // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -462,19 +469,20 @@ func TestRetransmit(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) const iterations = 3 - data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1))) + data := make([]byte, maxPayload*(tcp.InitialCwnd<<(iterations+1))) for i := range data { data[i] = byte(i) } // Write all the data in two shots. Packets will only be written at the // MTU size though. 
- half := data[:len(data)/2] - if _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data[:len(data)/2]) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } - half = data[len(data)/2:] - if _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { + r.Reset(data[len(data)/2:]) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go index 342eb5eb8..af915203b 100644 --- a/pkg/tcpip/transport/tcp/tcp_rack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -15,11 +15,11 @@ package tcp_test import ( + "bytes" "testing" "time" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -61,14 +61,16 @@ func TestRACKUpdate(t *testing.T) { setStackSACKPermitted(t, c, true) createConnectedWithSACKAndTS(c) - data := buffer.NewView(maxPayload) + data := make([]byte, maxPayload) for i := range data { data[i] = byte(i) } // Write the data. xmitTime = time.Now() - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -114,13 +116,15 @@ func TestRACKDetectReorder(t *testing.T) { }) setStackSACKPermitted(t, c, true) createConnectedWithSACKAndTS(c) - data := buffer.NewView(ackNumToVerify * maxPayload) + data := make([]byte, ackNumToVerify*maxPayload) for i := range data { data[i] = byte(i) } // Write the data. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -141,17 +145,19 @@ func TestRACKDetectReorder(t *testing.T) { <-probeDone } -func sendAndReceive(t *testing.T, c *context.Context, numPackets int) buffer.View { +func sendAndReceive(t *testing.T, c *context.Context, numPackets int) []byte { setStackSACKPermitted(t, c, true) createConnectedWithSACKAndTS(c) - data := buffer.NewView(numPackets * maxPayload) + data := make([]byte, numPackets*maxPayload) for i := range data { data[i] = byte(i) } // Write the data. 
- if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 6635bb815..5024bc925 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -15,6 +15,7 @@ package tcp_test import ( + "bytes" "fmt" "log" "reflect" @@ -22,7 +23,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -395,14 +395,16 @@ func TestSACKRecovery(t *testing.T) { createConnectedWithSACKAndTS(c) const iterations = 3 - data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1))) + data := make([]byte, 2*maxPayload*(tcp.InitialCwnd<<(iterations+1))) for i := range data { data[i] = byte(i) } // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 93683b921..87ff2b909 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -19,6 +19,7 @@ import ( "fmt" "io/ioutil" "math" + "strings" "testing" "time" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" @@ -1347,10 +1347,9 @@ func TestTOSV4(t *testing.T) { testV4Connect(t, c, checker.TOS(tos, 0)) data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -1396,10 +1395,9 @@ func TestTrafficClassV6(t *testing.T) { testV6Connect(t, c, checker.TOS(tos, 0)) data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -2176,10 +2174,9 @@ func TestSimpleSend(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -2217,10 +2214,9 @@ func TestZeroWindowSend(t *testing.T) { c.CreateConnected(789 /* iss */, 0 /* rcvWnd */, -1 /* epRcvBuf */) data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != 
nil { t.Fatalf("Write failed: %s", err) } @@ -2285,10 +2281,9 @@ func TestScaledWindowConnect(t *testing.T) { }) data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -2317,10 +2312,9 @@ func TestNonScaledWindowConnect(t *testing.T) { c.CreateConnected(789, 30000, 65535*3) data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -2391,10 +2385,9 @@ func TestScaledWindowAccept(t *testing.T) { } data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -2465,10 +2458,9 @@ func TestNonScaledWindowAccept(t *testing.T) { } data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -2632,9 +2624,10 @@ func TestSegmentMerging(t *testing.T) { // Send tcp.InitialCwnd number of segments to fill up // InitialWindow but don't ACK. That should prevent // anymore packets from going out. + var r bytes.Reader for i := 0; i < tcp.InitialCwnd; i++ { - view := buffer.NewViewFromBytes([]byte{0}) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + r.Reset([]byte{0}) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2644,8 +2637,8 @@ func TestSegmentMerging(t *testing.T) { var allData []byte for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { allData = append(allData, data...) - view := buffer.NewViewFromBytes(data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2714,8 +2707,9 @@ func TestDelay(t *testing.T) { var allData []byte for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { allData = append(allData, data...) 
- view := buffer.NewViewFromBytes(data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2761,8 +2755,9 @@ func TestUndelay(t *testing.T) { allData := [][]byte{{0}, {1, 2, 3}} for i, data := range allData { - view := buffer.NewViewFromBytes(data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2845,8 +2840,9 @@ func TestMSSNotDelayed(t *testing.T) { allData := [][]byte{{0}, make([]byte, maxPayload), make([]byte, maxPayload)} for i, data := range allData { - view := buffer.NewViewFromBytes(data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2894,10 +2890,9 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) { data[i] = byte(i) } - view := buffer.NewView(len(data)) - copy(view, data) - - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -3328,8 +3323,9 @@ func TestSendOnResetConnection(t *testing.T) { time.Sleep(1 * time.Second) // Try to write. - view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != tcpip.ErrConnectionReset { + var r bytes.Reader + r.Reset(make([]byte, 10)) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != tcpip.ErrConnectionReset { t.Fatalf("got c.EP.Write(...) = %s, want = %s", err, tcpip.ErrConnectionReset) } } @@ -3352,7 +3348,9 @@ func TestMaxRetransmitsTimeout(t *testing.T) { c.WQ.EventRegister(&waitEntry, waiter.EventHUp) defer c.WQ.EventUnregister(&waitEntry) - _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) + var r bytes.Reader + r.Reset(make([]byte, 1)) + _, err := c.EP.Write(&r, tcpip.WriteOptions{}) if err != nil { t.Fatalf("Write failed: %s", err) } @@ -3409,7 +3407,9 @@ func TestMaxRTO(t *testing.T) { c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) - _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) + var r bytes.Reader + r.Reset(make([]byte, 1)) + _, err := c.EP.Write(&r, tcpip.WriteOptions{}) if err != nil { t.Fatalf("Write failed: %s", err) } @@ -3458,7 +3458,9 @@ func TestRetransmitIPv4IDUniqueness(t *testing.T) { t.Fatalf("disabling PMTU discovery via sockopt to force DF=0 failed: %s", err) } - if _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(tc.size)), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(make([]byte, tc.size)) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } pkt := c.GetPacket() @@ -3595,8 +3597,10 @@ func TestFinWithNoPendingData(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) // Write something out, and have it acknowledged. 
- view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + view := make([]byte, 10) + var r bytes.Reader + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -3667,9 +3671,11 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { // Write enough segments to fill the congestion window before ACK'ing // any of them. - view := buffer.NewView(10) + view := make([]byte, 10) + var r bytes.Reader for i := tcp.InitialCwnd; i > 0; i-- { - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } } @@ -3754,8 +3760,10 @@ func TestFinWithPendingData(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) // Write something out, and acknowledge it to get cwnd to 2. - view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + view := make([]byte, 10) + var r bytes.Reader + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -3781,7 +3789,8 @@ func TestFinWithPendingData(t *testing.T) { }) // Write new data, but don't acknowledge it. - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -3841,8 +3850,10 @@ func TestFinWithPartialAck(t *testing.T) { // Write something out, and acknowledge it to get cwnd to 2. Also send // FIN from the test side. - view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + view := make([]byte, 10) + var r bytes.Reader + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -3879,7 +3890,8 @@ func TestFinWithPartialAck(t *testing.T) { ) // Write new data, but don't acknowledge it. - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -3985,8 +3997,10 @@ func scaledSendWindow(t *testing.T, scale uint8) { }) // Send some data. Check that it's capped by the window size. 
- view := buffer.NewView(65535) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + view := make([]byte, 65535) + var r bytes.Reader + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -4351,9 +4365,9 @@ func checkRecvBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { t.Helper() - s, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption) + s, err := ep.SocketOptions().GetSendBufferSize() if err != nil { - t.Fatalf("GetSockOpt failed: %s", err) + t.Fatalf("GetSendBufferSize failed: %s", err) } if int(s) != v { @@ -4459,9 +4473,7 @@ func TestMinMaxBufferSizes(t *testing.T) { checkRecvBufferSize(t, ep, 200) - if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 149); err != nil { - t.Fatalf("SetSockOptInt(SendBufferSizeOption, 299) failed: %s", err) - } + ep.SocketOptions().SetSendBufferSize(149, true) checkSendBufferSize(t, ep, 300) @@ -4473,9 +4485,7 @@ func TestMinMaxBufferSizes(t *testing.T) { // Values above max are capped at max and then doubled. checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20*2) - if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil { - t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err) - } + ep.SocketOptions().SetSendBufferSize(1+tcp.DefaultSendBufferSize*30, true) // Values above max are capped at max and then doubled. checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30*2) @@ -4610,9 +4620,9 @@ func TestSelfConnect(t *testing.T) { // Write something. data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - if _, err := ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := ep.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -4785,12 +4795,13 @@ func TestPathMTUDiscovery(t *testing.T) { // Send 3200 bytes of data. const writeSize = 3200 - data := buffer.NewView(writeSize) + data := make([]byte, writeSize) for i := range data { data[i] = byte(i) } - - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -5078,8 +5089,10 @@ func TestKeepalive(t *testing.T) { // Send some data and wait before ACKing it. Keepalives should be disabled // during this period. - view := buffer.NewView(3) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + view := make([]byte, 3) + var r bytes.Reader + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -5358,7 +5371,9 @@ func TestListenBacklogFull(t *testing.T) { // Now verify that the TCP socket is usable and in a connected state. data := "Don't panic" - newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}) + var r strings.Reader + r.Reset(data) + newEP.Write(&r, tcpip.WriteOptions{}) b := c.GetPacket() tcp := header.TCP(header.IPv4(b).Payload()) if string(tcp.Payload()) != data { @@ -5674,7 +5689,9 @@ func TestListenSynRcvdQueueFull(t *testing.T) { // Now verify that the TCP socket is usable and in a connected state. 
data := "Don't panic" - newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}) + var r strings.Reader + r.Reset(data) + newEP.Write(&r, tcpip.WriteOptions{}) pkt := c.GetPacket() tcp = header.TCP(header.IPv4(pkt).Payload()) if string(tcp.Payload()) != data { @@ -5908,7 +5925,9 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { // Now verify that the TCP socket is usable and in a connected state. data := "Don't panic" - if _, err := newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}); err != nil { + var r strings.Reader + r.Reset(data) + if _, err := newEP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -7103,10 +7122,10 @@ func TestTCPCloseWithData(t *testing.T) { // Now write a few bytes and then close the endpoint. data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } @@ -7204,8 +7223,10 @@ func TestTCPUserTimeout(t *testing.T) { } // Send some data and wait before ACKing it. - view := buffer.NewView(3) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + view := make([]byte, 3) + var r bytes.Reader + r.Reset(view) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %s", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index b65091c3c..5a9745ad7 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -22,7 +22,6 @@ import ( "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" @@ -152,10 +151,10 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS // Now send some data and validate that timestamp is echoed correctly in the ACK. data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Unexpected error from Write: %s", err) } @@ -215,10 +214,10 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd // Now send some data with the accepted connection endpoint and validate // that no timestamp option is sent in the TCP segment. data := []byte{1, 2, 3} - view := buffer.NewView(len(data)) - copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { t.Fatalf("Unexpected error from Write: %s", err) } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 9f9b3d510..4988ba29b 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -97,9 +97,7 @@ type endpoint struct { rcvClosed bool // The following fields are protected by the mu mutex. 
- mu sync.RWMutex `state:"nosave"` - sndBufSize int - sndBufSizeMax int + mu sync.RWMutex `state:"nosave"` // state must be read/set using the EndpointState()/setEndpointState() // methods. state EndpointState @@ -176,18 +174,18 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue // Linux defaults to TTL=1. multicastTTL: 1, rcvBufSizeMax: 32 * 1024, - sndBufSizeMax: 32 * 1024, multicastMemberships: make(map[multicastMembership]struct{}), state: StateInitial, uniqueID: s.UniqueID(), } - e.ops.InitHandler(e) + e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits) e.ops.SetMulticastLoop(true) + e.ops.SetSendBufferSize(32*1024, false /* notify */) // Override with stack defaults. - var ss stack.SendBufferSizeOption + var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { - e.sndBufSizeMax = ss.Default + e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs stack.ReceiveBufferSizeOption @@ -246,7 +244,7 @@ func (e *endpoint) Close() { switch e.EndpointState() { case StateBound, StateConnected: - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) + e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} @@ -514,9 +512,9 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, *tc return 0, tcpip.ErrBroadcastDisabled } - v, err := p.FullPayload() - if err != nil { - return 0, err + v := make([]byte, p.Len()) + if _, err := io.ReadFull(p, v); err != nil { + return 0, tcpip.ErrBadBuffer } if len(v) > header.UDPMaximumPacketSize { // Payload can't possibly fit in a packet. @@ -632,25 +630,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { e.rcvBufSizeMax = v e.mu.Unlock() return nil - case tcpip.SendBufferSizeOption: - // Make sure the send buffer size is within the min and max - // allowed. 
- var ss stack.SendBufferSizeOption - if err := e.stack.Option(&ss); err != nil { - panic(fmt.Sprintf("e.stack.Option(%#v) = %s", ss, err)) - } - - if v < ss.Min { - v = ss.Min - } - if v > ss.Max { - v = ss.Max - } - - e.mu.Lock() - e.sndBufSizeMax = v - e.mu.Unlock() - return nil } return nil @@ -811,12 +790,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.rcvMu.Unlock() return v, nil - case tcpip.SendBufferSizeOption: - e.mu.Lock() - v := e.sndBufSizeMax - e.mu.Unlock() - return v, nil - case tcpip.ReceiveBufferSizeOption: e.rcvMu.Lock() v := e.rcvBufSizeMax @@ -935,7 +908,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { LocalPort: e.ID.LocalPort, LocalAddress: e.ID.LocalAddress, } - id, btd, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id) + id, btd, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { return err } @@ -950,7 +923,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { e.setEndpointState(StateInitial) } - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) + e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) e.ID = id e.boundBindToDevice = btd e.route.Release() @@ -1023,7 +996,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { oldPortFlags := e.boundPortFlags - id, btd, err := e.registerWithStack(nicID, netProtos, id) + id, btd, err := e.registerWithStack(netProtos, id) if err != nil { r.Release() return err @@ -1031,11 +1004,16 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Remove the old registration. if e.ID.LocalPort != 0 { - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice) + e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice) } e.ID = id e.boundBindToDevice = btd + if e.route != nil { + // If the endpoint was already connected then make sure we release the + // previous route. 
+ e.route.Release() + } e.route = r e.dstPort = addr.Port e.RegisterNICID = nicID @@ -1093,7 +1071,7 @@ func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcp return nil, nil, tcpip.ErrNotSupported } -func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) { +func (e *endpoint) registerWithStack(netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) { bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if e.ID.LocalPort == 0 { port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, bindToDevice, tcpip.FullAddress{}, nil /* testPort */) @@ -1104,7 +1082,7 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ } e.boundPortFlags = e.portFlags - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.boundPortFlags, bindToDevice) + err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.boundPortFlags, bindToDevice) if err != nil { e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, bindToDevice, tcpip.FullAddress{}) e.boundPortFlags = ports.Flags{} @@ -1148,7 +1126,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { LocalPort: addr.Port, LocalAddress: addr.Addr, } - id, btd, err := e.registerWithStack(nicID, netProtos, id) + id, btd, err := e.registerWithStack(netProtos, id) if err != nil { return err } diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 13b72dc88..feb53b553 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -91,6 +91,7 @@ func (e *endpoint) Resume(s *stack.Stack) { defer e.mu.Unlock() e.stack = s + e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits) for m := range e.multicastMemberships { if err := e.stack.JoinGroup(e.NetProto, m.nicID, m.multicastAddr); err != nil { @@ -131,7 +132,7 @@ func (e *endpoint) Resume(s *stack.Stack) { // pass it to the reservation machinery. 
id := e.ID e.ID.LocalPort = 0 - e.ID, e.boundBindToDevice, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id) + e.ID, e.boundBindToDevice, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { panic(err) } diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go index 49e673d58..aae794506 100644 --- a/pkg/tcpip/transport/udp/forwarder.go +++ b/pkg/tcpip/transport/udp/forwarder.go @@ -77,7 +77,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, } ep := newEndpoint(r.stack, r.pkt.NetworkProtocolNumber, queue) - if err := r.stack.RegisterTransportEndpoint(r.pkt.NICID, []tcpip.NetworkProtocolNumber{r.pkt.NetworkProtocolNumber}, ProtocolNumber, r.id, ep, ep.portFlags, tcpip.NICID(ep.ops.GetBindToDevice())); err != nil { + if err := r.stack.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{r.pkt.NetworkProtocolNumber}, ProtocolNumber, r.id, ep, ep.portFlags, tcpip.NICID(ep.ops.GetBindToDevice())); err != nil { ep.Close() route.Release() return nil, err diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 4e2123fe9..c4794e876 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -966,8 +966,9 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) { h := flow.header4Tuple(outgoing) writeDstAddr := flow.mapAddrIfApplicable(h.dstAddr.Addr) - payload := buffer.View(newPayload()) - _, gotErr := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + var r bytes.Reader + r.Reset(newPayload()) + _, gotErr := c.ep.Write(&r, tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: writeDstAddr, Port: h.dstAddr.Port}, }) c.checkEndpointWriteStats(1, epstats, gotErr) @@ -1007,8 +1008,10 @@ func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View To: &tcpip.FullAddress{Addr: writeDstAddr, Port: h.dstAddr.Port}, } } - payload := buffer.View(newPayload()) - n, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts) + var r bytes.Reader + payload := newPayload() + r.Reset(payload) + n, err := c.ep.Write(&r, writeOpts) if err != nil { c.t.Fatalf("Write failed: %s", err) } @@ -1183,8 +1186,10 @@ func TestWriteOnConnectedInvalidPort(t *testing.T) { writeOpts := tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}, } - payload := buffer.View(newPayload()) - n, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts) + var r bytes.Reader + payload := newPayload() + r.Reset(payload) + n, err := c.ep.Write(&r, writeOpts) if err != nil { c.t.Fatalf("c.ep.Write(...) 
= %+s, want nil", err) } @@ -2497,7 +2502,8 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { } defer ep.Close() - data := tcpip.SlicePayload([]byte{1, 2, 3, 4}) + var r bytes.Reader + data := []byte{1, 2, 3, 4} to := tcpip.FullAddress{ Addr: test.remoteAddr, Port: 80, @@ -2508,19 +2514,22 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { expectedErrWithoutBcastOpt = nil } - if n, err := ep.Write(data, opts); err != expectedErrWithoutBcastOpt { + r.Reset(data) + if n, err := ep.Write(&r, opts); err != expectedErrWithoutBcastOpt { t.Fatalf("got ep.Write(_, %#v) = (%d, %s), want = (_, %s)", opts, n, err, expectedErrWithoutBcastOpt) } ep.SocketOptions().SetBroadcast(true) - if n, err := ep.Write(data, opts); err != nil { + r.Reset(data) + if n, err := ep.Write(&r, opts); err != nil { t.Fatalf("got ep.Write(_, %#v) = (%d, %s), want = (_, nil)", opts, n, err) } ep.SocketOptions().SetBroadcast(false) - if n, err := ep.Write(data, opts); err != expectedErrWithoutBcastOpt { + r.Reset(data) + if n, err := ep.Write(&r, opts); err != expectedErrWithoutBcastOpt { t.Fatalf("got ep.Write(_, %#v) = (%d, %s), want = (_, %s)", opts, n, err, expectedErrWithoutBcastOpt) } }) diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index 79db8895b..dc2571154 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -517,28 +517,29 @@ func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, er // Reader returns an io.Reader that reads from s. Reads beyond the end of s // return io.EOF. The preconditions that apply to s.CopyIn also apply to the // returned io.Reader.Read. -func (s IOSequence) Reader(ctx context.Context) io.Reader { - return &ioSequenceReadWriter{ctx, s} +func (s IOSequence) Reader(ctx context.Context) *IOSequenceReadWriter { + return &IOSequenceReadWriter{ctx, s} } // Writer returns an io.Writer that writes to s. Writes beyond the end of s // return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also // apply to the returned io.Writer.Write. -func (s IOSequence) Writer(ctx context.Context) io.Writer { - return &ioSequenceReadWriter{ctx, s} +func (s IOSequence) Writer(ctx context.Context) *IOSequenceReadWriter { + return &IOSequenceReadWriter{ctx, s} } // ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when // attempting to write beyond the end of the IOSequence. var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") -type ioSequenceReadWriter struct { +// IOSequenceReadWriter implements io.Reader and io.Writer for an IOSequence. +type IOSequenceReadWriter struct { ctx context.Context s IOSequence } // Read implements io.Reader.Read. -func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { +func (rw *IOSequenceReadWriter) Read(dst []byte) (int, error) { n, err := rw.s.CopyIn(rw.ctx, dst) rw.s = rw.s.DropFirst(n) if err == nil && rw.s.NumBytes() == 0 { @@ -547,8 +548,13 @@ func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { return n, err } +// Len implements tcpip.Payloader. +func (rw *IOSequenceReadWriter) Len() int { + return int(rw.s.NumBytes()) +} + // Write implements io.Writer.Write. 
-func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) { +func (rw *IOSequenceReadWriter) Write(src []byte) (int, error) { n, err := rw.s.CopyOut(rw.ctx, src) rw.s = rw.s.DropFirst(n) if err == nil && n < len(src) { diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index a92ae046d..3bbf86534 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -52,7 +52,7 @@ func waitForProcessList(cont *Container, want []*control.Process) error { cb := func() error { got, err := cont.Processes() if err != nil { - err = fmt.Errorf("error getting process data from container: %v", err) + err = fmt.Errorf("error getting process data from container: %w", err) return &backoff.PermanentError{Err: err} } if !procListsEqual(got, want) { @@ -64,11 +64,30 @@ func waitForProcessList(cont *Container, want []*control.Process) error { return testutil.Poll(cb, 30*time.Second) } +// waitForProcess waits for the given process to show up in the container. +func waitForProcess(cont *Container, want *control.Process) error { + cb := func() error { + gots, err := cont.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %w", err) + return &backoff.PermanentError{Err: err} + } + for _, got := range gots { + if procEqual(got, want) { + return nil + } + } + return fmt.Errorf("container got process list: %s, want: %+v", procListToString(gots), want) + } + // Gives plenty of time as tests can run slow under --race. + return testutil.Poll(cb, 30*time.Second) +} + func waitForProcessCount(cont *Container, want int) error { cb := func() error { pss, err := cont.Processes() if err != nil { - err = fmt.Errorf("error getting process data from container: %v", err) + err = fmt.Errorf("error getting process data from container: %w", err) return &backoff.PermanentError{Err: err} } if got := len(pss); got != want { @@ -101,28 +120,32 @@ func procListsEqual(gots, wants []*control.Process) bool { return false } for i := range gots { - got := gots[i] - want := wants[i] - - if want.UID != math.MaxUint32 && want.UID != got.UID { - return false - } - if want.PID != -1 && want.PID != got.PID { - return false - } - if want.PPID != -1 && want.PPID != got.PPID { - return false - } - if len(want.TTY) != 0 && want.TTY != got.TTY { - return false - } - if len(want.Cmd) != 0 && want.Cmd != got.Cmd { + if !procEqual(gots[i], wants[i]) { return false } } return true } +func procEqual(got, want *control.Process) bool { + if want.UID != math.MaxUint32 && want.UID != got.UID { + return false + } + if want.PID != -1 && want.PID != got.PID { + return false + } + if want.PPID != -1 && want.PPID != got.PPID { + return false + } + if len(want.TTY) != 0 && want.TTY != got.TTY { + return false + } + if len(want.Cmd) != 0 && want.Cmd != got.Cmd { + return false + } + return true +} + type processBuilder struct { process control.Process } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 044eec6fe..bc802e075 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1708,12 +1708,9 @@ func TestMultiContainerHomeEnvDir(t *testing.T) { t.Errorf("wait on child container: %v", err) } - // Wait for the root container to run. - expectedPL := []*control.Process{ - newProcessBuilder().Cmd("sh").Process(), - newProcessBuilder().Cmd("sleep").Process(), - } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + // Wait until after `env` has executed. 
+ expectedProc := newProcessBuilder().Cmd("sleep").Process() + if err := waitForProcess(containers[0], expectedProc); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -1831,7 +1828,7 @@ func TestDuplicateEnvVariable(t *testing.T) { cmd1 := fmt.Sprintf("env > %q; sleep 1000", files[0].Name()) cmd2 := fmt.Sprintf("env > %q", files[1].Name()) cmdExec := fmt.Sprintf("env > %q", files[2].Name()) - testSpecs, ids := createSpecs([]string{"/bin/bash", "-c", cmd1}, []string{"/bin/bash", "-c", cmd2}) + testSpecs, ids := createSpecs([]string{"/bin/sh", "-c", cmd1}, []string{"/bin/sh", "-c", cmd2}) testSpecs[0].Process.Env = append(testSpecs[0].Process.Env, "VAR=foo", "VAR=bar") testSpecs[1].Process.Env = append(testSpecs[1].Process.Env, "VAR=foo", "VAR=bar") @@ -1841,12 +1838,9 @@ func TestDuplicateEnvVariable(t *testing.T) { } defer cleanup() - // Wait for the `env` from the root container to finish. - expectedPL := []*control.Process{ - newProcessBuilder().Cmd("bash").Process(), - newProcessBuilder().Cmd("sleep").Process(), - } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + // Wait until after `env` has executed. + expectedProc := newProcessBuilder().Cmd("sleep").Process() + if err := waitForProcess(containers[0], expectedProc); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } if ws, err := containers[1].Wait(); err != nil { @@ -1856,8 +1850,8 @@ func TestDuplicateEnvVariable(t *testing.T) { } execArgs := &control.ExecArgs{ - Filename: "/bin/bash", - Argv: []string{"/bin/bash", "-c", cmdExec}, + Filename: "/bin/sh", + Argv: []string{"/bin/sh", "-c", cmdExec}, Envv: []string{"VAR=foo", "VAR=bar"}, } if ws, err := containers[0].executeSync(execArgs); err != nil || ws.ExitStatus() != 0 { diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index c56e1d4d0..3280b74fe 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -12,7 +12,6 @@ go_library( ], visibility = ["//runsc:__subpackages__"], deps = [ - "//pkg/abi/linux", "//pkg/cleanup", "//pkg/fd", "//pkg/log", diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index c3bba0973..cfa3796b1 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -31,7 +31,6 @@ import ( "strconv" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" @@ -49,13 +48,6 @@ const ( allowedOpenFlags = unix.O_TRUNC ) -var ( - // Remember the process uid/gid to skip chown calls when file owner/group - // doesn't need to be changed. - processUID = p9.UID(os.Getuid()) - processGID = p9.GID(os.Getgid()) -) - // join is equivalent to path.Join() but skips path.Clean() which is expensive. func join(parent, child string) string { if child == "." || child == ".." { @@ -374,7 +366,24 @@ func fstat(fd int) (unix.Stat_t, error) { } func fchown(fd int, uid p9.UID, gid p9.GID) error { - return unix.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) + return unix.Fchownat(fd, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) +} + +func setOwnerIfNeeded(fd int, uid p9.UID, gid p9.GID) (unix.Stat_t, error) { + stat, err := fstat(fd) + if err != nil { + return unix.Stat_t{}, err + } + + // Change ownership if not set accordinly. 
+ if uint32(uid) != stat.Uid || uint32(gid) != stat.Gid { + if err := fchown(fd, uid, gid); err != nil { + return unix.Stat_t{}, err + } + stat.Uid = uint32(uid) + stat.Gid = uint32(gid) + } + return stat, nil } // Open implements p9.File. @@ -457,12 +466,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, }) defer cu.Clean() - if uid != processUID || gid != processGID { - if err := fchown(child.FD(), uid, gid); err != nil { - return nil, nil, p9.QID{}, 0, extractErrno(err) - } - } - stat, err := fstat(child.FD()) + stat, err := setOwnerIfNeeded(child.FD(), uid, gid) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } @@ -505,12 +509,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) } defer f.Close() - if uid != processUID || gid != processGID { - if err := fchown(f.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) - } - } - stat, err := fstat(f.FD()) + stat, err := setOwnerIfNeeded(f.FD(), uid, gid) if err != nil { return p9.QID{}, extractErrno(err) } @@ -734,15 +733,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if valid.ATime || valid.MTime { utimes := [2]unix.Timespec{ - {Sec: 0, Nsec: linux.UTIME_OMIT}, - {Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: unix.UTIME_OMIT}, + {Sec: 0, Nsec: unix.UTIME_OMIT}, } if valid.ATime { if valid.ATimeNotSystemTime { utimes[0].Sec = int64(attr.ATimeSeconds) utimes[0].Nsec = int64(attr.ATimeNanoSeconds) } else { - utimes[0].Nsec = linux.UTIME_NOW + utimes[0].Nsec = unix.UTIME_NOW } } if valid.MTime { @@ -750,7 +749,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { utimes[1].Sec = int64(attr.MTimeSeconds) utimes[1].Nsec = int64(attr.MTimeNanoSeconds) } else { - utimes[1].Nsec = linux.UTIME_NOW + utimes[1].Nsec = unix.UTIME_NOW } } @@ -764,7 +763,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } defer unix.Close(parent) - if tErr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); tErr != nil { + if tErr := utimensat(parent, path.Base(l.hostPath), utimes, unix.AT_SYMLINK_NOFOLLOW); tErr != nil { log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, tErr) err = extractErrno(tErr) } @@ -779,15 +778,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } if valid.UID || valid.GID { - uid := -1 + uid := p9.NoUID if valid.UID { - uid = int(attr.UID) + uid = attr.UID } - gid := -1 + gid := p9.NoGID if valid.GID { - gid = int(attr.GID) + gid = attr.GID } - if oErr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oErr != nil { + if oErr := fchown(f.FD(), uid, gid); oErr != nil { log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oErr) err = extractErrno(oErr) } @@ -900,12 +899,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. 
} defer f.Close() - if uid != processUID || gid != processGID { - if err := fchown(f.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) - } - } - stat, err := fstat(f.FD()) + stat, err := setOwnerIfNeeded(f.FD(), uid, gid) if err != nil { return p9.QID{}, extractErrno(err) } @@ -921,7 +915,7 @@ func (l *localFile) Link(target p9.File, newName string) error { } targetFile := target.(*localFile) - if err := unix.Linkat(targetFile.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH); err != nil { + if err := unix.Linkat(targetFile.file.FD(), "", l.file.FD(), newName, unix.AT_EMPTY_PATH); err != nil { return extractErrno(err) } return nil @@ -959,12 +953,7 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid } defer child.Close() - if uid != processUID || gid != processGID { - if err := fchown(child.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) - } - } - stat, err := fstat(child.FD()) + stat, err := setOwnerIfNeeded(child.FD(), uid, gid) if err != nil { return p9.QID{}, extractErrno(err) } @@ -1113,7 +1102,8 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { // mappings, the app path may have fit in the sockaddr, but we can't // fit f.path in our sockaddr. We'd need to redirect through a shorter // path in order to actually connect to this socket. - if len(l.hostPath) > linux.UnixPathMax { + const UNIX_PATH_MAX = 108 // defined in afunix.h + if len(l.hostPath) > UNIX_PATH_MAX { return nil, unix.ECONNREFUSED } diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go index c46958185..29ebf8500 100644 --- a/runsc/fsgofer/fsgofer_amd64_unsafe.go +++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go @@ -20,7 +20,6 @@ import ( "unsafe" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" ) @@ -39,7 +38,7 @@ func statAt(dirFd int, name string) (unix.Stat_t, error) { uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), - linux.AT_SYMLINK_NOFOLLOW, + unix.AT_SYMLINK_NOFOLLOW, 0, 0); errno != 0 { diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go index 491460718..9fd5d0871 100644 --- a/runsc/fsgofer/fsgofer_arm64_unsafe.go +++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go @@ -20,7 +20,6 @@ import ( "unsafe" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" ) @@ -39,7 +38,7 @@ func statAt(dirFd int, name string) (unix.Stat_t, error) { uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), - linux.AT_SYMLINK_NOFOLLOW, + unix.AT_SYMLINK_NOFOLLOW, 0, 0); errno != 0 { diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index c5daebe5e..99ea9bd32 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -32,6 +32,9 @@ import ( "gvisor.dev/gvisor/runsc/specutils" ) +// Nodoby is the standard UID/GID for the nobody user/group. 
+const nobody = 65534 + var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} var ( @@ -281,6 +284,92 @@ func TestCreate(t *testing.T) { }) } +func checkIDs(f p9.File, uid, gid int) error { + _, _, stat, err := f.GetAttr(p9.AttrMask{UID: true, GID: true}) + if err != nil { + return fmt.Errorf("GetAttr() failed, err: %v", err) + } + if want := p9.UID(uid); stat.UID != want { + return fmt.Errorf("Wrong UID, want: %v, got: %v", want, stat.UID) + } + if want := p9.GID(gid); stat.GID != want { + return fmt.Errorf("Wrong GID, want: %v, got: %v", want, stat.GID) + } + return nil +} + +// TestCreateSetGID checks files/dirs/symlinks are created with the proper +// owner when the parent directory has setgid set, +func TestCreateSetGID(t *testing.T) { + if !specutils.HasCapabilities(capability.CAP_CHOWN) { + t.Skipf("Test requires CAP_CHOWN") + } + + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + // Change group and set setgid to the parent dir. + if err := unix.Chown(s.file.hostPath, os.Getuid(), nobody); err != nil { + t.Fatalf("Chown() failed: %v", err) + } + if err := unix.Chmod(s.file.hostPath, 02777); err != nil { + t.Fatalf("Chmod() failed: %v", err) + } + + t.Run("create", func(t *testing.T) { + _, l, _, _, err := s.file.Create("test", p9.ReadOnly, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("WriteAt() failed: %v", err) + } + defer l.Close() + if err := checkIDs(l, os.Getuid(), os.Getgid()); err != nil { + t.Error(err) + } + }) + + t.Run("mkdir", func(t *testing.T) { + _, err := s.file.Mkdir("test-dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("WriteAt() failed: %v", err) + } + _, l, err := s.file.Walk([]string{"test-dir"}) + if err != nil { + t.Fatalf("Walk() failed: %v", err) + } + defer l.Close() + if err := checkIDs(l, os.Getuid(), os.Getgid()); err != nil { + t.Error(err) + } + }) + + t.Run("symlink", func(t *testing.T) { + if _, err := s.file.Symlink("/some/target", "symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("Symlink() failed: %v", err) + } + _, l, err := s.file.Walk([]string{"symlink"}) + if err != nil { + t.Fatalf("Walk() failed, err: %v", err) + } + defer l.Close() + if err := checkIDs(l, os.Getuid(), os.Getgid()); err != nil { + t.Error(err) + } + }) + + t.Run("mknod", func(t *testing.T) { + if _, err := s.file.Mknod("nod", p9.ModeRegular|0777, 0, 0, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("Mknod() failed: %v", err) + } + _, l, err := s.file.Walk([]string{"nod"}) + if err != nil { + t.Fatalf("Walk() failed, err: %v", err) + } + defer l.Close() + if err := checkIDs(l, os.Getuid(), os.Getgid()); err != nil { + t.Error(err) + } + }) + }) +} + // TestReadWriteDup tests that a file opened in any mode can be dup'ed and // reopened in any other mode. 
func TestReadWriteDup(t *testing.T) { @@ -458,7 +547,7 @@ func TestSetAttrTime(t *testing.T) { } func TestSetAttrOwner(t *testing.T) { - if os.Getuid() != 0 { + if !specutils.HasCapabilities(capability.CAP_CHOWN) { t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid()) } @@ -477,7 +566,7 @@ func TestSetAttrOwner(t *testing.T) { } func TestLink(t *testing.T) { - if os.Getuid() != 0 { + if !specutils.HasCapabilities(capability.CAP_DAC_READ_SEARCH) { t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid()) } runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { @@ -995,7 +1084,6 @@ func BenchmarkCreateDiffOwner(b *testing.B) { files := make([]p9.File, 0, 500) fds := make([]*fd.FD, 0, 500) gid := p9.GID(os.Getgid()) - const nobody = 65534 b.ResetTimer() for i := 0; i < b.N; i++ { diff --git a/runsc/mitigate/BUILD b/runsc/mitigate/BUILD new file mode 100644 index 000000000..3b0342d18 --- /dev/null +++ b/runsc/mitigate/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "mitigate", + srcs = [ + "cpu.go", + "mitigate.go", + ], + deps = ["@in_gopkg_yaml_v2//:go_default_library"], +) + +go_test( + name = "mitigate_test", + size = "small", + srcs = ["cpu_test.go"], + library = ":mitigate", + deps = ["@com_github_google_go_cmp//cmp:go_default_library"], +) diff --git a/runsc/mitigate/cpu.go b/runsc/mitigate/cpu.go new file mode 100644 index 000000000..113b98159 --- /dev/null +++ b/runsc/mitigate/cpu.go @@ -0,0 +1,235 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mitigate + +import ( + "fmt" + "regexp" + "strconv" + "strings" +) + +const ( + // constants of coomm + meltdown = "cpu_meltdown" + l1tf = "l1tf" + mds = "mds" + swapgs = "swapgs" + taa = "taa" +) + +const ( + processorKey = "processor" + vendorIDKey = "vendor_id" + cpuFamilyKey = "cpu family" + modelKey = "model" + coreIDKey = "core id" + bugsKey = "bugs" +) + +// getCPUSet returns cpu structs from reading /proc/cpuinfo. +func getCPUSet(data string) ([]*cpu, error) { + // Each processor entry should start with the + // processor key. Find the beginings of each. + r := buildRegex(processorKey, `\d+`) + indices := r.FindAllStringIndex(data, -1) + if len(indices) < 1 { + return nil, fmt.Errorf("no cpus found for: %s", data) + } + + // Add the ending index for last entry. + indices = append(indices, []int{len(data), -1}) + + // Valid cpus are now defined by strings in between + // indexes (e.g. data[index[i], index[i+1]]). + // There should be len(indicies) - 1 CPUs + // since the last index is the end of the string. + var cpus = make([]*cpu, 0, len(indices)-1) + // Find each string that represents a CPU. These begin "processor". + for i := 1; i < len(indices); i++ { + start := indices[i-1][0] + end := indices[i][0] + // Parse the CPU entry, which should be between start/end. 
+ c, err := getCPU(data[start:end]) + if err != nil { + return nil, err + } + cpus = append(cpus, c) + } + return cpus, nil +} + +// type cpu represents pertinent info about a cpu. +type cpu struct { + processorNumber int64 // the processor number of this CPU. + vendorID string // the vendorID of CPU (e.g. AuthenticAMD). + cpuFamily int64 // CPU family number (e.g. 6 for CascadeLake/Skylake). + model int64 // CPU model number (e.g. 85 for CascadeLake/Skylake). + coreID int64 // This CPU's core id to match Hyperthread Pairs + bugs map[string]struct{} // map of vulnerabilities parsed from the 'bugs' field. +} + +// getCPU parses a CPU from a single cpu entry from /proc/cpuinfo. +func getCPU(data string) (*cpu, error) { + processor, err := parseProcessor(data) + if err != nil { + return nil, err + } + + vendorID, err := parseVendorID(data) + if err != nil { + return nil, err + } + + cpuFamily, err := parseCPUFamily(data) + if err != nil { + return nil, err + } + + model, err := parseModel(data) + if err != nil { + return nil, err + } + + coreID, err := parseCoreID(data) + if err != nil { + return nil, err + } + + bugs, err := parseBugs(data) + if err != nil { + return nil, err + } + + return &cpu{ + processorNumber: processor, + vendorID: vendorID, + cpuFamily: cpuFamily, + model: model, + coreID: coreID, + bugs: bugs, + }, nil +} + +// List of pertinent side channel vulnerablilites. +// For mds, see: https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html. +var vulnerabilities = []string{ + meltdown, + l1tf, + mds, + swapgs, + taa, +} + +// isVulnerable checks if a CPU is vulnerable to pertinent bugs. +func (c *cpu) isVulnerable() bool { + for _, bug := range vulnerabilities { + if _, ok := c.bugs[bug]; ok { + return true + } + } + return false +} + +// similarTo checks family/model/bugs fields for equality of two +// processors. +func (c *cpu) similarTo(other *cpu) bool { + if c.vendorID != other.vendorID { + return false + } + + if other.cpuFamily != c.cpuFamily { + return false + } + + if other.model != c.model { + return false + } + + if len(other.bugs) != len(c.bugs) { + return false + } + + for bug := range c.bugs { + if _, ok := other.bugs[bug]; !ok { + return false + } + } + return true +} + +// parseProcessor grabs the processor field from /proc/cpuinfo output. +func parseProcessor(data string) (int64, error) { + return parseIntegerResult(data, processorKey) +} + +// parseVendorID grabs the vendor_id field from /proc/cpuinfo output. +func parseVendorID(data string) (string, error) { + return parseRegex(data, vendorIDKey, `[\w\d]+`) +} + +// parseCPUFamily grabs the cpu family field from /proc/cpuinfo output. +func parseCPUFamily(data string) (int64, error) { + return parseIntegerResult(data, cpuFamilyKey) +} + +// parseModel grabs the model field from /proc/cpuinfo output. +func parseModel(data string) (int64, error) { + return parseIntegerResult(data, modelKey) +} + +// parseCoreID parses the core id field. +func parseCoreID(data string) (int64, error) { + return parseIntegerResult(data, coreIDKey) +} + +// parseBugs grabs the bugs field from /proc/cpuinfo output. +func parseBugs(data string) (map[string]struct{}, error) { + result, err := parseRegex(data, bugsKey, `[\d\w\s]*`) + if err != nil { + return nil, err + } + bugs := strings.Split(result, " ") + ret := make(map[string]struct{}, len(bugs)) + for _, bug := range bugs { + ret[bug] = struct{}{} + } + return ret, nil +} + +// parseIntegerResult parses fields expecting an integer. 
+func parseIntegerResult(data, key string) (int64, error) { + result, err := parseRegex(data, key, `\d+`) + if err != nil { + return 0, err + } + return strconv.ParseInt(result, 0, 64) +} + +// buildRegex builds a regex for parsing each CPU field. +func buildRegex(key, match string) *regexp.Regexp { + reg := fmt.Sprintf(`(?m)^%s\s*:\s*(.*)$`, key) + return regexp.MustCompile(reg) +} + +// parseRegex parses data with key inserted into a standard regex template. +func parseRegex(data, key, match string) (string, error) { + r := buildRegex(key, match) + matches := r.FindStringSubmatch(data) + if len(matches) < 2 { + return "", fmt.Errorf("failed to match key %s: %s", key, data) + } + return matches[1], nil +} diff --git a/runsc/mitigate/cpu_test.go b/runsc/mitigate/cpu_test.go new file mode 100644 index 000000000..77b714a02 --- /dev/null +++ b/runsc/mitigate/cpu_test.go @@ -0,0 +1,368 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mitigate + +import ( + "io/ioutil" + "strings" + "testing" +) + +// CPU info for a Intel CascadeLake processor. Both Skylake and CascadeLake have +// the same family/model numbers, but with different bugs (e.g. skylake has +// cpu_meltdown). +var cascadeLake = &cpu{ + vendorID: "GenuineIntel", + cpuFamily: 6, + model: 85, + bugs: map[string]struct{}{ + "spectre_v1": struct{}{}, + "spectre_v2": struct{}{}, + "spec_store_bypass": struct{}{}, + mds: struct{}{}, + swapgs: struct{}{}, + taa: struct{}{}, + }, +} + +// TestGetCPU tests basic parsing of single CPU strings from reading +// /proc/cpuinfo. +func TestGetCPU(t *testing.T) { + data := `processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 85 +core id : 0 +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa itlb_multihit +` + want := cpu{ + processorNumber: 0, + vendorID: "GenuineIntel", + cpuFamily: 6, + model: 85, + coreID: 0, + bugs: map[string]struct{}{ + "cpu_meltdown": struct{}{}, + "spectre_v1": struct{}{}, + "spectre_v2": struct{}{}, + "spec_store_bypass": struct{}{}, + "l1tf": struct{}{}, + "mds": struct{}{}, + "swapgs": struct{}{}, + "taa": struct{}{}, + "itlb_multihit": struct{}{}, + }, + } + + got, err := getCPU(data) + if err != nil { + t.Fatalf("getCpu failed with error: %v", err) + } + + if !want.similarTo(got) { + t.Fatalf("Failed cpus not similar: got: %+v, want: %+v", got, want) + } + + if !got.isVulnerable() { + t.Fatalf("Failed: cpu should be vulnerable.") + } +} + +func TestInvalid(t *testing.T) { + result, err := getCPUSet(`something not a processor`) + if err == nil { + t.Fatalf("getCPU set didn't return an error: %+v", result) + } + + if !strings.Contains(err.Error(), "no cpus") { + t.Fatalf("Incorrect error returned: %v", err) + } +} + +// TestCPUSet tests getting the right number of CPUs from +// parsing full output of /proc/cpuinfo. 
+func TestCPUSet(t *testing.T) { + data := `processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 63 +model name : Intel(R) Xeon(R) CPU @ 2.30GHz +stepping : 0 +microcode : 0x1 +cpu MHz : 2299.998 +cache size : 46080 KB +physical id : 0 +siblings : 2 +core id : 0 +cpu cores : 1 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 4599.99 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management: + +processor : 1 +vendor_id : GenuineIntel +cpu family : 6 +model : 63 +model name : Intel(R) Xeon(R) CPU @ 2.30GHz +stepping : 0 +microcode : 0x1 +cpu MHz : 2299.998 +cache size : 46080 KB +physical id : 0 +siblings : 2 +core id : 0 +cpu cores : 1 +apicid : 1 +initial apicid : 1 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 4599.99 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management: +` + cpuSet, err := getCPUSet(data) + if err != nil { + t.Fatalf("getCPUSet failed: %v", err) + } + + wantCPULen := 2 + if len(cpuSet) != wantCPULen { + t.Fatalf("Num CPU mismatch: want: %d, got: %d", wantCPULen, len(cpuSet)) + } + + wantCPU := cpu{ + vendorID: "GenuineIntel", + cpuFamily: 6, + model: 63, + bugs: map[string]struct{}{ + "cpu_meltdown": struct{}{}, + "spectre_v1": struct{}{}, + "spectre_v2": struct{}{}, + "spec_store_bypass": struct{}{}, + "l1tf": struct{}{}, + "mds": struct{}{}, + "swapgs": struct{}{}, + }, + } + + for _, c := range cpuSet { + if !wantCPU.similarTo(c) { + t.Fatalf("Failed cpus not equal: got: %+v, want: %+v", c, wantCPU) + } + } +} + +// TestReadFile is a smoke test for parsing methods. +func TestReadFile(t *testing.T) { + data, err := ioutil.ReadFile("/proc/cpuinfo") + if err != nil { + t.Fatalf("Failed to read cpuinfo: %v", err) + } + + set, err := getCPUSet(string(data)) + if err != nil { + t.Fatalf("Failed to parse CPU data %v\n%s", err, data) + } + + if len(set) < 1 { + t.Fatalf("Failed to parse any CPUs: %d", len(set)) + } + + for _, c := range set { + t.Logf("CPU: %+v: %t", c, c.isVulnerable()) + } +} + +// TestVulnerable tests if the isVulnerable method is correct +// among known CPUs in GCP. 
+func TestVulnerable(t *testing.T) { + const haswell = `processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 63 +model name : Intel(R) Xeon(R) CPU @ 2.30GHz +stepping : 0 +microcode : 0x1 +cpu MHz : 2299.998 +cache size : 46080 KB +physical id : 0 +siblings : 4 +core id : 0 +cpu cores : 2 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 4599.99 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management:` + + const skylake = `processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 85 +model name : Intel(R) Xeon(R) CPU @ 2.00GHz +stepping : 3 +microcode : 0x1 +cpu MHz : 2000.180 +cache size : 39424 KB +physical id : 0 +siblings : 2 +core id : 0 +cpu cores : 1 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa +bogomips : 4000.36 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management:` + + const cascade = `processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 85 +model name : Intel(R) Xeon(R) CPU +stepping : 7 +microcode : 0x1 +cpu MHz : 2800.198 +cache size : 33792 KB +physical id : 0 +siblings : 2 +core id : 0 +cpu cores : 1 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 + ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmu +lqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowpr +efetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid r +tm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves a +rat avx512_vnni md_clear arch_capabilities +bugs : spectre_v1 spectre_v2 spec_store_bypass mds swapgs taa +bogomips : 5600.39 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management:` + + const amd = `processor : 0 +vendor_id : AuthenticAMD +cpu family : 23 +model : 49 +model name : AMD EPYC 7B12 +stepping : 0 +microcode : 0x1000065 +cpu MHz : 
2250.000 +cache size : 512 KB +physical id : 0 +siblings : 2 +core id : 0 +cpu cores : 1 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibrs ibpb stibp vmmcall fsgsbase tsc_adjust bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr arat npt nrip_save umip rdpid +bugs : sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass +bogomips : 4500.00 +TLB size : 3072 4K pages +clflush size : 64 +cache_alignment : 64 +address sizes : 48 bits physical, 48 bits virtual +power management:` + + for _, tc := range []struct { + name string + cpuString string + vulnerable bool + }{ + { + name: "haswell", + cpuString: haswell, + vulnerable: true, + }, { + name: "skylake", + cpuString: skylake, + vulnerable: true, + }, { + name: "cascadeLake", + cpuString: cascade, + vulnerable: false, + }, { + name: "amd", + cpuString: amd, + vulnerable: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + set, err := getCPUSet(tc.cpuString) + if err != nil { + t.Fatalf("Failed to getCPUSet:%v\n %s", err, tc.cpuString) + } + + if len(set) < 1 { + t.Fatalf("Returned empty cpu set: %v", set) + } + + for _, c := range set { + got := func() bool { + if cascadeLake.similarTo(c) { + return false + } + return c.isVulnerable() + }() + + if got != tc.vulnerable { + t.Fatalf("Mismatch vulnerable for cpu %+s: got %t want: %t", tc.name, tc.vulnerable, got) + } + } + }) + } +} diff --git a/runsc/mitigate/mitigate.go b/runsc/mitigate/mitigate.go new file mode 100644 index 000000000..51d5449b6 --- /dev/null +++ b/runsc/mitigate/mitigate.go @@ -0,0 +1,20 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mitigate provides libraries for the mitigate command. The +// mitigate command mitigates side channel attacks such as MDS. Mitigate +// shuts down CPUs via /sys/devices/system/cpu/cpu{N}/online. In addition, +// the mitigate also handles computing available CPU in kubernetes kube_config +// files. 
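The package comment above describes shutting down CPUs through `/sys/devices/system/cpu/cpu{N}/online`. For reference, a hedged sketch of what such a sysfs write looks like; this requires root and is not the mitigate package's implementation:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"os"
)

// setCPUOnline writes "1" or "0" to the sysfs control file for logical CPU n.
// Illustrative only: it needs root, and cpu0 typically has no online file.
func setCPUOnline(n int, online bool) error {
	v := []byte("0")
	if online {
		v = []byte("1")
	}
	path := fmt.Sprintf("/sys/devices/system/cpu/cpu%d/online", n)
	return ioutil.WriteFile(path, v, 0644)
}

func main() {
	// Take logical CPU 3 offline, e.g. a vulnerable hyperthread sibling.
	if err := setCPUOnline(3, false); err != nil {
		fmt.Fprintln(os.Stderr, "offline cpu3:", err)
	}
}
```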
+package mitigate diff --git a/test/fsstress/fsstress_test.go b/test/fsstress/fsstress_test.go index 63f692ca9..300c21ceb 100644 --- a/test/fsstress/fsstress_test.go +++ b/test/fsstress/fsstress_test.go @@ -41,7 +41,7 @@ func fsstress(t *testing.T, dir string) { image = "basic/fsstress" ) seed := strconv.FormatUint(uint64(rand.Uint32()), 10) - args := []string{"-d", dir, "-n", operations, "-p", processes, "-seed", seed, "-X"} + args := []string{"-d", dir, "-n", operations, "-p", processes, "-s", seed, "-X"} t.Logf("Repro: docker run --rm --runtime=runsc %s %s", image, strings.Join(args, "")) out, err := d.Run(ctx, dockerutil.RunOpts{Image: image}, args...) if err != nil { diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go index 37a1a6694..c47660026 100644 --- a/test/iptables/filter_input.go +++ b/test/iptables/filter_input.go @@ -51,6 +51,12 @@ func init() { RegisterTestCase(FilterInputInvertDestination{}) RegisterTestCase(FilterInputSource{}) RegisterTestCase(FilterInputInvertSource{}) + RegisterTestCase(FilterInputInterfaceAccept{}) + RegisterTestCase(FilterInputInterfaceDrop{}) + RegisterTestCase(FilterInputInterface{}) + RegisterTestCase(FilterInputInterfaceBeginsWith{}) + RegisterTestCase(FilterInputInterfaceInvertDrop{}) + RegisterTestCase(FilterInputInterfaceInvertAccept{}) } // FilterInputDropUDP tests that we can drop UDP traffic. @@ -744,3 +750,195 @@ func (FilterInputInvertSource) ContainerAction(ctx context.Context, ip net.IP, i func (FilterInputInvertSource) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { return sendUDPLoop(ctx, ip, acceptPort) } + +// FilterInputInterfaceAccept tests that packets are accepted from interface +// matching the iptables rule. +type FilterInputInterfaceAccept struct{ localCase } + +var _ TestCase = FilterInputInterfaceAccept{} + +// Name implements TestCase.Name. +func (FilterInputInterfaceAccept) Name() string { + return "FilterInputInterfaceAccept" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputInterfaceAccept) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { + ifname, ok := getInterfaceName() + if !ok { + return fmt.Errorf("no interface is present, except loopback") + } + if err := filterTable(ipv6, "-A", "INPUT", "-p", "udp", "-i", ifname, "-j", "ACCEPT"); err != nil { + return err + } + if err := listenUDP(ctx, acceptPort); err != nil { + return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %w", acceptPort, err) + } + + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputInterfaceAccept) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { + return sendUDPLoop(ctx, ip, acceptPort) +} + +// FilterInputInterfaceDrop tests that packets are dropped from interface +// matching the iptables rule. +type FilterInputInterfaceDrop struct{ localCase } + +var _ TestCase = FilterInputInterfaceDrop{} + +// Name implements TestCase.Name. +func (FilterInputInterfaceDrop) Name() string { + return "FilterInputInterfaceDrop" +} + +// ContainerAction implements TestCase.ContainerAction. 
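`FilterInputInterfaceAccept` above relies on `getInterfaceName` (defined elsewhere in this package) to find a non-loopback interface. A hedged, standalone sketch of how such a lookup can be done with the standard library; this stand-in is not the package's actual helper:

```go
package main

import (
	"fmt"
	"net"
)

// firstNonLoopback returns the name of the first interface that is up and is
// not the loopback device. Illustrative stand-in only.
func firstNonLoopback() (string, bool) {
	ifaces, err := net.Interfaces()
	if err != nil {
		return "", false
	}
	for _, iface := range ifaces {
		if iface.Flags&net.FlagLoopback == 0 && iface.Flags&net.FlagUp != 0 {
			return iface.Name, true
		}
	}
	return "", false
}

func main() {
	if name, ok := firstNonLoopback(); ok {
		fmt.Println("using interface:", name)
	} else {
		fmt.Println("no interface is present, except loopback")
	}
}
```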
+func (FilterInputInterfaceDrop) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { + ifname, ok := getInterfaceName() + if !ok { + return fmt.Errorf("no interface is present, except loopback") + } + if err := filterTable(ipv6, "-A", "INPUT", "-p", "udp", "-i", ifname, "-j", "DROP"); err != nil { + return err + } + timedCtx, cancel := context.WithTimeout(ctx, NegativeTimeout) + defer cancel() + if err := listenUDP(timedCtx, acceptPort); err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return nil + } + return fmt.Errorf("error reading: %w", err) + } + return fmt.Errorf("packets should have been dropped, but got a packet") +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputInterfaceDrop) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { + return sendUDPLoop(ctx, ip, acceptPort) +} + +// FilterInputInterface tests that packets are not dropped from interface which +// is not matching the interface name in the iptables rule. +type FilterInputInterface struct{ localCase } + +var _ TestCase = FilterInputInterface{} + +// Name implements TestCase.Name. +func (FilterInputInterface) Name() string { + return "FilterInputInterface" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputInterface) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { + if err := filterTable(ipv6, "-A", "INPUT", "-p", "udp", "-i", "lo", "-j", "DROP"); err != nil { + return err + } + if err := listenUDP(ctx, acceptPort); err != nil { + return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %w", acceptPort, err) + } + return nil +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputInterface) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { + return sendUDPLoop(ctx, ip, acceptPort) +} + +// FilterInputInterfaceBeginsWith tests that packets are dropped from an +// interface which begins with the given interface name. +type FilterInputInterfaceBeginsWith struct{ localCase } + +var _ TestCase = FilterInputInterfaceBeginsWith{} + +// Name implements TestCase.Name. +func (FilterInputInterfaceBeginsWith) Name() string { + return "FilterInputInterfaceBeginsWith" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputInterfaceBeginsWith) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { + if err := filterTable(ipv6, "-A", "INPUT", "-p", "udp", "-i", "e+", "-j", "DROP"); err != nil { + return err + } + timedCtx, cancel := context.WithTimeout(ctx, NegativeTimeout) + defer cancel() + if err := listenUDP(timedCtx, acceptPort); err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return nil + } + return fmt.Errorf("error reading: %w", err) + } + return fmt.Errorf("packets should have been dropped, but got a packet") +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputInterfaceBeginsWith) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { + return sendUDPLoop(ctx, ip, acceptPort) +} + +// FilterInputInterfaceInvertDrop tests that we selectively drop packets from +// interface not matching the interface name. +type FilterInputInterfaceInvertDrop struct{ baseCase } + +var _ TestCase = FilterInputInterfaceInvertDrop{} + +// Name implements TestCase.Name. +func (FilterInputInterfaceInvertDrop) Name() string { + return "FilterInputInterfaceInvertDrop" +} + +// ContainerAction implements TestCase.ContainerAction. 
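The drop tests above treat a read that hits the `NegativeTimeout` deadline as success: no packet arriving means the rule worked. A minimal standalone sketch of that negative-test pattern, with illustrative addresses and timeouts:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"net"
	"time"
)

// expectNoUDP succeeds (returns nil) only if no datagram arrives on addr
// before the context deadline, mirroring the negative-timeout pattern above.
func expectNoUDP(ctx context.Context, addr string) error {
	var lc net.ListenConfig
	conn, err := lc.ListenPacket(ctx, "udp", addr)
	if err != nil {
		return err
	}
	defer conn.Close()

	if deadline, ok := ctx.Deadline(); ok {
		conn.SetReadDeadline(deadline)
	}

	buf := make([]byte, 2048)
	if _, _, err := conn.ReadFrom(buf); err != nil {
		var nerr net.Error
		if errors.As(err, &nerr) && nerr.Timeout() {
			return nil // Deadline hit with no traffic: the drop rule worked.
		}
		return fmt.Errorf("error reading: %w", err)
	}
	return errors.New("packets should have been dropped, but got a packet")
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	fmt.Println(expectNoUDP(ctx, "127.0.0.1:40000"))
}
```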
+func (FilterInputInterfaceInvertDrop) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { + if err := filterTable(ipv6, "-A", "INPUT", "-p", "tcp", "!", "-i", "lo", "-j", "DROP"); err != nil { + return err + } + timedCtx, cancel := context.WithTimeout(ctx, NegativeTimeout) + defer cancel() + if err := listenTCP(timedCtx, acceptPort); err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return nil + } + return fmt.Errorf("error reading: %w", err) + } + return fmt.Errorf("connection on port %d should not be accepted, but was accepted", acceptPort) +} + +// LocalAction implements TestCase.LocalAction. +func (FilterInputInterfaceInvertDrop) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { + timedCtx, cancel := context.WithTimeout(ctx, NegativeTimeout) + defer cancel() + if err := connectTCP(timedCtx, ip, acceptPort); err != nil { + var operr *net.OpError + if errors.As(err, &operr) && operr.Timeout() { + return nil + } + return fmt.Errorf("error connecting: %w", err) + } + return fmt.Errorf("connection destined to port %d should not be accepted, but was accepted", acceptPort) +} + +// FilterInputInterfaceInvertAccept tests that we can selectively accept packets +// not matching the specific incoming interface. +type FilterInputInterfaceInvertAccept struct{ baseCase } + +var _ TestCase = FilterInputInterfaceInvertAccept{} + +// Name implements TestCase.Name. +func (FilterInputInterfaceInvertAccept) Name() string { + return "FilterInputInterfaceInvertAccept" +} + +// ContainerAction implements TestCase.ContainerAction. +func (FilterInputInterfaceInvertAccept) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { + if err := filterTable(ipv6, "-A", "INPUT", "-p", "tcp", "!", "-i", "lo", "-j", "ACCEPT"); err != nil { + return err + } + return listenTCP(ctx, acceptPort) +} + +// LocalAction implements TestCase.LocalAction. 
+func (FilterInputInterfaceInvertAccept) LocalAction(ctx context.Context, ip net.IP, ipv6 bool) error { + return connectTCP(ctx, ip, acceptPort) +} diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index 9a4f60a9a..ef92e3fff 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -392,6 +392,30 @@ func TestInputInvertSource(t *testing.T) { singleTest(t, FilterInputInvertSource{}) } +func TestInputInterfaceAccept(t *testing.T) { + singleTest(t, FilterInputInterfaceAccept{}) +} + +func TestInputInterfaceDrop(t *testing.T) { + singleTest(t, FilterInputInterfaceDrop{}) +} + +func TestInputInterface(t *testing.T) { + singleTest(t, FilterInputInterface{}) +} + +func TestInputInterfaceBeginsWith(t *testing.T) { + singleTest(t, FilterInputInterfaceBeginsWith{}) +} + +func TestInputInterfaceInvertDrop(t *testing.T) { + singleTest(t, FilterInputInterfaceInvertDrop{}) +} + +func TestInputInterfaceInvertAccept(t *testing.T) { + singleTest(t, FilterInputInterfaceInvertAccept{}) +} + func TestFilterAddrs(t *testing.T) { tcs := []struct { ipv6 bool diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go index a6ec5cca3..4cd770a65 100644 --- a/test/iptables/iptables_util.go +++ b/test/iptables/iptables_util.go @@ -171,7 +171,7 @@ func connectTCP(ctx context.Context, ip net.IP, port int) error { return err } if err := testutil.PollContext(ctx, callback); err != nil { - return fmt.Errorf("timed out waiting to connect IP on port %v, most recent error: %v", port, err) + return fmt.Errorf("timed out waiting to connect IP on port %v, most recent error: %w", port, err) } return nil diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index c6c95546a..5c3c569de 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -277,6 +277,10 @@ ALL_TESTS = [ PacketimpactTestInfo( name = "tcp_rcv_buf_space", ), + PacketimpactTestInfo( + name = "tcp_rack", + expect_netstack_failure = True, + ), ] def validate_all_tests(): diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index b1b3c578b..6c6f2bdf7 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -376,6 +376,20 @@ packetimpact_testbench( ], ) +packetimpact_testbench( + name = "tcp_rack", + srcs = ["tcp_rack_test.go"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/tcpip/header", + "//pkg/tcpip/seqnum", + "//pkg/usermem", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + validate_all_tests() [packetimpact_go_test( diff --git a/test/packetimpact/tests/tcp_rack_test.go b/test/packetimpact/tests/tcp_rack_test.go new file mode 100644 index 000000000..0a2381c97 --- /dev/null +++ b/test/packetimpact/tests/tcp_rack_test.go @@ -0,0 +1,221 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
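The `connectTCP` change above wraps the most recent error with `%w` instead of `%v`, which is what lets callers such as `FilterInputInterfaceInvertDrop.LocalAction` recover the underlying `*net.OpError` with `errors.As`. A minimal illustration of the difference:

```go
package main

import (
	"errors"
	"fmt"
	"net"
)

func main() {
	base := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("i/o timeout")}

	// %w keeps base in the error chain, so errors.As can find it.
	wrapped := fmt.Errorf("timed out waiting to connect IP on port %v, most recent error: %w", 80, base)
	var operr *net.OpError
	fmt.Println(errors.As(wrapped, &operr)) // true

	// %v flattens the error to a string; the chain (and the *net.OpError) is lost.
	flattened := fmt.Errorf("timed out waiting to connect IP on port %v, most recent error: %v", 80, base)
	var operr2 *net.OpError
	fmt.Println(errors.As(flattened, &operr2)) // false
}
```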
+ +package tcp_rack_test + +import ( + "flag" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/seqnum" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.Initialize(flag.CommandLine) +} + +const ( + // payloadSize is the size used to send packets. + payloadSize = header.TCPDefaultMSS + + // simulatedRTT is the time delay between packets sent and acked to + // increase the RTT. + simulatedRTT = 30 * time.Millisecond + + // numPktsForRTT is the number of packets sent and acked to establish + // RTT. + numPktsForRTT = 10 +) + +func createSACKConnection(t *testing.T) (testbench.DUT, testbench.TCPIPv4, int32, int32) { + dut := testbench.NewDUT(t) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + + // Enable SACK. + opts := make([]byte, 40) + optsOff := 0 + optsOff += header.EncodeNOP(opts[optsOff:]) + optsOff += header.EncodeNOP(opts[optsOff:]) + optsOff += header.EncodeSACKPermittedOption(opts[optsOff:]) + + conn.ConnectWithOptions(t, opts[:optsOff]) + acceptFd, _ := dut.Accept(t, listenFd) + return dut, conn, acceptFd, listenFd +} + +func closeSACKConnection(t *testing.T, dut testbench.DUT, conn testbench.TCPIPv4, acceptFd, listenFd int32) { + dut.Close(t, acceptFd) + dut.Close(t, listenFd) + conn.Close(t) +} + +func getRTTAndRTO(t *testing.T, dut testbench.DUT, acceptFd int32) (rtt, rto time.Duration) { + info := linux.TCPInfo{} + ret := dut.GetSockOpt(t, acceptFd, unix.SOL_TCP, unix.TCP_INFO, int32(linux.SizeOfTCPInfo)) + binary.Unmarshal(ret, usermem.ByteOrder, &info) + return time.Duration(info.RTT) * time.Microsecond, time.Duration(info.RTO) * time.Microsecond +} + +func sendAndReceive(t *testing.T, dut testbench.DUT, conn testbench.TCPIPv4, numPkts int, acceptFd int32, sendACK bool) time.Time { + seqNum1 := *conn.RemoteSeqNum(t) + payload := make([]byte, payloadSize) + var lastSent time.Time + for i, sn := 0, seqNum1; i < numPkts; i++ { + lastSent = time.Now() + dut.Send(t, acceptFd, payload, 0) + gotOne, err := conn.Expect(t, testbench.TCP{SeqNum: testbench.Uint32(uint32(sn))}, time.Second) + if err != nil { + t.Fatalf("Expect #%d: %s", i+1, err) + continue + } + if gotOne == nil { + t.Fatalf("#%d: expected a packet within a second but got none", i+1) + } + sn.UpdateForward(seqnum.Size(payloadSize)) + + if sendACK { + time.Sleep(simulatedRTT) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(sn))}) + } + } + return lastSent +} + +// TestRACKTLPAllPacketsLost tests TLP when an entire flight of data is lost. +func TestRACKTLPAllPacketsLost(t *testing.T) { + dut, conn, acceptFd, listenFd := createSACKConnection(t) + seqNum1 := *conn.RemoteSeqNum(t) + + // Send ACK for data packets to establish RTT. + sendAndReceive(t, dut, conn, numPktsForRTT, acceptFd, true /* sendACK */) + seqNum1.UpdateForward(seqnum.Size(numPktsForRTT * payloadSize)) + + // We are not sending ACK for these packets. + const numPkts = 5 + lastSent := sendAndReceive(t, dut, conn, numPkts, acceptFd, false /* sendACK */) + + // Probe Timeout (PTO) should be two times RTT. Check that the last + // packet is retransmitted after probe timeout. 
+ rtt, _ := getRTTAndRTO(t, dut, acceptFd) + pto := rtt * 2 + // We expect the 5th packet (the last unacknowledged packet) to be + // retransmitted. + tlpProbe := testbench.Uint32(uint32(seqNum1) + uint32((numPkts-1)*payloadSize)) + if _, err := conn.Expect(t, testbench.TCP{SeqNum: tlpProbe}, time.Second); err != nil { + t.Fatalf("expected payload was not received: %s %v %v", err, rtt, pto) + } + diff := time.Now().Sub(lastSent) + if diff < pto { + t.Fatalf("expected payload was received before the probe timeout, got: %v, want: %v", diff, pto) + } + closeSACKConnection(t, dut, conn, acceptFd, listenFd) +} + +// TestRACKTLPLost tests TLP when there are tail losses. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.4 +func TestRACKTLPLost(t *testing.T) { + dut, conn, acceptFd, listenFd := createSACKConnection(t) + seqNum1 := *conn.RemoteSeqNum(t) + + // Send ACK for data packets to establish RTT. + sendAndReceive(t, dut, conn, numPktsForRTT, acceptFd, true /* sendACK */) + seqNum1.UpdateForward(seqnum.Size(numPktsForRTT * payloadSize)) + + // We are not sending ACK for these packets. + const numPkts = 10 + lastSent := sendAndReceive(t, dut, conn, numPkts, acceptFd, false /* sendACK */) + + // Cumulative ACK for #[1-5] packets. + ackNum := seqNum1.Add(seqnum.Size(6 * payloadSize)) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(ackNum))}) + + // Probe Timeout (PTO) should be two times RTT. Check that the last + // packet is retransmitted after probe timeout. + rtt, _ := getRTTAndRTO(t, dut, acceptFd) + pto := rtt * 2 + // We expect the 10th packet (the last unacknowledged packet) to be + // retransmitted. + tlpProbe := testbench.Uint32(uint32(seqNum1) + uint32((numPkts-1)*payloadSize)) + if _, err := conn.Expect(t, testbench.TCP{SeqNum: tlpProbe}, time.Second); err != nil { + t.Fatalf("expected payload was not received: %s", err) + } + diff := time.Now().Sub(lastSent) + if diff < pto { + t.Fatalf("expected payload was received before the probe timeout, got: %v, want: %v", diff, pto) + } + closeSACKConnection(t, dut, conn, acceptFd, listenFd) +} + +// TestRACKTLPWithSACK tests TLP by acknowledging out of order packets. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-8.1 +func TestRACKTLPWithSACK(t *testing.T) { + dut, conn, acceptFd, listenFd := createSACKConnection(t) + seqNum1 := *conn.RemoteSeqNum(t) + + // Send ACK for data packets to establish RTT. + sendAndReceive(t, dut, conn, numPktsForRTT, acceptFd, true /* sendACK */) + seqNum1.UpdateForward(seqnum.Size(numPktsForRTT * payloadSize)) + + // We are not sending ACK for these packets. + const numPkts = 3 + lastSent := sendAndReceive(t, dut, conn, numPkts, acceptFd, false /* sendACK */) + + // SACK for #2 packet. + sackBlock := make([]byte, 40) + start := seqNum1.Add(seqnum.Size(payloadSize)) + end := start.Add(seqnum.Size(payloadSize)) + sbOff := 0 + sbOff += header.EncodeNOP(sackBlock[sbOff:]) + sbOff += header.EncodeNOP(sackBlock[sbOff:]) + sbOff += header.EncodeSACKBlocks([]header.SACKBlock{{ + start, end, + }}, sackBlock[sbOff:]) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(seqNum1)), Options: sackBlock[:sbOff]}) + + // RACK marks #1 packet as lost and retransmits it. + if _, err := conn.Expect(t, testbench.TCP{SeqNum: testbench.Uint32(uint32(seqNum1))}, time.Second); err != nil { + t.Fatalf("expected payload was not received: %s", err) + } + + // ACK for #1 packet. 
+ conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(end))}) + + // Probe Timeout (PTO) should be two times RTT. TLP will trigger for #3 + // packet. RACK adds an additional timeout of 200ms if the number of + // outstanding packets is equal to 1. + rtt, rto := getRTTAndRTO(t, dut, acceptFd) + pto := rtt*2 + (200 * time.Millisecond) + if rto < pto { + pto = rto + } + // We expect the 3rd packet (the last unacknowledged packet) to be + // retransmitted. + tlpProbe := testbench.Uint32(uint32(seqNum1) + uint32((numPkts-1)*payloadSize)) + if _, err := conn.Expect(t, testbench.TCP{SeqNum: tlpProbe}, time.Second); err != nil { + t.Fatalf("expected payload was not received: %s", err) + } + diff := time.Now().Sub(lastSent) + if diff < pto { + t.Fatalf("expected payload was received before the probe timeout, got: %v, want: %v", diff, pto) + } + closeSACKConnection(t, dut, conn, acceptFd, listenFd) +} diff --git a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go index 1ab9ee1b2..b15b8fc25 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go @@ -66,33 +66,39 @@ func TestZeroWindowProbeRetransmit(t *testing.T) { probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)) ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum(t))) - startProbeDuration := time.Second - current := startProbeDuration - first := time.Now() // Ask the dut to send out data. dut.Send(t, acceptFd, sampleData, 0) + + var prev time.Duration // Expect the dut to keep the connection alive as long as the remote is // acknowledging the zero-window probes. - for i := 0; i < 5; i++ { + for i := 1; i <= 5; i++ { start := time.Now() // Expect zero-window probe with a timeout which is a function of the typical // first retransmission time. The retransmission times is supposed to // exponentially increase. - if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: probeSeq}, nil, 2*current); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: probeSeq}, nil, time.Duration(i)*time.Second); err != nil { t.Fatalf("expected a probe with sequence number %d: loop %d", probeSeq, i) } - if i == 0 { - startProbeDuration = time.Now().Sub(first) - current = 2 * startProbeDuration + if i == 1 { + // Skip the first probe as computing transmit time for that is + // non-deterministic because of the arbitrary time taken for + // the dut to receive a send command and issue a send. continue } - // Check if the probes came at exponentially increasing intervals. - if got, want := time.Since(start), current-startProbeDuration; got < want { + + // Check if the time taken to receive the probe from the dut is + // increasing exponentially. To avoid flakes, use a correction + // factor for the expected duration which accounts for any + // scheduling non-determinism. + const timeCorrection = 200 * time.Millisecond + got := time.Since(start) + if want := (2 * prev) - timeCorrection; prev != 0 && got < want { t.Errorf("got zero probe %d after %s, want >= %s", i, got, want) } + prev = got // Acknowledge the zero-window probes from the dut. conn.Send(t, testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)}) - current *= 2 } // Advertize non-zero window. 
conn.Send(t, testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)}) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index b37716c48..0da295e2d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -797,6 +797,7 @@ cc_binary( linkstatic = 1, deps = [ ":socket_test_util", + "//test/util:capability_util", "//test/util:cleanup", "//test/util:eventfd_util", "//test/util:file_descriptor", @@ -807,6 +808,7 @@ cc_binary( "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, + "//test/util:memory_util", "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:save_util", @@ -978,6 +980,7 @@ cc_binary( "//test/util:epoll_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 4b581045b..75a5c9f17 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -15,6 +15,7 @@ #include <fcntl.h> #include <signal.h> #include <sys/epoll.h> +#include <sys/mman.h> #include <sys/types.h> #include <syscall.h> #include <unistd.h> @@ -35,10 +36,12 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/socket_test_util.h" +#include "test/util/capability_util.h" #include "test/util/cleanup.h" #include "test/util/eventfd_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/memory_util.h" #include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/save_util.h" @@ -1642,6 +1645,202 @@ TEST(FcntlTest, SetFlSetOwnSetSigDoNotRace) { } } +TEST_F(FcntlLockTest, GetLockOnNothing) { + SKIP_IF(IsRunningWithVFS1()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_GETLK, &fl), SyscallSucceeds()); + ASSERT_TRUE(fl.l_type == F_UNLCK); +} + +TEST_F(FcntlLockTest, GetLockOnLockSameProcess) { + SKIP_IF(IsRunningWithVFS1()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd.get(), F_GETLK, &fl), SyscallSucceeds()); + ASSERT_TRUE(fl.l_type == F_UNLCK); + + fl.l_type = F_WRLCK; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd.get(), F_GETLK, &fl), SyscallSucceeds()); + ASSERT_TRUE(fl.l_type == F_UNLCK); +} + +TEST_F(FcntlLockTest, GetReadLockOnReadLock) { + SKIP_IF(IsRunningWithVFS1()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + pid_t child_pid = fork(); + if (child_pid == 0) { + TEST_CHECK(fcntl(fd.get(), F_GETLK, &fl) >= 0); + TEST_CHECK(fl.l_type == F_UNLCK); + _exit(0); + } + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + 
ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); +} + +TEST_F(FcntlLockTest, GetReadLockOnWriteLock) { + SKIP_IF(IsRunningWithVFS1()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + fl.l_type = F_RDLCK; + pid_t child_pid = fork(); + if (child_pid == 0) { + TEST_CHECK(fcntl(fd.get(), F_GETLK, &fl) >= 0); + TEST_CHECK(fl.l_type == F_WRLCK); + TEST_CHECK(fl.l_pid == getppid()); + _exit(0); + } + + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); +} + +TEST_F(FcntlLockTest, GetWriteLockOnReadLock) { + SKIP_IF(IsRunningWithVFS1()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + fl.l_type = F_WRLCK; + pid_t child_pid = fork(); + if (child_pid == 0) { + TEST_CHECK(fcntl(fd.get(), F_GETLK, &fl) >= 0); + TEST_CHECK(fl.l_type == F_RDLCK); + TEST_CHECK(fl.l_pid == getppid()); + _exit(0); + } + + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); +} + +TEST_F(FcntlLockTest, GetWriteLockOnWriteLock) { + SKIP_IF(IsRunningWithVFS1()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + pid_t child_pid = fork(); + if (child_pid == 0) { + TEST_CHECK(fcntl(fd.get(), F_GETLK, &fl) >= 0); + TEST_CHECK(fl.l_type == F_WRLCK); + TEST_CHECK(fl.l_pid == getppid()); + _exit(0); + } + + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); +} + +// Tests that the pid returned from F_GETLK is relative to the caller's PID +// namespace. +TEST_F(FcntlLockTest, GetLockRespectsPIDNamespace) { + SKIP_IF(IsRunningWithVFS1()); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + std::string filename = file.path(); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_RDWR, 0666)); + + // Lock in the parent process. + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + ASSERT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + + auto child_getlk = [](void* filename) { + int fd = open((char*)filename, O_RDWR, 0666); + TEST_CHECK(fd >= 0); + + struct flock fl; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 40; + TEST_CHECK(fcntl(fd, F_GETLK, &fl) >= 0); + TEST_CHECK(fl.l_type == F_WRLCK); + // Parent PID should be 0 in the child PID namespace. + TEST_CHECK(fl.l_pid == 0); + close(fd); + return 0; + }; + + // Set up child process in a new PID namespace. 
+ constexpr int kStackSize = 4096; + Mapping stack = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kStackSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0)); + pid_t child_pid; + ASSERT_THAT( + child_pid = clone(child_getlk, (char*)stack.ptr() + stack.len(), + CLONE_NEWPID | SIGCHLD, (void*)filename.c_str()), + SyscallSucceeds()); + + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 8137f0e29..a88c89e20 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -36,6 +36,7 @@ #include "test/util/epoll_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -315,8 +316,7 @@ PosixErrorOr<std::vector<Event>> DrainEvents(int fd) { } PosixErrorOr<FileDescriptor> InotifyInit1(int flags) { - int fd; - EXPECT_THAT(fd = inotify_init1(flags), SyscallSucceeds()); + int fd = inotify_init1(flags); if (fd < 0) { return PosixError(errno, "inotify_init1() failed"); } @@ -325,9 +325,7 @@ PosixErrorOr<FileDescriptor> InotifyInit1(int flags) { PosixErrorOr<int> InotifyAddWatch(int fd, const std::string& path, uint32_t mask) { - int wd; - EXPECT_THAT(wd = inotify_add_watch(fd, path.c_str(), mask), - SyscallSucceeds()); + int wd = inotify_add_watch(fd, path.c_str(), mask); if (wd < 0) { return PosixError(errno, "inotify_add_watch() failed"); } @@ -784,6 +782,38 @@ TEST(Inotify, MoveWatchedTargetGeneratesEvents) { EXPECT_EQ(events[0].cookie, events[1].cookie); } +// Tests that close events are only emitted when a file description drops its +// last reference. +TEST(Inotify, DupFD) { + SKIP_IF(IsRunningWithVFS1()); + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS)); + + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup()); + + std::vector<Event> events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_OPEN, wd), + })); + + fd.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({})); + + fd2.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_CLOSE_NOWRITE, wd), + })); +} + TEST(Inotify, CoalesceEvents) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor fd = @@ -1779,11 +1809,9 @@ TEST(Inotify, Sendfile) { EXPECT_THAT(out_events, Are({Event(IN_MODIFY, out_wd)})); } -// On Linux, inotify behavior is not very consistent with splice(2). We try our -// best to emulate Linux for very basic calls to splice. 
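The `F_GETLK` tests above check that a conflicting lock is reported together with the holder's pid, and that the pid is translated (to 0) when the holder sits outside the caller's PID namespace. For reference, a hedged Go sketch of the same query via `golang.org/x/sys/unix`; the file path is illustrative:

```go
package main

import (
	"fmt"
	"log"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.OpenFile("/tmp/lock-demo", os.O_RDWR|os.O_CREATE, 0666)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Ask the kernel whether a write lock on bytes [0, 40) would conflict.
	fl := unix.Flock_t{
		Type:   unix.F_WRLCK,
		Whence: unix.SEEK_SET,
		Start:  0,
		Len:    40,
	}
	if err := unix.FcntlFlock(f.Fd(), unix.F_GETLK, &fl); err != nil {
		log.Fatal(err)
	}
	if fl.Type == unix.F_UNLCK {
		fmt.Println("no conflicting lock")
	} else {
		// Pid is 0 if the holder lives in a different (e.g. child) PID namespace.
		fmt.Printf("conflicting lock held by pid %d\n", fl.Pid)
	}
}
```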
TEST(Inotify, SpliceOnWatchTarget) { - int pipes[2]; - ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds()); + int pipefds[2]; + ASSERT_THAT(pipe2(pipefds, O_NONBLOCK), SyscallSucceeds()); const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor inotify_fd = @@ -1798,15 +1826,20 @@ TEST(Inotify, SpliceOnWatchTarget) { const int file_wd = ASSERT_NO_ERRNO_AND_VALUE( InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS)); - EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr, 1, /*flags=*/0), + EXPECT_THAT(splice(fd.get(), nullptr, pipefds[1], nullptr, 1, /*flags=*/0), SyscallSucceedsWithValue(1)); - // Surprisingly, events are not generated in Linux if we read from a file. + // Surprisingly, events may not be generated in Linux if we read from a file. + // fs/splice.c:generic_file_splice_read, which is used most often, does not + // generate events, whereas fs/splice.c:default_file_splice_read does. std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); - ASSERT_THAT(events, Are({})); + if (IsRunningOnGvisor() && !IsRunningWithVFS1()) { + ASSERT_THAT(events, Are({Event(IN_ACCESS, dir_wd, Basename(file.path())), + Event(IN_ACCESS, file_wd)})); + } - EXPECT_THAT(splice(pipes[0], nullptr, fd.get(), nullptr, 1, /*flags=*/0), + EXPECT_THAT(splice(pipefds[0], nullptr, fd.get(), nullptr, 1, /*flags=*/0), SyscallSucceedsWithValue(1)); events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); @@ -1817,8 +1850,8 @@ TEST(Inotify, SpliceOnWatchTarget) { } TEST(Inotify, SpliceOnInotifyFD) { - int pipes[2]; - ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds()); + int pipefds[2]; + ASSERT_THAT(pipe2(pipefds, O_NONBLOCK), SyscallSucceeds()); const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor fd = @@ -1834,11 +1867,11 @@ TEST(Inotify, SpliceOnInotifyFD) { char buf; EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); - EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr, + EXPECT_THAT(splice(fd.get(), nullptr, pipefds[1], nullptr, sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK), SyscallSucceedsWithValue(sizeof(struct inotify_event))); - const FileDescriptor read_fd(pipes[0]); + const FileDescriptor read_fd(pipefds[0]); const std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get())); ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)})); @@ -1936,24 +1969,29 @@ TEST(Inotify, Xattr) { } TEST(Inotify, Exec) { - const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE( - TempPath::CreateSymlinkTo(dir.path(), "/bin/true")); - + SKIP_IF(IsRunningWithVFS1()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); const int wd = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS)); + InotifyAddWatch(fd.get(), "/bin/true", IN_ALL_EVENTS)); // Perform exec. - ScopedThread t([&bin]() { - ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr), - SyscallSucceeds()); - }); - t.Join(); + pid_t child = -1; + int execve_errno = -1; + auto kill = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec("/bin/true", {}, {}, nullptr, &child, &execve_errno)); + ASSERT_EQ(0, execve_errno); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds()); + EXPECT_EQ(0, status); + + // Process cleanup no longer needed. 
+ kill.Release(); std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); - EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)})); + EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd), + Event(IN_CLOSE_NOWRITE, wd)})); } // Watches without IN_EXCL_UNLINK, should continue to emit events for file diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc index 2ed4f6f9c..d25be0e30 100644 --- a/test/syscalls/linux/packet_socket_raw.cc +++ b/test/syscalls/linux/packet_socket_raw.cc @@ -548,13 +548,7 @@ TEST_P(RawPacketTest, SetSocketSendBuf) { ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len), SyscallSucceeds()); - // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF. - // TODO(gvisor.dev/issue/2926): Remove the gvisor special casing when Netstack - // matches linux behavior. - if (!IsRunningOnGvisor()) { - quarter_sz *= 2; - } - + quarter_sz *= 2; ASSERT_EQ(quarter_sz, val); } diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc index 955bcee4b..32924466f 100644 --- a/test/syscalls/linux/raw_socket.cc +++ b/test/syscalls/linux/raw_socket.cc @@ -621,13 +621,7 @@ TEST_P(RawSocketTest, SetSocketSendBuf) { ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len), SyscallSucceeds()); - // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF. - // TODO(gvisor.dev/issue/2926): Remove the gvisor special casing when Netstack - // matches linux behavior. - if (!IsRunningOnGvisor()) { - quarter_sz *= 2; - } - + quarter_sz *= 2; ASSERT_EQ(quarter_sz, val); } diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc index d6e8b3e59..6aabd79e7 100644 --- a/test/syscalls/linux/shm.cc +++ b/test/syscalls/linux/shm.cc @@ -256,32 +256,26 @@ TEST(ShmTest, IpcInfo) { } TEST(ShmTest, ShmInfo) { - struct shm_info info; - - // We generally can't know what other processes on a linux machine - // does with shared memory segments, so we can't test specific - // numbers on Linux. When running under gvisor, we're guaranteed to - // be the only ones using shm, so we can easily verify machine-wide - // numbers. - if (IsRunningOnGvisor()) { - ASSERT_NO_ERRNO(Shmctl(0, SHM_INFO, &info)); - EXPECT_EQ(info.used_ids, 0); - EXPECT_EQ(info.shm_tot, 0); - EXPECT_EQ(info.shm_rss, 0); - EXPECT_EQ(info.shm_swp, 0); - } + // Take a snapshot of the system before the test runs. + struct shm_info snap; + ASSERT_NO_ERRNO(Shmctl(0, SHM_INFO, &snap)); const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE( Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777)); const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0)); + struct shm_info info; ASSERT_NO_ERRNO(Shmctl(1, SHM_INFO, &info)); + // We generally can't know what other processes on a linux machine do with + // shared memory segments, so we can't test specific numbers on Linux. When + // running under gvisor, we're guaranteed to be the only ones using shm, so + // we can easily verify machine-wide numbers. if (IsRunningOnGvisor()) { ASSERT_NO_ERRNO(Shmctl(shm.id(), SHM_INFO, &info)); - EXPECT_EQ(info.used_ids, 1); - EXPECT_EQ(info.shm_tot, kAllocSize / kPageSize); - EXPECT_EQ(info.shm_rss, kAllocSize / kPageSize); + EXPECT_EQ(info.used_ids, snap.used_ids + 1); + EXPECT_EQ(info.shm_tot, snap.shm_tot + (kAllocSize / kPageSize)); + EXPECT_EQ(info.shm_rss, snap.shm_rss + (kAllocSize / kPageSize)); EXPECT_EQ(info.shm_swp, 0); // Gvisor currently never swaps. 
} diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc index 32f583581..b616c2c87 100644 --- a/test/syscalls/linux/socket.cc +++ b/test/syscalls/linux/socket.cc @@ -16,6 +16,7 @@ #include <sys/stat.h> #include <sys/statfs.h> #include <sys/types.h> +#include <sys/wait.h> #include <unistd.h> #include "gtest/gtest.h" @@ -90,8 +91,7 @@ TEST(SocketTest, UnixSocketStat) { EXPECT_EQ(statbuf.st_mode, S_IFSOCK | (sock_perm & ~mask)); // Timestamps should be equal and non-zero. - // TODO(b/158882152): Sockets currently don't implement timestamps. - if (!IsRunningOnGvisor()) { + if (!IsRunningWithVFS1()) { EXPECT_NE(statbuf.st_atime, 0); EXPECT_EQ(statbuf.st_atime, statbuf.st_mtime); EXPECT_EQ(statbuf.st_atime, statbuf.st_ctime); @@ -111,6 +111,77 @@ TEST(SocketTest, UnixSocketStatFS) { EXPECT_EQ(st.f_namelen, NAME_MAX); } +TEST(SocketTest, UnixSCMRightsOnlyPassedOnce_NoRandomSave) { + const DisableSave ds; + + int sockets[2]; + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets), SyscallSucceeds()); + // Send more than what will fit inside the send/receive buffers, so that it is + // split into multiple messages. + constexpr int kBufSize = 0x100000; + + pid_t pid = fork(); + if (pid == 0) { + TEST_PCHECK(close(sockets[0]) == 0); + + // Construct a message with some control message. + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int))] = {}; + std::vector<char> buf(kBufSize); + struct iovec iov = {}; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + ((int*)CMSG_DATA(cmsg))[0] = sockets[1]; + + iov.iov_base = buf.data(); + iov.iov_len = kBufSize; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + int n = sendmsg(sockets[1], &msg, 0); + TEST_PCHECK(n == kBufSize); + TEST_PCHECK(shutdown(sockets[1], SHUT_RDWR) == 0); + TEST_PCHECK(close(sockets[1]) == 0); + _exit(0); + } + + close(sockets[1]); + + struct msghdr msg = {}; + char control[CMSG_SPACE(sizeof(int))] = {}; + std::vector<char> buf(kBufSize); + struct iovec iov = {}; + msg.msg_control = &control; + msg.msg_controllen = sizeof(control); + + iov.iov_base = buf.data(); + iov.iov_len = kBufSize; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + // The control message should only be present in the first message received. + int n; + ASSERT_THAT(n = recvmsg(sockets[0], &msg, 0), SyscallSucceeds()); + ASSERT_GT(n, 0); + ASSERT_EQ(msg.msg_controllen, CMSG_SPACE(sizeof(int))); + + while (n > 0) { + ASSERT_THAT(n = recvmsg(sockets[0], &msg, 0), SyscallSucceeds()); + ASSERT_EQ(msg.msg_controllen, 0); + } + + close(sockets[0]); + + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + ASSERT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); +} + using SocketOpenTest = ::testing::TestWithParam<int>; // UDS cannot be opened. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index e557572a7..8eec31a46 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -2513,11 +2513,7 @@ TEST_P(IPv4UDPUnboundSocketTest, SetSocketSendBuf) { ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &val, &val_len), SyscallSucceeds()); - // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF. 
- if (!IsRunningOnGvisor()) { - quarter_sz *= 2; - } - + quarter_sz *= 2; ASSERT_EQ(quarter_sz, val); } diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc index a16899493..22a4ee0d1 100644 --- a/test/syscalls/linux/socket_unix_cmsg.cc +++ b/test/syscalls/linux/socket_unix_cmsg.cc @@ -362,7 +362,7 @@ TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPassTruncationMsgCtrunc) { // BasicFDPassUnalignedRecv starts off by sending a single FD just like // BasicFDPass. The difference is that when calling recvmsg, the length of the -// receive data is only aligned on a 4 byte boundry instead of the normal 8. +// receive data is only aligned on a 4 byte boundary instead of the normal 8. TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecv) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 714848b8e..9028ab024 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -2008,6 +2008,29 @@ TEST_P(SimpleTcpSocketTest, GetSocketAcceptConnWithShutdown) { EXPECT_EQ(got, 0); } +// Tests that connecting to an unspecified address results in ECONNREFUSED. +TEST_P(SimpleTcpSocketTest, ConnectUnspecifiedAddress) { + sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + memset(&addr, 0, addrlen); + addr.ss_family = GetParam(); + auto do_connect = [&addr, addrlen]() { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(addr.ss_family, SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT( + RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr), + addrlen), + SyscallFailsWithErrno(ECONNREFUSED)); + }; + do_connect(); + // Test the v4 mapped address as well. + if (GetParam() == AF_INET6) { + auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr); + sin6->sin6_addr.s6_addr[10] = sin6->sin6_addr.s6_addr[11] = 0xff; + do_connect(); + } +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest, ::testing::Values(AF_INET, AF_INET6)); diff --git a/test/util/BUILD b/test/util/BUILD index 1b028a477..e561f3daa 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -172,6 +172,7 @@ cc_library( ":posix_error", ":save_util", ":test_util", + gtest, "@com_google_absl//absl/strings", ], ) diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc index 8b676751b..a6b0de24b 100644 --- a/test/util/multiprocess_util.cc +++ b/test/util/multiprocess_util.cc @@ -154,6 +154,9 @@ PosixErrorOr<int> InForkedProcess(const std::function<void()>& fn) { pid_t pid = fork(); if (pid == 0) { fn(); + TEST_CHECK_MSG(!::testing::Test::HasFailure(), + "EXPECT*/ASSERT* failed. 
These are not async-signal-safe " + "and must not be called from fn."); _exit(0); } MaybeSave(); diff --git a/tools/bazel.mk b/tools/bazel.mk index 2b20457e9..fb0fc6524 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -191,7 +191,7 @@ endif build_paths = \ (set -euo pipefail; \ $(call wrapper,$(BAZEL) build $(BASE_OPTIONS) $(BAZEL_OPTIONS) $(1)) 2>&1 \ - | tee /proc/self/fd/2 \ + | tee /dev/fd/2 \ | sed -n -e '/^Target/,$$p' \ | sed -n -e '/^ \($(subst /,\/,$(subst $(SPACE),\|,$(BUILD_ROOTS)))\)/p' \ | sed -e 's/ /\n/g' \ diff --git a/website/cmd/syscalldocs/main.go b/website/cmd/syscalldocs/main.go index 327537214..830d2bac7 100644 --- a/website/cmd/syscalldocs/main.go +++ b/website/cmd/syscalldocs/main.go @@ -52,6 +52,7 @@ layout: docs category: Compatibility weight: 50 permalink: /docs/user_guide/compatibility/{{.OS}}/{{.Arch}}/ +include_in_menu: True --- This table is a reference of {{.OS}} syscalls for the {{.Arch}} architecture and @@ -75,9 +76,9 @@ syscalls. {{if .Undocumented}}{{.Undocumented}} syscalls are not yet documented. </thead> <tbody> {{range $i, $syscall := .Syscalls}} - <tr> - <td><a class="doc-table-anchor" id="{{.Name}}"></a>{{.Number}}</td> - <td><a href="http://man7.org/linux/man-pages/man2/{{.Name}}.2.html" target="_blank" rel="noopener">{{.Name}}</a></td> + <tr id="{{.Name}}"> + <td><a href="#{{.Name}}">{{.Number}}</a></td> + <td><a href="{{.DocURL}}" target="_blank" rel="noopener">{{.Name}}</a></td> <td>{{.Support}}</td> <td>{{.Note}} {{range $i, $url := .URLs}}<br/>See: <a href="{{.}}">{{.}}</a>{{end}}</td> </tr> @@ -92,6 +93,27 @@ func Fatalf(format string, a ...interface{}) { os.Exit(1) } +// syscallDocURL returns a doc url for a given syscall, doing its best to return a url that exists. +func syscallDocURL(name string) string { + customDocs := map[string]string{ + "io_pgetevents": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "rseq": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "io_uring_setup": "https://manpages.debian.org/buster-backports/liburing-dev/io_uring_setup.2.en.html", + "io_uring_enter": "https://manpages.debian.org/buster-backports/liburing-dev/io_uring_enter.2.en.html", + "io_uring_register": "https://manpages.debian.org/buster-backports/liburing-dev/io_uring_register.2.en.html", + "open_tree": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "move_mount": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "fsopen": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "fsconfig": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "fsmount": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + "fspick": "https://man7.org/linux/man-pages/man2/syscalls.2.html", + } + if url, ok := customDocs[name]; ok { + return url + } + return fmt.Sprintf("http://man7.org/linux/man-pages/man2/%s.2.html", name) +} + func main() { inputFlag := flag.String("in", "-", "File to input ('-' for stdin)") outputDir := flag.String("out", ".", "Directory to output files.") @@ -145,6 +167,7 @@ func main() { Syscalls []struct { Name string Number uintptr + DocURL string Support string Note string URLs []string @@ -161,6 +184,7 @@ func main() { Syscalls: []struct { Name string Number uintptr + DocURL string Support string Note string URLs []string @@ -187,14 +211,16 @@ func main() { data.Syscalls = append(data.Syscalls, struct { Name string Number uintptr + DocURL string Support string Note string URLs []string }{ Name: s.Name, Number: num, + DocURL: syscallDocURL(s.Name), Support: s.Support, - Note: s.Note, // 
TODO urls
+			Note:    s.Note,
 			URLs:    s.URLs,
 		})
 	}