diff options
-rw-r--r-- | pkg/bitmap/BUILD | 16 | ||||
-rw-r--r-- | pkg/bitmap/bitmap.go | 234 | ||||
-rw-r--r-- | pkg/bitmap/bitmap_test.go | 306 | ||||
-rw-r--r-- | pkg/sentry/kernel/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table.go | 196 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_unsafe.go | 11 | ||||
-rw-r--r-- | test/perf/BUILD | 7 | ||||
-rw-r--r-- | test/perf/linux/BUILD | 16 | ||||
-rw-r--r-- | test/perf/linux/dup_benchmark.cc | 55 | ||||
-rw-r--r-- | test/syscalls/linux/BUILD | 1 | ||||
-rw-r--r-- | test/syscalls/linux/dup.cc | 41 |
11 files changed, 776 insertions, 108 deletions
diff --git a/pkg/bitmap/BUILD b/pkg/bitmap/BUILD new file mode 100644 index 000000000..0f1e7006d --- /dev/null +++ b/pkg/bitmap/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "bitmap", + srcs = ["bitmap.go"], + visibility = ["//:sandbox"], +) + +go_test( + name = "bitmap_test", + size = "small", + srcs = ["bitmap_test.go"], + library = ":bitmap", +) diff --git a/pkg/bitmap/bitmap.go b/pkg/bitmap/bitmap.go new file mode 100644 index 000000000..12d2fc2b8 --- /dev/null +++ b/pkg/bitmap/bitmap.go @@ -0,0 +1,234 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package bitmap provides the implementation of bitmap. +package bitmap + +import ( + "math" + "math/bits" +) + +// Bitmap implements an efficient bitmap. +// +// +stateify savable +type Bitmap struct { + // numOnes is the number of ones in the bitmap. + numOnes uint32 + + // bitBlock holds the bits. The type of bitBlock is uint64 which means + // each number in bitBlock contains 64 entries. + bitBlock []uint64 +} + +// New create a new empty Bitmap. +func New(size uint32) Bitmap { + b := Bitmap{} + bSize := (size + 63) / 64 + b.bitBlock = make([]uint64, bSize) + return b +} + +// IsEmpty verifies whether the Bitmap is empty. +func (b *Bitmap) IsEmpty() bool { + return b.numOnes == 0 +} + +// Minimum return the smallest value in the Bitmap. +func (b *Bitmap) Minimum() uint32 { + for i := 0; i < len(b.bitBlock); i++ { + if w := b.bitBlock[i]; w != 0 { + r := bits.TrailingZeros64(w) + return uint32(r + i*64) + } + } + return math.MaxInt32 +} + +// FirstZero returns the first unset bit from the range [start, ). +func (b *Bitmap) FirstZero(start uint32) uint32 { + i, nbit := int(start/64), start%64 + n := len(b.bitBlock) + if i >= n { + return math.MaxInt32 + } + w := b.bitBlock[i] | ((1 << nbit) - 1) + for { + if w != ^uint64(0) { + r := bits.TrailingZeros64(^w) + return uint32(r + i*64) + } + i++ + if i == n { + break + } + w = b.bitBlock[i] + } + return math.MaxInt32 +} + +// Maximum return the largest value in the Bitmap. +func (b *Bitmap) Maximum() uint32 { + for i := len(b.bitBlock) - 1; i >= 0; i-- { + if w := b.bitBlock[i]; w != 0 { + r := bits.LeadingZeros64(w) + return uint32(i*64 + 63 - r) + } + } + return uint32(0) +} + +// Add add i to the Bitmap. +func (b *Bitmap) Add(i uint32) { + blockNum, mask := i/64, uint64(1)<<(i%64) + // if blockNum is out of range, extend b.bitBlock + if x, y := int(blockNum), len(b.bitBlock); x >= y { + b.bitBlock = append(b.bitBlock, make([]uint64, x-y+1)...) + } + oldBlock := b.bitBlock[blockNum] + newBlock := oldBlock | mask + if oldBlock != newBlock { + b.bitBlock[blockNum] = newBlock + b.numOnes++ + } +} + +// Remove i from the Bitmap. +func (b *Bitmap) Remove(i uint32) { + blockNum, mask := i/64, uint64(1)<<(i%64) + oldBlock := b.bitBlock[blockNum] + newBlock := oldBlock &^ mask + if oldBlock != newBlock { + b.bitBlock[blockNum] = newBlock + b.numOnes-- + } +} + +// Clone the Bitmap. +func (b *Bitmap) Clone() Bitmap { + bitmap := Bitmap{b.numOnes, make([]uint64, len(b.bitBlock))} + copy(bitmap.bitBlock, b.bitBlock[:]) + return bitmap +} + +// countOnesForBlocks count all 1 bits within b.bitBlock of begin and that of end. +// The begin block and end block are inclusive. +func (b *Bitmap) countOnesForBlocks(begin, end uint32) uint64 { + ones := uint64(0) + beginBlock := begin / 64 + endBlock := end / 64 + for i := beginBlock; i <= endBlock; i++ { + ones += uint64(bits.OnesCount64(b.bitBlock[i])) + } + return ones +} + +// countOnesForAllBlocks count all 1 bits in b.bitBlock. +func (b *Bitmap) countOnesForAllBlocks() uint64 { + ones := uint64(0) + for i := 0; i < len(b.bitBlock); i++ { + ones += uint64(bits.OnesCount64(b.bitBlock[i])) + } + return ones +} + +// flipRange flip the bits within range (begin and end). begin is inclusive and end is exclusive. +func (b *Bitmap) flipRange(begin, end uint32) { + end-- + beginBlock := begin / 64 + endBlock := end / 64 + if beginBlock == endBlock { + b.bitBlock[endBlock] ^= ((^uint64(0) << uint(begin%64)) & ((uint64(1) << (uint(end)%64 + 1)) - 1)) + } else { + b.bitBlock[beginBlock] ^= ^(^uint64(0) << uint(begin%64)) + for i := beginBlock; i < endBlock; i++ { + b.bitBlock[i] = ^b.bitBlock[i] + } + b.bitBlock[endBlock] ^= ((uint64(1) << (uint(end)%64 + 1)) - 1) + } +} + +// clearRange clear the bits within range (begin and end). begin is inclusive and end is exclusive. +func (b *Bitmap) clearRange(begin, end uint32) { + end-- + beginBlock := begin / 64 + endBlock := end / 64 + if beginBlock == endBlock { + b.bitBlock[beginBlock] &= (((uint64(1) << uint(begin%64)) - 1) | ^((uint64(1) << (uint(end)%64 + 1)) - 1)) + } else { + b.bitBlock[beginBlock] &= ((uint64(1) << uint(begin%64)) - 1) + for i := beginBlock + 1; i < endBlock; i++ { + b.bitBlock[i] &= ^b.bitBlock[i] + } + b.bitBlock[endBlock] &= ^((uint64(1) << (uint(end)%64 + 1)) - 1) + } +} + +// ClearRange clear bits within range (begin and end) for the Bitmap. begin is inclusive and end is exclusive. +func (b *Bitmap) ClearRange(begin, end uint32) { + blockRange := end/64 - begin/64 + // When the number of cleared blocks is larger than half of the length of b.bitBlock, + // counting 1s for the entire bitmap has better performance. + if blockRange > uint32(len(b.bitBlock)/2) { + b.clearRange(begin, end) + b.numOnes = uint32(b.countOnesForAllBlocks()) + } else { + oldRangeOnes := b.countOnesForBlocks(begin, end) + b.clearRange(begin, end) + newRangeOnes := b.countOnesForBlocks(begin, end) + b.numOnes += uint32(newRangeOnes - oldRangeOnes) + } +} + +// FlipRange flip bits within range (begin and end) for the Bitmap. begin is inclusive and end is exclusive. +func (b *Bitmap) FlipRange(begin, end uint32) { + blockRange := end/64 - begin/64 + // When the number of flipped blocks is larger than half of the length of b.bitBlock, + // counting 1s for the entire bitmap has better performance. + if blockRange > uint32(len(b.bitBlock)/2) { + b.flipRange(begin, end) + b.numOnes = uint32(b.countOnesForAllBlocks()) + } else { + oldRangeOnes := b.countOnesForBlocks(begin, end) + b.flipRange(begin, end) + newRangeOnes := b.countOnesForBlocks(begin, end) + b.numOnes += uint32(newRangeOnes - oldRangeOnes) + } +} + +// ToSlice transform the Bitmap into slice. For example, a bitmap of [0, 1, 0, 1] +// will return the slice [1, 3]. +func (b *Bitmap) ToSlice() []uint32 { + bitmapSlice := make([]uint32, 0, b.numOnes) + // base is the start number of a bitBlock + base := 0 + for i := 0; i < len(b.bitBlock); i++ { + bitBlock := b.bitBlock[i] + // Iterate through all the numbers held by this bit block. + for bitBlock != 0 { + // Extract the lowest set 1 bit. + j := bitBlock & -bitBlock + // Interpret the bit as the in32 number it represents and add it to result. + bitmapSlice = append(bitmapSlice, uint32((base + int(bits.OnesCount64(j-1))))) + bitBlock ^= j + } + base += 64 + } + return bitmapSlice +} + +// GetNumOnes return the the number of ones in the Bitmap. +func (b *Bitmap) GetNumOnes() uint32 { + return b.numOnes +} diff --git a/pkg/bitmap/bitmap_test.go b/pkg/bitmap/bitmap_test.go new file mode 100644 index 000000000..76ebd779f --- /dev/null +++ b/pkg/bitmap/bitmap_test.go @@ -0,0 +1,306 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bitmap + +import ( + "math" + "reflect" + "testing" +) + +// generateFilledSlice generates a slice fill with numbers. +func generateFilledSlice(min, max, length int) []uint32 { + if max == min { + return []uint32{uint32(min)} + } + if length > (max - min) { + return nil + } + randSlice := make([]uint32, length) + if length != 0 { + rangeNum := uint32((max - min) / length) + randSlice[0], randSlice[length-1] = uint32(min), uint32(max) + for i := 1; i < length-1; i++ { + randSlice[i] = randSlice[i-1] + rangeNum + } + } + return randSlice +} + +// generateFilledBitmap generates a Bitmap filled with fillNum of numbers, +// and returns the slice and bitmap. +func generateFilledBitmap(min, max, fillNum int) ([]uint32, Bitmap) { + bitmap := New(uint32(max)) + randSlice := generateFilledSlice(min, max, fillNum) + for i := 0; i < fillNum; i++ { + bitmap.Add(randSlice[i]) + } + return randSlice, bitmap +} + +func TestNewBitmap(t *testing.T) { + tests := []struct { + name string + size int + expectSize int + }{ + {"length 1", 1, 1}, + {"length 1024", 1024, 16}, + {"length 1025", 1025, 17}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + if bitmap := New(uint32(tt.size)); len(bitmap.bitBlock) != tt.expectSize { + t.Errorf("New created bitmap with %v, bitBlock size: %d, wanted: %d", tt.name, len(bitmap.bitBlock), tt.expectSize) + } + }) + } +} + +func TestAdd(t *testing.T) { + tests := []struct { + name string + bitmapSize int + addNum int + }{ + {"Add with null bitmap.bitBlock", 0, 10}, + {"Add without extending bitBlock", 64, 10}, + {"Add without extending bitblock with margin number", 63, 64}, + {"Add with extended one block", 1024, 1025}, + {"Add with extended more then one block", 1024, 2048}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + bitmap := New(uint32(tt.bitmapSize)) + bitmap.Add(uint32(tt.addNum)) + bitmapSlice := bitmap.ToSlice() + if bitmapSlice[0] != uint32(tt.addNum) { + t.Errorf("%v, get number: %d, wanted: %d.", tt.name, bitmapSlice[0], tt.addNum) + } + }) + } +} + +func TestRemove(t *testing.T) { + bitmap := New(uint32(1024)) + firstSlice := generateFilledSlice(0, 511, 50) + secondSlice := generateFilledSlice(512, 1024, 50) + for i := 0; i < 50; i++ { + bitmap.Add(firstSlice[i]) + bitmap.Add(secondSlice[i]) + } + + for i := 0; i < 50; i++ { + bitmap.Remove(firstSlice[i]) + } + bitmapSlice := bitmap.ToSlice() + if !reflect.DeepEqual(bitmapSlice, secondSlice) { + t.Errorf("After Remove() firstSlice, remained slice: %v, wanted: %v", bitmapSlice, secondSlice) + } + + for i := 0; i < 50; i++ { + bitmap.Remove(secondSlice[i]) + } + bitmapSlice = bitmap.ToSlice() + emptySlice := make([]uint32, 0) + if !reflect.DeepEqual(bitmapSlice, emptySlice) { + t.Errorf("After Remove secondSlice, remained slice: %v, wanted: %v", bitmapSlice, emptySlice) + } + +} + +// Verify flip bits within one bitBlock, one bit and bits cross multi bitBlocks. +func TestFlipRange(t *testing.T) { + tests := []struct { + name string + flipRangeMin int + flipRangeMax int + filledSliceLen int + }{ + {"Flip one number in bitmap", 77, 77, 1}, + {"Flip numbers within one bitBlocks", 5, 60, 20}, + {"Flip numbers that cross multi bitBlocks", 20, 1000, 300}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + fillSlice, bitmap := generateFilledBitmap(tt.flipRangeMin, tt.flipRangeMax, tt.filledSliceLen) + flipFillSlice := make([]uint32, 0) + for i, j := tt.flipRangeMin, 0; i <= tt.flipRangeMax; i++ { + if uint32(i) != fillSlice[j] { + flipFillSlice = append(flipFillSlice, uint32(i)) + } else { + j++ + } + } + + bitmap.FlipRange(uint32(tt.flipRangeMin), uint32(tt.flipRangeMax+1)) + flipBitmapSlice := bitmap.ToSlice() + if !reflect.DeepEqual(flipFillSlice, flipBitmapSlice) { + t.Errorf("%v, flipped slice: %v, wanted: %v", tt.name, flipBitmapSlice, flipFillSlice) + } + }) + } +} + +// Verify clear bits within one bitBlock, one bit and bits cross multi bitBlocks. +func TestClearRange(t *testing.T) { + tests := []struct { + name string + clearRangeMin int + clearRangeMax int + bitmapSize int + }{ + {"ClearRange clear one number", 5, 5, 64}, + {"ClearRange clear numbers within one bitBlock", 4, 61, 64}, + {"ClearRange clear numbers cross multi bitBlocks", 20, 254, 512}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + bitmap := New(uint32(tt.bitmapSize)) + bitmap.FlipRange(uint32(0), uint32(tt.bitmapSize)) + bitmap.ClearRange(uint32(tt.clearRangeMin), uint32(tt.clearRangeMax+1)) + clearedBitmapSlice := bitmap.ToSlice() + clearedSlice := make([]uint32, 0) + for i := 0; i < tt.bitmapSize; i++ { + if i < tt.clearRangeMin || i > tt.clearRangeMax { + clearedSlice = append(clearedSlice, uint32(i)) + } + } + if !reflect.DeepEqual(clearedSlice, clearedBitmapSlice) { + t.Errorf("%v, cleared slice: %v, wanted: %v", tt.name, clearedBitmapSlice, clearedSlice) + } + }) + + } +} + +func TestMinimum(t *testing.T) { + randSlice, bitmap := generateFilledBitmap(0, 1024, 200) + min := bitmap.Minimum() + if min != randSlice[0] { + t.Errorf("Minimum() returns: %v, wanted: %v", min, randSlice[0]) + } + + bitmap.ClearRange(uint32(0), uint32(200)) + min = bitmap.Minimum() + bitmapSlice := bitmap.ToSlice() + if min != bitmapSlice[0] { + t.Errorf("After ClearRange, Minimum() returns: %v, wanted: %v", min, bitmapSlice[0]) + } + + bitmap.FlipRange(uint32(2), uint32(11)) + min = bitmap.Minimum() + bitmapSlice = bitmap.ToSlice() + if min != bitmapSlice[0] { + t.Errorf("After Flip, Minimum() returns: %v, wanted: %v", min, bitmapSlice[0]) + } +} + +func TestMaximum(t *testing.T) { + randSlice, bitmap := generateFilledBitmap(0, 1024, 200) + max := bitmap.Maximum() + if max != randSlice[len(randSlice)-1] { + t.Errorf("Maximum() returns: %v, wanted: %v", max, randSlice[len(randSlice)-1]) + } + + bitmap.ClearRange(uint32(1000), uint32(1025)) + max = bitmap.Maximum() + bitmapSlice := bitmap.ToSlice() + if max != bitmapSlice[len(bitmapSlice)-1] { + t.Errorf("After ClearRange, Maximum() returns: %v, wanted: %v", max, bitmapSlice[len(bitmapSlice)-1]) + } + + bitmap.FlipRange(uint32(1001), uint32(1021)) + max = bitmap.Maximum() + bitmapSlice = bitmap.ToSlice() + if max != bitmapSlice[len(bitmapSlice)-1] { + t.Errorf("After Flip, Maximum() returns: %v, wanted: %v", max, bitmapSlice[len(bitmapSlice)-1]) + } +} + +func TestBitmapNumOnes(t *testing.T) { + randSlice, bitmap := generateFilledBitmap(0, 1024, 200) + bitmapOnes := bitmap.GetNumOnes() + if bitmapOnes != uint32(200) { + t.Errorf("GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 200) + } + // Remove 10 numbers. + for i := 5; i < 15; i++ { + bitmap.Remove(randSlice[i]) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(190) { + t.Errorf("After Remove 10 number, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 190) + } + // Remove the 10 number again, the length supposed not change. + for i := 5; i < 15; i++ { + bitmap.Remove(randSlice[i]) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(190) { + t.Errorf("After Remove the 10 number again, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 190) + } + + // Add 10 number. + for i := 1080; i < 1090; i++ { + bitmap.Add(uint32(i)) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(200) { + t.Errorf("After Add 10 number, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 200) + } + + // Add the 10 number again, the length supposed not change. + for i := 1080; i < 1090; i++ { + bitmap.Add(uint32(i)) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(200) { + t.Errorf("After Add the 10 number again, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 200) + } + + // Flip 10 bits from 0 to 1. + bitmap.FlipRange(uint32(1025), uint32(1035)) + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(210) { + t.Errorf("After Flip, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 210) + } + + // ClearRange numbers range from [0, 1025). + bitmap.ClearRange(uint32(0), uint32(1025)) + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(20) { + t.Errorf("After ClearRange, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 20) + } +} + +func TestFirstZero(t *testing.T) { + bitmap := New(uint32(1000)) + bitmap.FlipRange(200, 400) + for i, j := range map[uint32]uint32{0: 0, 201: 400, 200: 400, 199: 199, 400: 400, 10000: math.MaxInt32} { + v := bitmap.FirstZero(i) + if v != j { + t.Errorf("Minimum() returns: %v, wanted: %v", v, j) + } + } +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c613f4932..e4e0dc04f 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -220,6 +220,7 @@ go_library( "//pkg/abi/linux", "//pkg/abi/linux/errno", "//pkg/amutex", + "//pkg/bitmap", "//pkg/bits", "//pkg/bpf", "//pkg/cleanup", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 8786a70b5..eff556a0c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -18,10 +18,10 @@ import ( "fmt" "math" "strings" - "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -84,13 +84,8 @@ type FDTable struct { // mu protects below. mu sync.Mutex `state:"nosave"` - // next is start position to find fd. - next int32 - - // used contains the number of non-nil entries. It must be accessed - // atomically. It may be read atomically without holding mu (but not - // written). - used int32 + // fdBitmap shows which fds are already in use. + fdBitmap bitmap.Bitmap `state:"nosave"` // descriptorTable holds descriptors. descriptorTable `state:".(map[int32]descriptor)"` @@ -98,6 +93,8 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) + f.mu.Lock() + defer f.mu.Unlock() f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, @@ -111,12 +108,16 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { ctx := context.Background() f.initNoLeakCheck() // Initialize table. - f.used = 0 + f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) for fd, d := range m { + if fd < 0 { + panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd)) + } + if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil { panic("VFS1 or VFS2 files set") } - + f.fdBitmap.Add(uint32(fd)) // Note that we do _not_ need to acquire a extra table reference here. The // table reference will already be accounted for in the file, so we drop the // reference taken by set above. @@ -189,8 +190,10 @@ func (f *FDTable) DecRef(ctx context.Context) { func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 - fd := int32(0) - for { + fds := f.fdBitmap.ToSlice() + // Iterate through the fdBitmap. + for _, ufd := range fds { + fd := int32(ufd) file, fileVFS2, flags, ok := f.getAll(fd) if !ok { break @@ -218,7 +221,6 @@ func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2.DecRef(ctx) } retries = 0 - fd++ } } @@ -226,6 +228,8 @@ func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, func (f *FDTable) String() string { var buf strings.Builder ctx := context.Background() + f.mu.Lock() + defer f.mu.Unlock() f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: @@ -250,10 +254,10 @@ func (f *FDTable) String() string { } // NewFDs allocates new FDs guaranteed to be the lowest number available -// greater than or equal to the fd parameter. All files will share the set +// greater than or equal to the minFD parameter. All files will share the set // flags. Success is guaranteed to be all or none. -func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { - if fd < 0 { +func (f *FDTable) NewFDs(ctx context.Context, minFD int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if minFD < 0 { // Don't accept negative FDs. return nil, unix.EINVAL } @@ -267,31 +271,48 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags if lim.Cur != limits.Infinity { end = int32(lim.Cur) } - if fd >= end { + if minFD+int32(len(files)) > end { return nil, unix.EMFILE } } f.mu.Lock() - // From f.next to find available fd. - if fd < f.next { - fd = f.next + // max is used as the largest number in fdBitmap + 1. + max := int32(0) + + if !f.fdBitmap.IsEmpty() { + max = int32(f.fdBitmap.Maximum()) + max++ } + // Adjust max in case it is less than minFD. + if max < minFD { + max = minFD + } // Install all entries. - for i := fd; i < end && len(fds) < len(files); i++ { - if d, _, _ := f.get(i); d == nil { - // Set the descriptor. - f.set(ctx, i, files[len(fds)], flags) - fds = append(fds, i) // Record the file descriptor. + for len(fds) < len(files) { + // Try to use free bit in fdBitmap. + // If all bits in fdBitmap are used, expand fd to the max. + fd := f.fdBitmap.FirstZero(uint32(minFD)) + if fd == math.MaxInt32 { + fd = uint32(max) + max++ + } + if fd >= uint32(end) { + break } + f.fdBitmap.Add(fd) + f.set(ctx, int32(fd), files[len(fds)], flags) + fds = append(fds, int32(fd)) + minFD = int32(fd) } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { f.set(ctx, i, nil, FDFlags{}) + f.fdBitmap.Remove(uint32(i)) } f.mu.Unlock() @@ -305,20 +326,15 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return nil, unix.EMFILE } - if fd == f.next { - // Update next search start position. - f.next = fds[len(fds)-1] + 1 - } - f.mu.Unlock() return fds, nil } // NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available -// greater than or equal to the fd parameter. All files will share the set +// greater than or equal to the minFD parameter. All files will share the set // flags. Success is guaranteed to be all or none. -func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { - if fd < 0 { +func (f *FDTable) NewFDsVFS2(ctx context.Context, minFD int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { + if minFD < 0 { // Don't accept negative FDs. return nil, unix.EINVAL } @@ -332,31 +348,47 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes if lim.Cur != limits.Infinity { end = int32(lim.Cur) } - if fd >= end { + if minFD >= end { return nil, unix.EMFILE } } f.mu.Lock() - // From f.next to find available fd. - if fd < f.next { - fd = f.next + // max is used as the largest number in fdBitmap + 1. + max := int32(0) + + if !f.fdBitmap.IsEmpty() { + max = int32(f.fdBitmap.Maximum()) + max++ } - // Install all entries. - for i := fd; i < end && len(fds) < len(files); i++ { - if d, _, _ := f.getVFS2(i); d == nil { - // Set the descriptor. - f.setVFS2(ctx, i, files[len(fds)], flags) - fds = append(fds, i) // Record the file descriptor. - } + // Adjust max in case it is less than minFD. + if max < minFD { + max = minFD } + for len(fds) < len(files) { + // Try to use free bit in fdBitmap. + // If all bits in fdBitmap are used, expand fd to the max. + fd := f.fdBitmap.FirstZero(uint32(minFD)) + if fd == math.MaxInt32 { + fd = uint32(max) + max++ + } + if fd >= uint32(end) { + break + } + f.fdBitmap.Add(fd) + f.setVFS2(ctx, int32(fd), files[len(fds)], flags) + fds = append(fds, int32(fd)) + minFD = int32(fd) + } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { f.setVFS2(ctx, i, nil, FDFlags{}) + f.fdBitmap.Remove(uint32(i)) } f.mu.Unlock() @@ -370,57 +402,19 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes return nil, unix.EMFILE } - if fd == f.next { - // Update next search start position. - f.next = fds[len(fds)-1] + 1 - } - f.mu.Unlock() return fds, nil } -// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// NewFDVFS2 allocates a file descriptor greater than or equal to minFD for // the given file description. If it succeeds, it takes a reference on file. -func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { - if minfd < 0 { - // Don't accept negative FDs. - return -1, unix.EINVAL - } - - // Default limit. - end := int32(math.MaxInt32) - - // Ensure we don't get past the provided limit. - if limitSet := limits.FromContext(ctx); limitSet != nil { - lim := limitSet.Get(limits.NumberOfFiles) - if lim.Cur != limits.Infinity { - end = int32(lim.Cur) - } - if minfd >= end { - return -1, unix.EMFILE - } - } - - f.mu.Lock() - defer f.mu.Unlock() - - // From f.next to find available fd. - fd := minfd - if fd < f.next { - fd = f.next - } - for fd < end { - if d, _, _ := f.getVFS2(fd); d == nil { - f.setVFS2(ctx, fd, file, flags) - if fd == f.next { - // Update next search start position. - f.next = fd + 1 - } - return fd, nil - } - fd++ +func (f *FDTable) NewFDVFS2(ctx context.Context, minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + files := []*vfs.FileDescription{file} + fileSlice, error := f.NewFDsVFS2(ctx, minFD, files, flags) + if error != nil { + return -1, error } - return -1, unix.EMFILE + return fileSlice[0], nil } // NewFDAt sets the file reference for the given FD. If there is an active @@ -469,6 +463,11 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 defer f.mu.Unlock() df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags) + // Add fd to fdBitmap. + if file != nil || fileVFS2 != nil { + f.fdBitmap.Add(uint32(fd)) + } + return df, dfVFS2, nil } @@ -573,7 +572,9 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. func (f *FDTable) GetFDs(ctx context.Context) []int32 { - fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) + f.mu.Lock() + defer f.mu.Unlock() + fds := make([]int32, 0, int(f.fdBitmap.GetNumOnes())) f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) @@ -583,13 +584,15 @@ func (f *FDTable) GetFDs(ctx context.Context) []int32 { // Fork returns an independent FDTable. func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - + f.mu.Lock() + defer f.mu.Unlock() f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil { panic("VFS1 or VFS2 files set") } + clone.fdBitmap.Add(uint32(fd)) }) return clone } @@ -604,11 +607,6 @@ func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDesc f.mu.Lock() - // Update current available position. - if fd < f.next { - f.next = fd - } - orig, orig2, _, _ := f.getAll(fd) // Add reference for caller. @@ -621,6 +619,7 @@ func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDesc if orig != nil || orig2 != nil { orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry. + f.fdBitmap.Remove(uint32(fd)) } f.mu.Unlock() @@ -644,16 +643,13 @@ func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDes f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table. + f.fdBitmap.Remove(uint32(fd)) if df != nil { files = append(files, df) } if dfVFS2 != nil { filesVFS2 = append(filesVFS2, dfVFS2) } - // Update current available position. - if fd < f.next { - f.next = fd - } } }) f.mu.Unlock() diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index f17f9c59c..2b3e6ef71 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -15,9 +15,11 @@ package kernel import ( + "math" "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -44,6 +46,7 @@ func (f *FDTable) initNoLeakCheck() { func (f *FDTable) init() { f.initNoLeakCheck() f.InitRefs() + f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) } // get gets a file entry. @@ -162,14 +165,6 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 } } - // Adjust used. - switch { - case orig == nil && desc != nil: - atomic.AddInt32(&f.used, 1) - case orig != nil && desc == nil: - atomic.AddInt32(&f.used, -1) - } - if orig != nil { switch { case orig.file != nil: diff --git a/test/perf/BUILD b/test/perf/BUILD index 8ac63f11b..4299f08d5 100644 --- a/test/perf/BUILD +++ b/test/perf/BUILD @@ -70,6 +70,13 @@ syscall_test( ) syscall_test( + size = "large", + add_overlay = True, + debug = False, + test = "//test/perf/linux:dup_benchmark", +) + +syscall_test( debug = False, test = "//test/perf/linux:pipe_benchmark", ) diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD index 50dd8e808..8a3216550 100644 --- a/test/perf/linux/BUILD +++ b/test/perf/linux/BUILD @@ -109,6 +109,22 @@ cc_binary( ) cc_binary( + name = "dup_benchmark", + testonly = 1, + srcs = [ + "dup_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + ], +) + +cc_binary( name = "read_benchmark", testonly = 1, srcs = [ diff --git a/test/perf/linux/dup_benchmark.cc b/test/perf/linux/dup_benchmark.cc new file mode 100644 index 000000000..5d808d225 --- /dev/null +++ b/test/perf/linux/dup_benchmark.cc @@ -0,0 +1,55 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" +#include "benchmark/benchmark.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/temp_path.h" + +namespace gvisor { + +namespace { + +void BM_Dup(benchmark::State& state) { + const int size = state.range(0); + + for (auto _ : state) { + std::vector<int> v; + for (int i = 0; i < size; i++) { + int fd = dup(2); + TEST_CHECK(fd != -1); + v.push_back(fd); + } + for (int i = 0; i < size; i++) { + int fd = v[i]; + close(fd); + } + } + state.SetItemsProcessed(state.iterations() * size); +} + +BENCHMARK(BM_Dup)->Range(1, 1 << 15)->UseRealTime(); + +} // namespace + +} // namespace gvisor diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 188c052b3..3383495d0 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -560,6 +560,7 @@ cc_binary( deps = [ "//test/util:eventfd_util", "//test/util:file_descriptor", + "@com_google_absl//absl/memory", gtest, "//test/util:fs_util", "//test/util:posix_error", diff --git a/test/syscalls/linux/dup.cc b/test/syscalls/linux/dup.cc index ba4e13fb9..8f0974f45 100644 --- a/test/syscalls/linux/dup.cc +++ b/test/syscalls/linux/dup.cc @@ -13,9 +13,11 @@ // limitations under the License. #include <fcntl.h> +#include <sys/resource.h> #include <unistd.h> #include "gtest/gtest.h" +#include "absl/memory/memory.h" #include "test/util/eventfd_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" @@ -98,6 +100,45 @@ TEST(DupTest, Dup2) { ASSERT_NO_ERRNO(CheckSameFile(fd, nfd2)); } +TEST(DupTest, Rlimit) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + struct rlimit rl = {}; + EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + ASSERT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + constexpr int kFDLimit = 101; + // Create a file descriptor that will be above the limit. + FileDescriptor aboveLimitFD = + ASSERT_NO_ERRNO_AND_VALUE(Dup2(fd, kFDLimit * 2 - 1)); + + rl.rlim_cur = kFDLimit; + ASSERT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + ASSERT_THAT(dup3(fd.get(), kFDLimit, 0), SyscallFails()); + + std::vector<std::unique_ptr<FileDescriptor>> fds; + int prev_fd = fd.get(); + int used_fds = 0; + for (int i = 0; i < kFDLimit; ++i) { + int new_fd = dup(fd.get()); + if (new_fd == -1) { + break; + } + auto f = absl::make_unique<FileDescriptor>(new_fd); + EXPECT_LT(new_fd, kFDLimit); + EXPECT_GT(new_fd, prev_fd); + // Check that all fds in (prev_fd, new_fd) are used. + for (int j = prev_fd + 1; j < new_fd; ++j) { + if (fcntl(j, F_GETFD) != -1) used_fds++; + } + prev_fd = new_fd; + fds.push_back(std::move(f)); + } + EXPECT_EQ(fds.size() + used_fds, kFDLimit - fd.get() - 1); +} + TEST(DupTest, Dup2SameFD) { auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); |