diff options
59 files changed, 1506 insertions, 283 deletions
@@ -145,6 +145,12 @@ go_repository( ) go_repository( + name = "org_golang_x_time", + commit = "9d24e82272b4f38b78bc8cff74fa936d31ccd8ef", + importpath = "golang.org/x/time", +) + +go_repository( name = "org_golang_x_tools", commit = "aa82965741a9fecd12b026fbb3d3c6ed3231b8f8", importpath = "golang.org/x/tools", diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 4c336ea84..9961baaa9 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") package(licenses = ["notice"]) @@ -7,6 +7,7 @@ go_library( name = "eventchannel", srcs = [ "event.go", + "rate.go", ], importpath = "gvisor.dev/gvisor/pkg/eventchannel", visibility = ["//:sandbox"], @@ -16,6 +17,7 @@ go_library( "//pkg/unet", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_golang_protobuf//ptypes:go_default_library_gen", + "@org_golang_x_time//rate:go_default_library", ], ) @@ -30,3 +32,12 @@ go_proto_library( proto = ":eventchannel_proto", visibility = ["//:sandbox"], ) + +go_test( + name = "eventchannel_test", + srcs = ["event_test.go"], + embed = [":eventchannel"], + deps = [ + "@com_github_golang_protobuf//proto:go_default_library", + ], +) diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index f6d26532b..d37ad0428 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -43,18 +43,36 @@ type Emitter interface { Close() error } -var ( - mu sync.Mutex - emitters = make(map[Emitter]struct{}) -) +// DefaultEmitter is the default emitter. Calls to Emit and AddEmitter are sent +// to this Emitter. +var DefaultEmitter = &multiEmitter{} -// Emit emits a message using all added emitters. +// Emit is a helper method that calls DefaultEmitter.Emit. 
func Emit(msg proto.Message) error { - mu.Lock() - defer mu.Unlock() + _, err := DefaultEmitter.Emit(msg) + return err +} + +// AddEmitter is a helper method that calls DefaultEmitter.AddEmitter. +func AddEmitter(e Emitter) { + DefaultEmitter.AddEmitter(e) +} + +// multiEmitter is an Emitter that forwards messages to multiple Emitters. +type multiEmitter struct { + // mu protects emitters. + mu sync.Mutex + // emitters is initialized lazily in AddEmitter. + emitters map[Emitter]struct{} +} + +// Emit emits a message using all added emitters. +func (me *multiEmitter) Emit(msg proto.Message) (bool, error) { + me.mu.Lock() + defer me.mu.Unlock() var err error - for e := range emitters { + for e := range me.emitters { hangup, eerr := e.Emit(msg) if eerr != nil { if err == nil { @@ -68,18 +86,36 @@ func Emit(msg proto.Message) error { } if hangup { log.Infof("Hangup on eventchannel emitter %v.", e) - delete(emitters, e) + delete(me.emitters, e) } } - return err + return false, err } // AddEmitter adds a new emitter. -func AddEmitter(e Emitter) { - mu.Lock() - defer mu.Unlock() - emitters[e] = struct{}{} +func (me *multiEmitter) AddEmitter(e Emitter) { + me.mu.Lock() + defer me.mu.Unlock() + if me.emitters == nil { + me.emitters = make(map[Emitter]struct{}) + } + me.emitters[e] = struct{}{} +} + +// Close closes all emitters. If any Close call errors, it returns the first +// one encountered. +func (me *multiEmitter) Close() error { + me.mu.Lock() + defer me.mu.Unlock() + var err error + for e := range me.emitters { + if eerr := e.Close(); err == nil && eerr != nil { + err = eerr + } + delete(me.emitters, e) + } + return err } func marshal(msg proto.Message) ([]byte, error) { diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go new file mode 100644 index 000000000..3649097d6 --- /dev/null +++ b/pkg/eventchannel/event_test.go @@ -0,0 +1,146 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventchannel + +import ( + "fmt" + "sync" + "testing" + "time" + + "github.com/golang/protobuf/proto" +) + +// testEmitter is an emitter that can be used in tests. It records all events +// emitted, and whether it has been closed. +type testEmitter struct { + // mu protects all fields below. + mu sync.Mutex + + // events contains all emitted events. + events []proto.Message + + // closed records whether Close() was called. + closed bool +} + +// Emit implements Emitter.Emit. +func (te *testEmitter) Emit(msg proto.Message) (bool, error) { + te.mu.Lock() + defer te.mu.Unlock() + te.events = append(te.events, msg) + return false, nil +} + +// Close implements Emitter.Close. +func (te *testEmitter) Close() error { + te.mu.Lock() + defer te.mu.Unlock() + if te.closed { + return fmt.Errorf("closed called twice") + } + te.closed = true + return nil +} + +// testMessage implements proto.Message for testing. +type testMessage struct { + proto.Message + + // name is the name of the message, used by tests to compare messages. + name string +} + +func TestMultiEmitter(t *testing.T) { + // Create three testEmitters, tied together in a multiEmitter. + me := &multiEmitter{} + var emitters []*testEmitter + for i := 0; i < 3; i++ { + te := &testEmitter{} + emitters = append(emitters, te) + me.AddEmitter(te) + } + + // Emit three messages to multiEmitter. 
+ names := []string{"foo", "bar", "baz"} + for _, name := range names { + m := testMessage{name: name} + if _, err := me.Emit(m); err != nil { + t.Fatalf("me.Emit(%v) failed: %v", m, err) + } + } + + // All three emitters should have all three events. + for _, te := range emitters { + if got, want := len(te.events), len(names); got != want { + t.Fatalf("emitter got %d events, want %d", got, want) + } + for i, name := range names { + if got := te.events[i].(testMessage).name; got != name { + t.Errorf("emitter got message with name %q, want %q", got, name) + } + } + } + + // Close multiEmitter. + if err := me.Close(); err != nil { + t.Fatalf("me.Close() failed: %v", err) + } + + // All testEmitters should be closed. + for _, te := range emitters { + if !te.closed { + t.Errorf("te.closed got false, want true") + } + } +} + +func TestRateLimitedEmitter(t *testing.T) { + // Create a rateLimitedEmitter that wraps a testEmitter. + te := &testEmitter{} + max := float64(5) // events per second + burst := 10 // events + rle := RateLimitedEmitterFrom(te, max, burst) + + // Send 50 messages in one shot. + for i := 0; i < 50; i++ { + if _, err := rle.Emit(testMessage{}); err != nil { + t.Fatalf("rle.Emit failed: %v", err) + } + } + + // We should have received only 10 messages. + if got, want := len(te.events), 10; got != want { + t.Errorf("got %d events, want %d", got, want) + } + + // Sleep for a second and then send another 50. + time.Sleep(1 * time.Second) + for i := 0; i < 50; i++ { + if _, err := rle.Emit(testMessage{}); err != nil { + t.Fatalf("rle.Emit failed: %v", err) + } + } + + // We should have at least 5 more messages, plus maybe a few more if the + // test ran slowly. 
+ got, wantAtLeast, wantAtMost := len(te.events), 15, 20 + if got < wantAtLeast { + t.Errorf("got %d events, want at least %d", got, wantAtLeast) + } + if got > wantAtMost { + t.Errorf("got %d events, want at most %d", got, wantAtMost) + } +} diff --git a/pkg/eventchannel/rate.go b/pkg/eventchannel/rate.go new file mode 100644 index 000000000..179226c92 --- /dev/null +++ b/pkg/eventchannel/rate.go @@ -0,0 +1,54 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventchannel + +import ( + "github.com/golang/protobuf/proto" + "golang.org/x/time/rate" +) + +// rateLimitedEmitter wraps an emitter and limits events to the given limits. +// Events that would exceed the limit are discarded. +type rateLimitedEmitter struct { + inner Emitter + limiter *rate.Limiter +} + +// RateLimitedEmitterFrom creates a new event channel emitter that wraps the +// existing emitter and enforces rate limits. The limits are imposed via a +// token bucket, with `maxRate` events per second, with burst size of `burst` +// events. See the golang.org/x/time/rate package and +// https://en.wikipedia.org/wiki/Token_bucket for more information about token +// buckets generally. +func RateLimitedEmitterFrom(inner Emitter, maxRate float64, burst int) Emitter { + return &rateLimitedEmitter{ + inner: inner, + limiter: rate.NewLimiter(rate.Limit(maxRate), burst), + } +} + +// Emit implements EventEmitter.Emit. 
+func (rle *rateLimitedEmitter) Emit(msg proto.Message) (bool, error) { + if !rle.limiter.Allow() { + // Drop event. + return false, nil + } + return rle.inner.Emit(msg) +} + +// Close implements EventEmitter.Close. +func (rle *rateLimitedEmitter) Close() error { + return rle.inner.Close() +} diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index ab8857b5e..4225b04dd 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -35,8 +35,14 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { events: int16(mask.ToLinux()), } + ts := syscall.Timespec{ + Sec: 0, + Nsec: 0, + } + for { - n, _, err := syscall.RawSyscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&e)), 1, 0) + n, _, err := syscall.RawSyscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(&e)), 1, + uintptr(unsafe.Pointer(&ts)), 0, 0, 0) // Interrupted by signal, try again. if err == syscall.EINTR { continue diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index 62ae1fd9f..48413f1fb 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -70,7 +70,7 @@ func main() { syscall.SYS_NANOSLEEP: {}, syscall.SYS_NEWFSTATAT: {}, syscall.SYS_OPEN: {}, - syscall.SYS_POLL: {}, + syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, syscall.SYS_PSELECT6: {}, syscall.SYS_PWRITE64: {}, diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD index 2c15875f5..8158aa522 100644 --- a/pkg/sentry/fs/ext/BUILD +++ b/pkg/sentry/fs/ext/BUILD @@ -1,14 +1,35 @@ package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "ext", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) go_library( name = "ext", srcs = [ + "block_map_file.go", 
"dentry.go", + "dentry_list.go", + "directory.go", "ext.go", + "extent_file.go", "filesystem.go", + "inline_file.go", "inode.go", + "named_pipe.go", + "regular_file.go", + "symlink.go", "utils.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext", @@ -16,9 +37,14 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/fd", "//pkg/sentry/context", + "//pkg/sentry/fs", "//pkg/sentry/fs/ext/disklayout", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/safemem", + "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", ], diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go new file mode 100644 index 000000000..9aabbd145 --- /dev/null +++ b/pkg/sentry/fs/ext/block_map_file.go @@ -0,0 +1,65 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" + "sync" + + "gvisor.dev/gvisor/pkg/binary" +) + +// blockMapFile is a type of regular file which uses direct/indirect block +// addressing to store file data. This was deprecated in ext4. +type blockMapFile struct { + regFile regularFile + + // mu serializes changes to fileToPhysBlks. + mu sync.RWMutex + + // fileToPhysBlks maps the file block numbers to the physical block numbers. + // the physical block number for the (i)th file block is stored in the (i)th + // index. This is initialized (at max) with the first 12 entries. 
The rest + // have to be read in from disk when required. Protected by mu. + fileToPhysBlks []uint32 +} + +// Compiles only if blockMapFile implements fileReader. +var _ fileReader = (*blockMapFile)(nil) + +// Read implements fileReader.getFileReader. +func (f *blockMapFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader { + panic("unimplemented") +} + +// newBlockMapFile is the blockMapFile constructor. It initializes the file to +// physical blocks map with (at most) the first 12 (direct) blocks. +func newBlockMapFile(blkSize uint64, regFile regularFile) (*blockMapFile, error) { + file := &blockMapFile{regFile: regFile} + file.regFile.impl = file + + toFill := uint64(12) + blksUsed := regFile.blksUsed(blkSize) + if blksUsed < toFill { + toFill = blksUsed + } + + blkMap := regFile.inode.diskInode.Data() + file.fileToPhysBlks = make([]uint32, toFill) + for i := uint64(0); i < toFill; i++ { + binary.Unmarshal(blkMap[i*4:(i+1)*4], binary.LittleEndian, &file.fileToPhysBlks[i]) + } + return file, nil +} diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go index 054fb42b6..19c9b3b2d 100644 --- a/pkg/sentry/fs/ext/dentry.go +++ b/pkg/sentry/fs/ext/dentry.go @@ -26,6 +26,8 @@ type dentry struct { // share a single non-directory Inode (with hard links). inode is // immutable. inode *inode + // dentryEntry links Dentries into their parent directory.childList. + dentryEntry } // Compiles only if dentry implements vfs.DentryImpl. diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go new file mode 100644 index 000000000..ab2b59e44 --- /dev/null +++ b/pkg/sentry/fs/ext/directory.go @@ -0,0 +1,36 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +// directory represents a directory inode. It holds the childList in memory. +type directory struct { + inode inode + + // childList is a list containing (1) child Dentries and (2) fake Dentries + // (with inode == nil) that represent the iteration position of + // directoryFDs. childList is used to support directoryFD.IterDirents() + // efficiently. childList is immutable. + childList dentryList + + // TODO(b/134676337): Add directory navigators. +} + +// newDirectroy is the directory constructor. +func newDirectroy(inode inode) *directory { + // TODO(b/134676337): initialize childList. + file := &directory{inode: inode} + file.inode.impl = file + return file +} diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index 10e235fb1..d303dd122 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -19,9 +19,9 @@ import ( "errors" "fmt" "io" - "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -35,11 +35,11 @@ type filesystemType struct{} // Compiles only if filesystemType implements vfs.FilesystemType. var _ vfs.FilesystemType = (*filesystemType)(nil) -// getDeviceFd returns the read seeker to the underlying device. +// getDeviceFd returns an io.ReaderAt to the underlying device. // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. // 2. 
Expose the device to the container and mount it from application layer. -func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) { +func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) { if opts.InternalData == nil { // User mount call. // TODO(b/134676337): Open the device specified by `source` and return that. @@ -47,20 +47,19 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, e } // NewFilesystem call originated from within the sentry. - fd, ok := opts.InternalData.(uintptr) + devFd, ok := opts.InternalData.(int) if !ok { - return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device") + return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") } - // We do not close this file because that would close the underlying device - // file descriptor (which is required for reading the fs from disk). - // TODO(b/134676337): Use pkg/fd instead. - deviceFile := os.NewFile(fd, source) - if deviceFile == nil { - return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd) + if devFd < 0 { + return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd) } - return deviceFile, nil + // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership + // of the file descriptor and hence will not close it when it is garbage + // collected. + return fd.NewReadWriter(devFd), nil } // NewFilesystem implements vfs.FilesystemType.NewFilesystem. 
@@ -88,7 +87,7 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred return nil, nil, err } - rootInode, err := fs.getOrCreateInode(disklayout.RootDirInode) + rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode) if err != nil { return nil, nil, err } diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go index ee7f7907c..18764e92a 100644 --- a/pkg/sentry/fs/ext/ext_test.go +++ b/pkg/sentry/fs/ext/ext_test.go @@ -69,7 +69,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v // Mount the ext4 fs and retrieve the inode structure for the file. mockCtx := contexttest.Context(t) - fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()}) + fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go new file mode 100644 index 000000000..aa4102dbb --- /dev/null +++ b/pkg/sentry/fs/ext/extent_file.go @@ -0,0 +1,260 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ext + +import ( + "io" + "sort" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// extentFile is a type of regular file which uses extents to store file data. +type extentFile struct { + regFile regularFile + + // root is the root extent node. This lives in the 60 byte diskInode.Data(). + // Immutable. + root disklayout.ExtentNode +} + +// Compiles only if extentFile implements fileReader. +var _ fileReader = (*extentFile)(nil) + +// Read implements fileReader.getFileReader. +func (f *extentFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader { + return &extentReader{ + dev: dev, + file: f, + fileOff: offset, + blkSize: blkSize, + } +} + +// newExtentFile is the extent file constructor. It reads the entire extent +// tree into memory. +// TODO(b/134676337): Build extent tree on demand to reduce memory usage. +func newExtentFile(dev io.ReaderAt, blkSize uint64, regFile regularFile) (*extentFile, error) { + file := &extentFile{regFile: regFile} + file.regFile.impl = file + err := file.buildExtTree(dev, blkSize) + if err != nil { + return nil, err + } + return file, nil +} + +// buildExtTree builds the extent tree by reading it from disk by doing +// running a simple DFS. It first reads the root node from the inode struct in +// memory. Then it recursively builds the rest of the tree by reading it off +// disk. +// +// Precondition: inode flag InExtents must be set. +func (f *extentFile) buildExtTree(dev io.ReaderAt, blkSize uint64) error { + rootNodeData := f.regFile.inode.diskInode.Data() + + binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header) + + // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. + if f.root.Header.NumEntries > 4 { + // read(2) specifies that EINVAL should be returned if the file is unsuitable + // for reading. 
+ return syserror.EINVAL + } + + f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries) + for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { + var curEntry disklayout.ExtentEntry + if f.root.Header.Height == 0 { + // Leaf node. + curEntry = &disklayout.Extent{} + } else { + // Internal node. + curEntry = &disklayout.ExtentIdx{} + } + binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry) + f.root.Entries[i].Entry = curEntry + } + + // If this node is internal, perform DFS. + if f.root.Header.Height > 0 { + for i := uint16(0); i < f.root.Header.NumEntries; i++ { + var err error + if f.root.Entries[i].Node, err = buildExtTreeFromDisk(dev, f.root.Entries[i].Entry, blkSize); err != nil { + return err + } + } + } + + return nil +} + +// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively +// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to +// by the ExtentEntry. +func buildExtTreeFromDisk(dev io.ReaderAt, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) { + var header disklayout.ExtentHeader + off := entry.PhysicalBlock() * blkSize + err := readFromDisk(dev, int64(off), &header) + if err != nil { + return nil, err + } + + entries := make([]disklayout.ExtentEntryPair, header.NumEntries) + for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { + var curEntry disklayout.ExtentEntry + if header.Height == 0 { + // Leaf node. + curEntry = &disklayout.Extent{} + } else { + // Internal node. + curEntry = &disklayout.ExtentIdx{} + } + + err := readFromDisk(dev, int64(off), curEntry) + if err != nil { + return nil, err + } + entries[i].Entry = curEntry + } + + // If this node is internal, perform DFS. 
+ if header.Height > 0 { + for i := uint16(0); i < header.NumEntries; i++ { + var err error + entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize) + if err != nil { + return nil, err + } + } + } + + return &disklayout.ExtentNode{Header: header, Entries: entries}, nil +} + +// extentReader implements io.Reader which can traverse the extent tree and +// read file data. This is not thread safe. +type extentReader struct { + dev io.ReaderAt + file *extentFile + fileOff uint64 // Represents the current file offset being read from. + blkSize uint64 +} + +// Compiles only if extentReader implements io.Reader. +var _ io.Reader = (*extentReader)(nil) + +// Read implements io.Reader.Read. +func (r *extentReader) Read(dst []byte) (int, error) { + if len(dst) == 0 { + return 0, nil + } + + if r.fileOff >= r.file.regFile.inode.diskInode.Size() { + return 0, io.EOF + } + + return r.read(&r.file.root, dst) +} + +// read is a helper which traverses the extent tree and reads data. +func (r *extentReader) read(node *disklayout.ExtentNode, dst []byte) (int, error) { + // Perform a binary search for the node covering bytes starting at r.fileOff. + // A highly fragmented filesystem can have up to 340 entries and so linear + // search should be avoided. Finds the first entry which does not cover the + // file block we want and subtracts 1 to get the desired index. + fileBlk := r.fileBlock() + n := len(node.Entries) + found := sort.Search(n, func(i int) bool { + return node.Entries[i].Entry.FileBlock() > fileBlk + }) - 1 + + // We should be in this recursive step only if the data we want exists under + // the current node. 
+ if found < 0 { + panic("searching for a file block in an extent entry which does not cover it") + } + + read := 0 + toRead := len(dst) + var curR int + var err error + for i := found; i < n && read < toRead; i++ { + if node.Header.Height == 0 { + curR, err = r.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), dst[read:]) + } else { + curR, err = r.read(node.Entries[i].Node, dst[read:]) + } + + read += curR + if err != nil { + return read, err + } + } + + return read, nil +} + +// readFromExtent reads file data from the extent. It takes advantage of the +// sequential nature of extents and reads file data from multiple blocks in one +// call. Also updates the file offset. +// +// A non-nil error indicates that this is a partial read and there is probably +// more to read from this extent. The caller should propagate the error upward +// and not move to the next extent in the tree. +// +// A subsequent call to extentReader.Read should continue reading from where we +// left off as expected. +func (r *extentReader) readFromExtent(ex *disklayout.Extent, dst []byte) (int, error) { + curFileBlk := r.fileBlock() + exFirstFileBlk := ex.FileBlock() + exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive. + + // We should be in this recursive step only if the data we want exists under + // the current extent. + if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk { + panic("searching for a file block in an extent which does not cover it") + } + + curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock() + readStart := curPhyBlk*r.blkSize + r.fileBlockOff() + + endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length) + extentEnd := endPhyBlk * r.blkSize // This is exclusive. 
+ + toRead := int(extentEnd - readStart) + if len(dst) < toRead { + toRead = len(dst) + } + + n, _ := r.dev.ReadAt(dst[:toRead], int64(readStart)) + r.fileOff += uint64(n) + if n < toRead { + return n, syserror.EIO + } + return n, nil +} + +// fileBlock returns the file block number we are currently reading. +func (r *extentReader) fileBlock() uint32 { + return uint32(r.fileOff / r.blkSize) +} + +// fileBlockOff returns the current offset within the current file block. +func (r *extentReader) fileBlockOff() uint64 { + return r.fileOff % r.blkSize +} diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go index b3f342c8e..dff401114 100644 --- a/pkg/sentry/fs/ext/extent_test.go +++ b/pkg/sentry/fs/ext/extent_test.go @@ -16,6 +16,8 @@ package ext import ( "bytes" + "io" + "math/rand" "testing" "github.com/google/go-cmp/cmp" @@ -24,9 +26,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" ) -// TestExtentTree tests the extent tree building logic. +const ( + // mockExtentBlkSize is the mock block size used for testing. + // No block has more than 1 header + 4 entries. + mockExtentBlkSize = uint64(64) +) + +// The tree described below looks like: // -// Test tree: // 0.{Head}[Idx][Idx] // / \ // / \ @@ -44,12 +51,8 @@ import ( // // Please note that ext4 might not construct extent trees looking like this. // This is purely for testing the tree traversal logic. -func TestExtentTree(t *testing.T) { - blkSize := uint64(64) // No block has more than 1 header + 4 entries. 
- mockDisk := make([]byte, blkSize*10) - mockInode := &inode{diskInode: &disklayout.InodeNew{}} - - node3 := &disklayout.ExtentNode{ +var ( + node3 = &disklayout.ExtentNode{ Header: disklayout.ExtentHeader{ Magic: disklayout.ExtentMagic, NumEntries: 1, @@ -68,7 +71,7 @@ func TestExtentTree(t *testing.T) { }, } - node2 := &disklayout.ExtentNode{ + node2 = &disklayout.ExtentNode{ Header: disklayout.ExtentHeader{ Magic: disklayout.ExtentMagic, NumEntries: 1, @@ -86,7 +89,7 @@ func TestExtentTree(t *testing.T) { }, } - node1 := &disklayout.ExtentNode{ + node1 = &disklayout.ExtentNode{ Header: disklayout.ExtentHeader{ Magic: disklayout.ExtentMagic, NumEntries: 2, @@ -113,7 +116,7 @@ func TestExtentTree(t *testing.T) { }, } - node0 := &disklayout.ExtentNode{ + node0 = &disklayout.ExtentNode{ Header: disklayout.ExtentHeader{ Magic: disklayout.ExtentMagic, NumEntries: 2, @@ -137,22 +140,95 @@ func TestExtentTree(t *testing.T) { }, }, } +) - writeTree(mockInode, mockDisk, node0, blkSize) +// TestExtentReader tests extentReader functionality. We should be able to use +// the file reader like any other io.Reader. 
+func TestExtentReader(t *testing.T) { + type extentReaderTest struct { + name string + from func(uint64) uint64 + to func(uint64) uint64 + } - r := bytes.NewReader(mockDisk) - if err := mockInode.buildExtTree(r, blkSize); err != nil { - t.Fatalf("inode.buildExtTree failed: %v", err) + tests := []extentReaderTest{ + { + name: "read first half", + from: beginning, + to: middle, + }, + { + name: "read entire file", + from: beginning, + to: end, + }, + { + name: "read second half", + from: middle, + to: end, + }, } + dev, mockExtentFile, want := extentTreeSetUp(t, node0) + size := mockExtentFile.regFile.inode.diskInode.Size() + + for _, test := range tests { + from := test.from(size) + to := test.to(size) + fileReader := mockExtentFile.getFileReader(dev, mockExtentBlkSize, from) + + got := make([]byte, to-from) + if _, err := io.ReadFull(fileReader, got); err != nil { + t.Errorf("file read failed: %v", err) + } + + if diff := cmp.Diff(want[from:to], got); diff != "" { + t.Errorf("file data mismatch (-want +got):\n%s", diff) + } + } +} + +// TestBuildExtentTree tests the extent tree building logic. +func TestBuildExtentTree(t *testing.T) { + _, mockExtentFile, _ := extentTreeSetUp(t, node0) + opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{}) - if diff := cmp.Diff(mockInode.root, node0, opt); diff != "" { + if diff := cmp.Diff(node0, &mockExtentFile.root, opt); diff != "" { t.Errorf("extent tree mismatch (-want +got):\n%s", diff) } } -// writeTree writes the tree represented by `root` to the inode and disk passed. -func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint64) { +// extentTreeSetUp writes the passed extent tree to a mock disk as an extent +// tree. It also constructs a mock extent file with the same tree built in it. +// It also writes random file data and returns it. 
+func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReaderAt, *extentFile, []byte) { + t.Helper() + + mockDisk := make([]byte, mockExtentBlkSize*10) + mockExtentFile := &extentFile{ + regFile: regularFile{ + inode: inode{ + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), + }, + }, + }, + }, + } + + fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize) + + r := bytes.NewReader(mockDisk) + if err := mockExtentFile.buildExtTree(r, mockExtentBlkSize); err != nil { + t.Fatalf("inode.buildExtTree failed: %v", err) + } + return r, mockExtentFile, fileData +} + +// writeTree writes the tree represented by `root` to the inode and disk. It +// also writes random file data on disk. +func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) for _, ep := range root.Entries { rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) @@ -160,26 +236,57 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint copy(in.diskInode.Data(), rootData) - if root.Header.Height > 0 { - for _, ep := range root.Entries { - writeTreeToDisk(disk, ep, blkSize) + var fileData []byte + for _, ep := range root.Entries { + if root.Header.Height == 0 { + fileData = append(fileData, writeRandomFileData(disk, ep.Entry.(*disklayout.Extent))...) + } else { + fileData = append(fileData, writeTreeToDisk(disk, ep)...) } } + return fileData } // writeTreeToDisk is the recursive step for writeTree which writes the tree -// on the disk only. -func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair, blkSize uint64) { +// on the disk only. Also writes random file data on disk. 
+func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) for _, ep := range curNode.Node.Entries { nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) } - copy(disk[curNode.Entry.PhysicalBlock()*blkSize:], nodeData) + copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData) + + var fileData []byte + for _, ep := range curNode.Node.Entries { + if curNode.Node.Header.Height == 0 { + fileData = append(fileData, writeRandomFileData(disk, ep.Entry.(*disklayout.Extent))...) + } else { + fileData = append(fileData, writeTreeToDisk(disk, ep)...) + } + } + return fileData +} + +// writeRandomFileData writes random bytes to the blocks on disk that the +// passed extent points to. +func writeRandomFileData(disk []byte, ex *disklayout.Extent) []byte { + phyExStartBlk := ex.PhysicalBlock() + phyExStartOff := phyExStartBlk * mockExtentBlkSize + phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize + rand.Read(disk[phyExStartOff:phyExEndOff]) + return disk[phyExStartOff:phyExEndOff] +} - if curNode.Node.Header.Height > 0 { - for _, ep := range curNode.Node.Entries { - writeTreeToDisk(disk, ep, blkSize) +// getNumPhyBlks returns the number of physical blocks covered under the node. +func getNumPhyBlks(node *disklayout.ExtentNode) uint32 { + var res uint32 + for _, ep := range node.Entries { + if node.Header.Height == 0 { + res += uint32(ep.Entry.(*disklayout.Extent).Length) + } else { + res += getNumPhyBlks(ep.Node) } } + return res } diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go index 7150e75a5..12aeb5dac 100644 --- a/pkg/sentry/fs/ext/filesystem.go +++ b/pkg/sentry/fs/ext/filesystem.go @@ -31,22 +31,16 @@ type filesystem struct { vfsfs vfs.Filesystem - // mu serializes changes to the Dentry tree and the usage of the read seeker. - mu sync.Mutex + // mu serializes changes to the Dentry tree. 
+ mu sync.RWMutex - // dev is the ReadSeeker for the underlying fs device. It is protected by mu. - // - // The ext filesystems aim to maximize locality, i.e. place all the data - // blocks of a file close together. On a spinning disk, locality reduces the - // amount of movement of the head hence speeding up IO operations. On an SSD - // there are no moving parts but locality increases the size of each transer - // request. Hence, having mutual exclusion on the read seeker while reading a - // file *should* help in achieving the intended performance gains. - // - // Note: This synchronization was not coupled with the ReadSeeker itself - // because we want to synchronize across read/seek operations for the - // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. - dev io.ReadSeeker + // dev is the io.ReaderAt for the underlying fs device. It does not require + // protection because io.ReaderAt permits concurrent read calls to it. It + // translates to the pread syscall which passes on the read request directly + // to the device driver. Device drivers are intelligent in serving multiple + // concurrent read requests in the optimal order (taking locality into + // consideration). + dev io.ReaderAt // inodeCache maps absolute inode numbers to the corresponding Inode struct. // Inodes should be removed from this once their reference count hits 0. @@ -69,13 +63,13 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // getOrCreateInode gets the inode corresponding to the inode number passed in. // It creates a new one with the given inode number if one does not exist. // -// Preconditions: must be holding fs.mu. -func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) { +// Precondition: must be holding fs.mu. 
+func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) { if in, ok := fs.inodeCache[inodeNum]; ok { return in, nil } - in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum) + in, err := newInode(ctx, fs.dev, fs.sb, fs.bgs, inodeNum) if err != nil { return nil, err } diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go new file mode 100644 index 000000000..b9adfe548 --- /dev/null +++ b/pkg/sentry/fs/ext/inline_file.go @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" +) + +// inlineFile is a type of regular file. All the data here is stored in the +// inode.Data() array. +type inlineFile struct { + regFile regularFile +} + +// Compiles only if inlineFile implements fileReader. +var _ fileReader = (*inlineFile)(nil) + +// getFileReader implements fileReader.getFileReader. +func (f *inlineFile) getFileReader(_ io.ReaderAt, _ uint64, offset uint64) io.Reader { + diskInode := f.regFile.inode.diskInode + return &inlineReader{offset: offset, data: diskInode.Data()[:diskInode.Size()]} +} + +// newInlineFile is the inlineFile constructor. +func newInlineFile(regFile regularFile) *inlineFile { + file := &inlineFile{regFile: regFile} + file.regFile.impl = file + return file +} + +// inlineReader implements io.Reader which can read the underlying data. This +// is not thread safe. 
+type inlineReader struct { + offset uint64 + data []byte +} + +// Compiles only if inlineReader implements io.Reader. +var _ io.Reader = (*inlineReader)(nil) + +// Read implements io.Reader.Read. +func (r *inlineReader) Read(dst []byte) (int, error) { + if len(dst) == 0 { + return 0, nil + } + + if int(r.offset) >= len(r.data) { + return 0, io.EOF + } + + n := copy(dst, r.data[r.offset:]) + r.offset += uint64(n) + return n, nil +} diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go index df1ea0bda..00e022953 100644 --- a/pkg/sentry/fs/ext/inode.go +++ b/pkg/sentry/fs/ext/inode.go @@ -18,12 +18,26 @@ import ( "io" "sync/atomic" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // inode represents an ext inode. +// +// inode uses the same inheritance pattern that pkg/sentry/vfs structures use. +// This has been done to increase memory locality. +// +// Implementations: +// inode -- +// |-- pipe +// |-- dir +// |-- symlink +// |-- regular-- +// |-- extent file +// |-- block map file +// |-- inline file type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 @@ -35,9 +49,9 @@ type inode struct { // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode - // root is the root extent node. This lives in the 60 byte diskInode.Blocks(). - // Immutable. Nil if the inode does not use extents. - root *disklayout.ExtentNode + // This is immutable. The first field of the implementations must have inode + // as the first field to ensure temporality. + impl interface{} } // incRef increments the inode ref count. @@ -61,7 +75,7 @@ func (in *inode) tryIncRef() bool { // decRef decrements the inode ref count and releases the inode resources if // the ref count hits 0. // -// Preconditions: Must have locked fs.mu. 
+// Precondition: Must have locked fs.mu. func (in *inode) decRef(fs *filesystem) { if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { delete(fs.inodeCache, in.inodeNum) @@ -72,19 +86,17 @@ func (in *inode) decRef(fs *filesystem) { // newInode is the inode constructor. Reads the inode off disk. Identifies // inodes based on the absolute inode number on disk. -// -// Preconditions: Must hold the mutex of the filesystem containing dev. -func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) { +func newInode(ctx context.Context, dev io.ReaderAt, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) { if inodeNum == 0 { panic("inode number 0 on ext filesystems is not possible") } - in := &inode{refs: 1, inodeNum: inodeNum} inodeRecordSize := sb.InodeSize() + var diskInode disklayout.Inode if inodeRecordSize == disklayout.OldInodeSize { - in.diskInode = &disklayout.InodeOld{} + diskInode = &disklayout.InodeOld{} } else { - in.diskInode = &disklayout.InodeNew{} + diskInode = &disklayout.InodeNew{} } // Calculate where the inode is actually placed. @@ -93,16 +105,38 @@ func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.Bloc inodeTableOff := bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp)) - // Read it from disk and figure out which type of inode this is. - if err := readFromDisk(dev, int64(inodeOff), in.diskInode); err != nil { + if err := readFromDisk(dev, int64(inodeOff), diskInode); err != nil { return nil, err } - if in.diskInode.Flags().Extents { - in.buildExtTree(dev, blkSize) + // Build the inode based on its type. 
+ inode := inode{ + refs: 1, + inodeNum: inodeNum, + diskInode: diskInode, } - return in, nil + switch diskInode.Mode().FileType() { + case linux.ModeSymlink: + f, err := newSymlink(dev, blkSize, inode) + if err != nil { + return nil, err + } + return &f.inode, nil + case linux.ModeRegular: + f, err := newRegularFile(dev, blkSize, inode) + if err != nil { + return nil, err + } + return &f.inode, nil + case linux.ModeDirectory: + return &newDirectroy(inode).inode, nil + case linux.ModeNamedPipe: + return &newNamedPipe(ctx, inode).inode, nil + default: + // TODO(b/134676337): Return appropriate errors for sockets and devices. + return nil, syserror.EINVAL + } } // getBGNum returns the block group number that a given inode belongs to. @@ -115,95 +149,3 @@ func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 { return (inodeNum - 1) % inodesPerGrp } - -// buildExtTree builds the extent tree by reading it from disk by doing -// running a simple DFS. It first reads the root node from the inode struct in -// memory. Then it recursively builds the rest of the tree by reading it off -// disk. -// -// Preconditions: -// - Must hold the mutex of the filesystem containing dev. -// - Inode flag InExtents must be set. -func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { - rootNodeData := in.diskInode.Data() - - var rootHeader disklayout.ExtentHeader - binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &rootHeader) - - // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. - if rootHeader.NumEntries > 4 { - // read(2) specifies that EINVAL should be returned if the file is unsuitable - // for reading. 
- return syserror.EINVAL - } - - rootEntries := make([]disklayout.ExtentEntryPair, rootHeader.NumEntries) - for i, off := uint16(0), disklayout.ExtentStructsSize; i < rootHeader.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { - var curEntry disklayout.ExtentEntry - if rootHeader.Height == 0 { - // Leaf node. - curEntry = &disklayout.Extent{} - } else { - // Internal node. - curEntry = &disklayout.ExtentIdx{} - } - binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry) - rootEntries[i].Entry = curEntry - } - - // If this node is internal, perform DFS. - if rootHeader.Height > 0 { - for i := uint16(0); i < rootHeader.NumEntries; i++ { - var err error - if rootEntries[i].Node, err = buildExtTreeFromDisk(dev, rootEntries[i].Entry, blkSize); err != nil { - return err - } - } - } - - in.root = &disklayout.ExtentNode{rootHeader, rootEntries} - return nil -} - -// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively -// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to -// by the ExtentEntry. -// -// Preconditions: Must hold the mutex of the filesystem containing dev. -func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) { - var header disklayout.ExtentHeader - off := entry.PhysicalBlock() * blkSize - if err := readFromDisk(dev, int64(off), &header); err != nil { - return nil, err - } - - entries := make([]disklayout.ExtentEntryPair, header.NumEntries) - for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { - var curEntry disklayout.ExtentEntry - if header.Height == 0 { - // Leaf node. - curEntry = &disklayout.Extent{} - } else { - // Internal node. 
- curEntry = &disklayout.ExtentIdx{} - } - - if err := readFromDisk(dev, int64(off), curEntry); err != nil { - return nil, err - } - entries[i].Entry = curEntry - } - - // If this node is internal, perform DFS. - if header.Height > 0 { - for i := uint16(0); i < header.NumEntries; i++ { - var err error - entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize) - if err != nil { - return nil, err - } - } - } - - return &disklayout.ExtentNode{header, entries}, nil -} diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go new file mode 100644 index 000000000..0f3af1b53 --- /dev/null +++ b/pkg/sentry/fs/ext/named_pipe.go @@ -0,0 +1,40 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// namedPipe represents a named pipe inode. It is currently just a wrapper +// around pkg/sentry/kernel/pipe. +type namedPipe struct { + inode inode + + p *pipe.Pipe + inodeOps fs.InodeOperations +} + +// newNamedPipe is the namedPipe constructor. 
+func newNamedPipe(ctx context.Context, inode inode) *namedPipe { + file := &namedPipe{inode: inode} + file.inode.impl = file + file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) + file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p) + return file +} diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go new file mode 100644 index 000000000..b48f61795 --- /dev/null +++ b/pkg/sentry/fs/ext/regular_file.go @@ -0,0 +1,85 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" +) + +// fileReader is used to abstact away the complexity of how the file data is +// stored under the hood. Provides a method to get a file reader which can be +// used to read file data without worrying about how it is organized on disk. +type fileReader interface { + + // getFileReader returns a Reader implementation which can be used to read a + // file. It abstracts away the complexity of how the file is actually + // organized on disk. The reader is initialized with the passed offset. + // + // This reader is not meant to be retained across Read operations as it needs + // to be reinitialized with the correct offset for every Read. + getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader +} + +// regularFile represents a regular file's inode. 
This too follows the +// inheritance pattern prevelant in the vfs layer described in +// pkg/sentry/vfs/README.md. +type regularFile struct { + inode inode + + // This is immutable. The first field of fileReader implementations must be + // regularFile to ensure temporality. + impl fileReader +} + +// newRegularFile is the regularFile constructor. It figures out what kind of +// file this is and initializes the fileReader. +func newRegularFile(dev io.ReaderAt, blkSize uint64, inode inode) (*regularFile, error) { + regFile := regularFile{ + inode: inode, + } + + inodeFlags := inode.diskInode.Flags() + + if inodeFlags.Extents { + file, err := newExtentFile(dev, blkSize, regFile) + if err != nil { + return nil, err + } + + file.regFile.inode.impl = &file.regFile + return &file.regFile, nil + } + + if inodeFlags.Inline { + if inode.diskInode.Size() > 60 { + panic("ext fs: inline file larger than 60 bytes") + } + + file := newInlineFile(regFile) + file.regFile.inode.impl = &file.regFile + return &file.regFile, nil + } + + file, err := newBlockMapFile(blkSize, regFile) + if err != nil { + return nil, err + } + file.regFile.inode.impl = &file.regFile + return &file.regFile, nil +} + +func (f *regularFile) blksUsed(blkSize uint64) uint64 { + return (f.inode.diskInode.Size() + blkSize - 1) / blkSize +} diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go new file mode 100644 index 000000000..6a55c1a7b --- /dev/null +++ b/pkg/sentry/fs/ext/symlink.go @@ -0,0 +1,57 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" + + "gvisor.dev/gvisor/pkg/syserror" +) + +// symlink represents a symlink inode. +type symlink struct { + inode inode + target string // immutable +} + +// newSymlink is the symlink constructor. It reads out the symlink target from +// the inode (however it might have been stored). +func newSymlink(dev io.ReaderAt, blkSize uint64, inode inode) (*symlink, error) { + var file *symlink + var link []byte + + // If the symlink target is lesser than 60 bytes, its stores in inode.Data(). + // Otherwise either extents or block maps will be used to store the link. + size := inode.diskInode.Size() + if size < 60 { + link = inode.diskInode.Data()[:size] + } else { + // Create a regular file out of this inode and read out the target. + regFile, err := newRegularFile(dev, blkSize, inode) + if err != nil { + return nil, err + } + + link = make([]byte, size) + reader := regFile.impl.getFileReader(dev, blkSize, 0) + if _, err := io.ReadFull(reader, link); err != nil { + return nil, syserror.EIO + } + } + + file = &symlink{inode: inode, target: string(link)} + file.inode.impl = file + return file, nil +} diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go index 3472c5fa8..3d89d664d 100644 --- a/pkg/sentry/fs/ext/utils.go +++ b/pkg/sentry/fs/ext/utils.go @@ -15,38 +15,30 @@ package ext import ( - "encoding/binary" "io" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // readFromDisk performs a binary read from disk into the given struct from // the absolute offset provided. -// -// All disk reads should use this helper so we avoid reading from stale -// previously used offsets. This function forces the offset parameter. -// -// Precondition: Must hold the mutex of the filesystem containing dev. 
-func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { - if _, err := dev.Seek(abOff, io.SeekStart); err != nil { - return syserror.EIO - } - - if err := binary.Read(dev, binary.LittleEndian, v); err != nil { +func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { + n := binary.Size(v) + buf := make([]byte, n) + if read, _ := dev.ReadAt(buf, abOff); read < int(n) { return syserror.EIO } + binary.Unmarshal(buf, binary.LittleEndian, v) return nil } // readSuperBlock reads the SuperBlock from block group 0 in the underlying // device. There are three versions of the superblock. This function identifies // and returns the correct version. -// -// Precondition: Must hold the mutex of the filesystem containing dev. -func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) { +func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) { var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { return nil, err @@ -76,19 +68,12 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 { blocksPerGroup := uint64(sb.BlocksPerGroup()) // Round up the result. float64 can compromise precision so do it manually. - bgCount := blocksCount / blocksPerGroup - if blocksCount%blocksPerGroup != 0 { - bgCount++ - } - - return bgCount + return (blocksCount + blocksPerGroup - 1) / blocksPerGroup } // readBlockGroups reads the block group descriptor table from block group 0 in // the underlying device. -// -// Precondition: Must hold the mutex of the filesystem containing dev. 
-func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { +func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { bgCount := blockGroupsCount(sb) bgdSize := uint64(sb.BgDescSize()) is64Bit := sb.IncompatibleFeatures().Is64Bit diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 693ffc760..728575864 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -678,7 +678,7 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s return "", syserror.ENOENT } -// GetPath returns the PATH as a slice of strings given the environemnt +// GetPath returns the PATH as a slice of strings given the environment // variables. func GetPath(env []string) []string { const prefix = "PATH=" diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 4c2d48e65..55a9d3d29 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -197,6 +197,11 @@ type Kernel struct { // caches. Not all caches use it, only the caches that use host resources use // the limiter. It may be nil if disabled. DirentCacheLimiter *fs.DirentCacheLimiter + + // unimplementedSyscallEmitter is used to emit unimplemented syscall + // events. This is initialized lazily on the first unimplemented + // syscall. + unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` } // InitKernelArgs holds arguments to Init. @@ -290,7 +295,6 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() - return nil } @@ -586,11 +590,17 @@ func (k *Kernel) UniqueID() uint64 { // CreateProcessArgs holds arguments to kernel.CreateProcess. type CreateProcessArgs struct { - // Filename is the filename to load. + // Filename is the filename to load as the init binary. 
// - // If this is provided as "", then the file will be guessed via Argv[0]. + // If this is provided as "", File will be checked, then the file will be + // guessed via Argv[0]. Filename string + // File is a passed host FD pointing to a file to load as the init binary. + // + // This is checked if and only if Filename is "". + File *fs.File + // Argvv is a list of arguments. Argv []string @@ -775,8 +785,16 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, defer wd.DecRef() } - if args.Filename == "" { - // Was anything provided? + // Check which file to start from. + switch { + case args.Filename != "": + // If a filename is given, take that. + // Set File to nil so we resolve the path in LoadTaskImage. + args.File = nil + case args.File != nil: + // If File is set, take the File provided directly. + default: + // Otherwise look at Argv and see if the first argument is a valid path. if len(args.Argv) == 0 { return nil, 0, fmt.Errorf("no filename or command provided") } @@ -788,7 +806,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, // Create a fresh task context. remainingTraversals = uint(args.MaxSymlinkTraversals) - tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + + tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet) + if se != nil { return nil, 0, errors.New(se.String()) } @@ -1168,16 +1188,6 @@ func (k *Kernel) SupervisorContext() context.Context { } } -// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event -// channel. -func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { - t := TaskFromContext(ctx) - eventchannel.Emit(&uspb.UnimplementedSyscall{ - Tid: int32(t.ThreadID()), - Registers: t.Arch().StateData().Proto(), - }) -} - // SocketEntry represents a socket recorded in Kernel.sockets. 
It implements // refs.WeakRefUser for sockets stored in the socket table. // @@ -1272,3 +1282,23 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return nil } } + +// Rate limits for the number of unimplemented syscall evants. +const ( + unimplementedSyscallsMaxRate = 100 // events per second + unimplementedSyscallBurst = 1000 // events +) + +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. +func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + if k.unimplementedSyscallEmitter == nil { + k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) + } + + t := TaskFromContext(ctx) + k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 54b1676b0..8639d379f 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -140,15 +140,22 @@ func (t *Task) Stack() *arch.Stack { // * wd: Working directory to lookup filename under // * maxTraversals: maximum number of symlinks to follow // * filename: path to binary to load +// * file: an open fs.File object of the binary to load. If set, +// file will be loaded and not filename. // * argv: Binary argv // * envv: Binary envv // * fs: Binary FeatureSet -func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { + // If File is not nil, we should load that instead of resolving filename. 
+ if file != nil { + filename = file.MappedName(ctx) + } + // Prepare a new user address space to load into. m := mm.NewMemoryManager(k, k) defer m.DecUsers(ctx) - os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) + os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso) if err != nil { return nil, err } diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index baa12d9a0..f6f1ae762 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -67,8 +67,64 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m if err != nil { return nil, nil, err } + + // Open file will take a reference to Dirent, so destroy this one. defer d.DecRef() + return openFile(ctx, nil, d, name) +} + +// openFile performs checks on a file to be executed. If provided a *fs.File, +// openFile takes that file's Dirent and performs checks on it. If provided a +// *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's +// Inode and performs checks on that. +// +// openFile returns an *fs.File and *fs.Dirent, and the caller takes ownership +// of both. +// +// "dirent" and "file" must not both be nil and point to a readable, executable, regular file. +func openFile(ctx context.Context, file *fs.File, dirent *fs.Dirent, name string) (*fs.Dirent, *fs.File, error) { + // file and dirent must not be nil. + if dirent == nil && file == nil { + ctx.Infof("dirent and file cannot both be nil.") + return nil, nil, syserror.ENOENT + } + + if file != nil { + dirent = file.Dirent + } + + // Perform permissions checks on the file. 
+ if err := checkFile(ctx, dirent, name); err != nil { + return nil, nil, err + } + + if file == nil { + var ferr error + if file, ferr = dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}); ferr != nil { + return nil, nil, ferr + } + } else { + // GetFile takes a reference to the created file, so make one in the case + // that the file reference already existed. + file.IncRef() + } + + // We must be able to read at arbitrary offsets. + if !file.Flags().Pread { + file.DecRef() + ctx.Infof("%s cannot be read at an offset: %+v", file.MappedName(ctx), file.Flags()) + return nil, nil, syserror.EACCES + } + + // Grab reference for caller. + dirent.IncRef() + return dirent, file, nil +} + +// checkFile performs file permissions checks for binaries called in openPath +// and openFile +func checkFile(ctx context.Context, d *fs.Dirent, name string) error { perms := fs.PermMask{ // TODO(gvisor.dev/issue/160): Linux requires only execute // permission, not read. However, our backing filesystems may @@ -80,7 +136,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m Execute: true, } if err := d.Inode.CheckPermission(ctx, perms); err != nil { - return nil, nil, err + return err } // If they claim it's a directory, then make sure. @@ -88,31 +144,17 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m // N.B. we reject directories below, but we must first reject // non-directories passed as directories. if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) { - return nil, nil, syserror.ENOTDIR + return syserror.ENOTDIR } // No exec-ing directories, pipes, etc! if !fs.IsRegular(d.Inode.StableAttr) { ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr) - return nil, nil, syserror.EACCES + return syserror.EACCES } - // Create a new file. 
- file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) - if err != nil { - return nil, nil, err - } + return nil - // We must be able to read at arbitrary offsets. - if !file.Flags().Pread { - file.DecRef() - ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags()) - return nil, nil, syserror.EACCES - } - - // Grab a reference for the caller. - d.IncRef() - return d, file, nil } // allocStack allocates and maps a stack in to any available part of the address space. @@ -131,16 +173,30 @@ const ( maxLoaderAttempts = 6 ) -// loadPath resolves filename to a binary and loads it. +// loadBinary loads a binary that is pointed to by "file". If nil, the path +// "filename" is resolved and loaded. // // It returns: // * loadedELF, description of the loaded binary // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated argv -func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { - d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename) + var ( + d *fs.Dirent + f *fs.File + err error + ) + if passedFile == nil { + d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename) + + } else { + d, f, err = openFile(ctx, passedFile, nil, "") + // Set to nil in case we loop on a Interpreter Script. 
+ passedFile = nil + } + if err != nil { ctx.Infof("Error opening %s: %v", filename, err) return loadedELF{}, nil, nil, nil, err @@ -165,7 +221,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac switch { case bytes.Equal(hdr[:], []byte(elfMagic)): - loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, fs, f) + loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, features, f) if err != nil { ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err @@ -190,7 +246,8 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac return loadedELF{}, nil, nil, nil, syserror.ELOOP } -// Load loads filename into a MemoryManager. +// Load loads "file" into a MemoryManager. If file is nil, the path "filename" +// is resolved and loaded instead. // // If Load returns ErrSwitchFile it should be called again with the returned // path and argv. @@ -198,9 +255,9 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac // Preconditions: // * The Task MemoryManager is empty. // * Load is called on the Task goroutine. -func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { +func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the binary itself. 
- loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv) + loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux()) } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 51db2d8f7..ed996ba51 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -30,8 +30,7 @@ import ( const _AUDIT_ARCH_X86_64 = 0xc000003e // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall -// numbers from Linux 4.4. The entries commented out are those syscalls we -// don't currently support. +// numbers from Linux 4.4. var AMD64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.AMD64, diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 595eb9155..8ab7ffa25 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -96,7 +96,7 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Load the new TaskContext. 
maxTraversals := uint(linux.MaxSymlinkTraversals) - tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet()) + tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, nil, argv, envv, t.Arch().FeatureSet()) if se != nil { return 0, nil, se.ToError() } diff --git a/pkg/tcpip/link/fdbased/mmap_amd64.go b/pkg/tcpip/link/fdbased/mmap_amd64.go index 8bbb4f9ab..029f86a18 100644 --- a/pkg/tcpip/link/fdbased/mmap_amd64.go +++ b/pkg/tcpip/link/fdbased/mmap_amd64.go @@ -134,7 +134,7 @@ func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) { FD: int32(d.fd), Events: unix.POLLIN | unix.POLLERR, } - if _, errno := rawfile.BlockingPoll(&event, 1, -1); errno != 0 { + if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 { if errno == syscall.EINTR { continue } diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index b54131573..298bad55d 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -14,17 +14,18 @@ #include "textflag.h" -// BlockingPoll makes the poll() syscall while calling the version of +// BlockingPoll makes the ppoll() syscall while calling the version of // entersyscall that relinquishes the P so that other Gs can run. This is meant // to be called in cases when the syscall is expected to block. 
// -// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno) +// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno) TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 CALL ·callEntersyscallblock(SB) MOVQ fds+0(FP), DI MOVQ nfds+8(FP), SI MOVQ timeout+16(FP), DX - MOVQ $0x7, AX // SYS_POLL + MOVQ $0x0, R10 // sigmask parameter which isn't used here + MOVQ $0x10f, AX // SYS_PPOLL SYSCALL CMPQ AX, $0xfffffffffffff001 JLS ok diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index c87268610..47039a446 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -26,7 +26,7 @@ import ( ) //go:noescape -func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) +func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno) // Use go:linkname to call into the runtime. As of Go 1.12 this has to // be done from Go code so that we make an ABIInternal call to an diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index 4eab77c74..84dc0e918 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -21,9 +21,11 @@ import ( "unsafe" ) -// BlockingPoll is just a stub function that forwards to the poll() system call +// BlockingPoll is just a stub function that forwards to the ppoll() system call // on non-amd64 platforms. 
-func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) { - n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) +func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno) { + n, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(fds)), + uintptr(nfds), uintptr(unsafe.Pointer(timeout)), 0, 0, 0) + return int(n), e } diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index e3fbb15c2..7e286a3a6 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -123,7 +123,7 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { Events: 1, // POLLIN } - _, e = BlockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, nil) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -145,7 +145,7 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { Events: 1, // POLLIN } - _, e = BlockingPoll(&event, 1, -1) + _, e = BlockingPoll(&event, 1, nil) if e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } @@ -175,7 +175,7 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) { Events: 1, // POLLIN } - if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR { + if _, e := BlockingPoll(&event, 1, nil); e != 0 && e != syscall.EINTR { return 0, TranslateErrno(e) } } diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index 769509e80..cbd92fc05 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -11,8 +11,8 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/unet", visibility = ["//visibility:public"], deps = [ - "//pkg/abi/linux", "//pkg/gate", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go index f8a42c914..85ef46edf 100644 --- a/pkg/unet/unet_unsafe.go +++ b/pkg/unet/unet_unsafe.go @@ -16,12 +16,11 @@ package unet import ( "io" - 
"math" "sync/atomic" "syscall" "unsafe" - "gvisor.dev/gvisor/pkg/abi/linux" + "golang.org/x/sys/unix" ) // wait blocks until the socket FD is ready for reading or writing, depending @@ -37,23 +36,23 @@ func (s *Socket) wait(write bool) error { return errClosing } - events := []linux.PollFD{ + events := []unix.PollFd{ { // The actual socket FD. - FD: fd, - Events: linux.POLLIN, + Fd: fd, + Events: unix.POLLIN, }, { // The eventfd, signaled when we are closing. - FD: int32(s.efd), - Events: linux.POLLIN, + Fd: int32(s.efd), + Events: unix.POLLIN, }, } if write { - events[0].Events = linux.POLLOUT + events[0].Events = unix.POLLOUT } - _, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&events[0])), 2, uintptr(math.MaxUint64)) + _, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(&events[0])), 2, 0, 0, 0, 0) if e == syscall.EINTR { continue } @@ -61,7 +60,7 @@ func (s *Socket) wait(write bool) error { return e } - if events[1].REvents&linux.POLLIN == linux.POLLIN { + if events[1].Revents&unix.POLLIN == unix.POLLIN { // eventfd signaled, we're closing. 
return errClosing } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 0ee5b8bbd..7ca776b3a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -207,7 +207,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MPROTECT: {}, syscall.SYS_MUNMAP: {}, syscall.SYS_NANOSLEEP: {}, - syscall.SYS_POLL: {}, + syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, syscall.SYS_PWRITE64: {}, syscall.SYS_READ: {}, diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 2d50774d4..8ddfa77d6 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -138,7 +138,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_NANOSLEEP: {}, syscall.SYS_NEWFSTATAT: {}, syscall.SYS_OPENAT: {}, - syscall.SYS_POLL: {}, + syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, syscall.SYS_PWRITE64: {}, syscall.SYS_READ: {}, diff --git a/runsc/test/runtimes/BUILD b/test/runtimes/BUILD index ea87029dd..e85804a83 100644 --- a/runsc/test/runtimes/BUILD +++ b/test/runtimes/BUILD @@ -8,7 +8,7 @@ package(licenses = ["notice"]) go_library( name = "runtimes", srcs = ["runtimes.go"], - importpath = "gvisor.dev/gvisor/runsc/test/runtimes", + importpath = "gvisor.dev/gvisor/test/runtimes", ) runtime_test( diff --git a/runsc/test/runtimes/README.md b/test/runtimes/README.md index 4e5a950bc..4e5a950bc 100644 --- a/runsc/test/runtimes/README.md +++ b/test/runtimes/README.md diff --git a/runsc/test/runtimes/go/BUILD b/test/runtimes/go/BUILD index c34f49ea6..c34f49ea6 100644 --- a/runsc/test/runtimes/go/BUILD +++ b/test/runtimes/go/BUILD diff --git a/runsc/test/runtimes/go/Dockerfile b/test/runtimes/go/Dockerfile index cd55608cd..cd55608cd 100644 --- a/runsc/test/runtimes/go/Dockerfile +++ b/test/runtimes/go/Dockerfile diff --git a/runsc/test/runtimes/go/proctor-go.go b/test/runtimes/go/proctor-go.go index c5387e21d..c5387e21d 100644 --- a/runsc/test/runtimes/go/proctor-go.go +++ b/test/runtimes/go/proctor-go.go 
diff --git a/runsc/test/runtimes/java/BUILD b/test/runtimes/java/BUILD index 7e2808ece..7e2808ece 100644 --- a/runsc/test/runtimes/java/BUILD +++ b/test/runtimes/java/BUILD diff --git a/runsc/test/runtimes/java/Dockerfile b/test/runtimes/java/Dockerfile index e162d7218..e162d7218 100644 --- a/runsc/test/runtimes/java/Dockerfile +++ b/test/runtimes/java/Dockerfile diff --git a/runsc/test/runtimes/java/proctor-java.go b/test/runtimes/java/proctor-java.go index 0177f421d..0177f421d 100644 --- a/runsc/test/runtimes/java/proctor-java.go +++ b/test/runtimes/java/proctor-java.go diff --git a/runsc/test/runtimes/nodejs/BUILD b/test/runtimes/nodejs/BUILD index 0fe5ff83e..0fe5ff83e 100644 --- a/runsc/test/runtimes/nodejs/BUILD +++ b/test/runtimes/nodejs/BUILD diff --git a/runsc/test/runtimes/nodejs/Dockerfile b/test/runtimes/nodejs/Dockerfile index b2416cce8..b2416cce8 100644 --- a/runsc/test/runtimes/nodejs/Dockerfile +++ b/test/runtimes/nodejs/Dockerfile diff --git a/runsc/test/runtimes/nodejs/proctor-nodejs.go b/test/runtimes/nodejs/proctor-nodejs.go index 8ddfb67fe..8ddfb67fe 100644 --- a/runsc/test/runtimes/nodejs/proctor-nodejs.go +++ b/test/runtimes/nodejs/proctor-nodejs.go diff --git a/runsc/test/runtimes/php/BUILD b/test/runtimes/php/BUILD index 22aef7ba4..22aef7ba4 100644 --- a/runsc/test/runtimes/php/BUILD +++ b/test/runtimes/php/BUILD diff --git a/runsc/test/runtimes/php/Dockerfile b/test/runtimes/php/Dockerfile index 1f8959b50..1f8959b50 100644 --- a/runsc/test/runtimes/php/Dockerfile +++ b/test/runtimes/php/Dockerfile diff --git a/runsc/test/runtimes/php/proctor-php.go b/test/runtimes/php/proctor-php.go index 9dfb33b04..9dfb33b04 100644 --- a/runsc/test/runtimes/php/proctor-php.go +++ b/test/runtimes/php/proctor-php.go diff --git a/runsc/test/runtimes/python/BUILD b/test/runtimes/python/BUILD index 501f77d63..501f77d63 100644 --- a/runsc/test/runtimes/python/BUILD +++ b/test/runtimes/python/BUILD diff --git a/runsc/test/runtimes/python/Dockerfile 
b/test/runtimes/python/Dockerfile index 811f48f8a..811f48f8a 100644 --- a/runsc/test/runtimes/python/Dockerfile +++ b/test/runtimes/python/Dockerfile diff --git a/runsc/test/runtimes/python/proctor-python.go b/test/runtimes/python/proctor-python.go index 73c8deb49..73c8deb49 100644 --- a/runsc/test/runtimes/python/proctor-python.go +++ b/test/runtimes/python/proctor-python.go diff --git a/runsc/test/runtimes/runtimes.go b/test/runtimes/runtimes.go index 2568e07fe..2568e07fe 100644 --- a/runsc/test/runtimes/runtimes.go +++ b/test/runtimes/runtimes.go diff --git a/runsc/test/runtimes/runtimes_test.go b/test/runtimes/runtimes_test.go index 6bf954e78..6bf954e78 100644 --- a/runsc/test/runtimes/runtimes_test.go +++ b/test/runtimes/runtimes_test.go diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index c5a368463..40fc73812 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -904,6 +904,14 @@ cc_binary( ], ) +cc_library( + name = "iptables_types", + testonly = 1, + hdrs = [ + "iptables.h", + ], +) + cc_binary( name = "itimer_test", testonly = 1, diff --git a/test/syscalls/linux/iptables.h b/test/syscalls/linux/iptables.h new file mode 100644 index 000000000..616bea550 --- /dev/null +++ b/test/syscalls/linux/iptables.h @@ -0,0 +1,198 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// There are a number of structs and values that we can't #include because of a +// difference between C and C++ (C++ won't let you implicitly cast from void* to +// struct something*). We re-define them here. + +#ifndef GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_ +#define GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_ + +// Netfilter headers require some headers to precede them. +// clang-format off +#include <netinet/in.h> +#include <stddef.h> +// clang-format on + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <stdint.h> + +#define ipt_standard_target xt_standard_target +#define ipt_entry_target xt_entry_target +#define ipt_error_target xt_error_target + +enum SockOpts { + // For setsockopt. + BASE_CTL = 64, + SO_SET_REPLACE = BASE_CTL, + SO_SET_ADD_COUNTERS, + SO_SET_MAX = SO_SET_ADD_COUNTERS, + + // For getsockopt. + SO_GET_INFO = BASE_CTL, + SO_GET_ENTRIES, + SO_GET_REVISION_MATCH, + SO_GET_REVISION_TARGET, + SO_GET_MAX = SO_GET_REVISION_TARGET +}; + +// ipt_ip specifies basic matching criteria that can be applied by examining +// only the IP header of a packet. +struct ipt_ip { + // Source IP address. + struct in_addr src; + + // Destination IP address. + struct in_addr dst; + + // Source IP address mask. + struct in_addr smsk; + + // Destination IP address mask. + struct in_addr dmsk; + + // Input interface. + char iniface[IFNAMSIZ]; + + // Output interface. + char outiface[IFNAMSIZ]; + + // Input interface mask. + unsigned char iniface_mask[IFNAMSIZ]; + + // Output interface mask. + unsigned char outiface_mask[IFNAMSIZ]; + + // Transport protocol. + uint16_t proto; + + // Flags. + uint8_t flags; + + // Inverse flags. + uint8_t invflags; +}; + +// ipt_entry is an iptables rule. It contains information about what packets the +// rule matches and what action (target) to perform for matching packets. +struct ipt_entry { + // Basic matching information used to match a packet's IP header.
+ struct ipt_ip ip; + + // A caching field that isn't used by userspace. + unsigned int nfcache; + + // The number of bytes between the start of this ipt_entry struct and the + // rule's target. + uint16_t target_offset; + + // The total size of this rule, from the beginning of the entry to the end of + // the target. + uint16_t next_offset; + + // A return pointer not used by userspace. + unsigned int comefrom; + + // Counters for packets and bytes, which we don't yet implement. + struct xt_counters counters; + + // The data for all of this rule's matches followed by the target. This runs + // beyond the value of sizeof(struct ipt_entry). + unsigned char elems[0]; +}; + +// Passed to getsockopt(SO_GET_INFO). +struct ipt_getinfo { + // The name of the table. The user only fills this in, the rest is filled in + // when returning from getsockopt. Currently "nat" and "mangle" are supported. + char name[XT_TABLE_MAXNAMELEN]; + + // A bitmap of which hooks apply to the table. For example, a table with hooks + // PREROUTING and FORWARD has the value + // (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_FORWARD). + unsigned int valid_hooks; + + // The offset into the entry table for each valid hook. The entry table is + // returned by getsockopt(SO_GET_ENTRIES). + unsigned int hook_entry[NF_IP_NUMHOOKS]; + + // For each valid hook, the underflow is the offset into the entry table to + // jump to in case traversing the table yields no verdict (although I have no + // clue how that could happen - builtin chains always end with a policy, and + // user-defined chains always end with a RETURN). + // + // The entry referred to must be an "unconditional" entry, meaning it has no + // matches, specifies no IP criteria, and either DROPs or ACCEPTs packets. It + // basically has to be capable of making a definitive decision no matter what + // it's passed. + unsigned int underflow[NF_IP_NUMHOOKS]; + + // The number of entries in the entry table returned by + // getsockopt(SO_GET_ENTRIES).
+ unsigned int num_entries; + + // The size of the entry table returned by getsockopt(SO_GET_ENTRIES). + unsigned int size; +}; + +// Passed to getsockopt(SO_GET_ENTRIES). +struct ipt_get_entries { + // The name of the table. The user fills this in. Currently "nat" and "mangle" + // are supported. + char name[XT_TABLE_MAXNAMELEN]; + + // The size of the entry table in bytes. The user fills this in with the value + // from struct ipt_getinfo.size. + unsigned int size; + + // The entries for the given table. This will run past the size defined by + // sizeof(struct ipt_get_entries). + struct ipt_entry entrytable[0]; +}; + +// Passed to setsockopt(SO_SET_REPLACE). +struct ipt_replace { + // The name of the table. + char name[XT_TABLE_MAXNAMELEN]; + + // The same as struct ipt_getinfo.valid_hooks. Users don't change this. + unsigned int valid_hooks; + + // The same as struct ipt_getinfo.num_entries. + unsigned int num_entries; + + // The same as struct ipt_getinfo.size. + unsigned int size; + + // The same as struct ipt_getinfo.hook_entry. + unsigned int hook_entry[NF_IP_NUMHOOKS]; + + // The same as struct ipt_getinfo.underflow. + unsigned int underflow[NF_IP_NUMHOOKS]; + + // The number of counters, which should equal the number of entries. + unsigned int num_counters; + + // The unchanged values from each ipt_entry's counters. + struct xt_counters *counters; + + // The entries to write to the table. 
This will run past the size defined by + // sizeof(struct ipt_replace). + struct ipt_entry entries[0]; +}; + +#endif // GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_ diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 510f7bee5..88ab90b5b 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -539,9 +539,8 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) { ASSERT_THAT(fstat(fd1.get(), &st1), SyscallSucceeds()); ASSERT_THAT(fstat(fd2.get(), &st2), SyscallSucceeds()); - // The two fds should have different inode numbers. Specifically, since fd2 - // was created later, it should have a higher inode number. - EXPECT_GT(st2.st_ino, st1.st_ino); + // The two fds should have different inode numbers. + EXPECT_NE(st2.st_ino, st1.st_ino); // Verify again after another S/R cycle. The inode numbers should remain the // same. |