diff options
-rw-r--r-- | pkg/sentry/device/BUILD | 4 | ||||
-rw-r--r-- | pkg/sentry/device/device.go | 111 | ||||
-rw-r--r-- | pkg/sentry/kernel/BUILD | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel_state.go | 11 | ||||
-rw-r--r-- | test/syscalls/linux/stat.cc | 33 |
6 files changed, 139 insertions, 25 deletions
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 01de708d3..4ccf0674d 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,7 +1,7 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - package(licenses = ["notice"]) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + go_library( name = "device", srcs = ["device.go"], diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 27e4eb258..ae4fa1d93 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -14,11 +14,6 @@ // Package device defines reserved virtual kernel devices and structures // for managing them. -// -// Saving and restoring devices is not necessary if the devices are initialized -// as package global variables. Package initialization happens in a single goroutine -// and in a deterministic order, so minor device numbers will be assigned in the -// same order as packages are loaded. package device import ( @@ -30,7 +25,83 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) +// Registry tracks all simple devices and related state on the system for +// save/restore. +// +// The set of devices across save/restore must remain consistent. That is, no +// devices may be created or removed on restore relative to the saved +// system. Practically, this means do not create new devices specifically as +// part of restore. +// +// +stateify savable +type Registry struct { + // lastAnonDeviceMinor is the last minor device number used for an anonymous + // device. Must be accessed atomically. + lastAnonDeviceMinor uint64 + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + devices map[ID]*Device +} + +// SimpleDevices is the system-wide simple device registry. This is +// saved/restored by kernel.Kernel, but defined here to allow access without +// depending on the kernel package. See kernel.Kernel.deviceRegistry. +var SimpleDevices = newRegistry() + +func newRegistry() *Registry { + return &Registry{ + devices: make(map[ID]*Device), + } +} + +// newAnonID assigns a major and minor number to an anonymous device ID. +func (r *Registry) newAnonID() ID { + return ID{ + // Anon devices always have a major number of 0. + Major: 0, + // Use the next minor number. + Minor: atomic.AddUint64(&r.lastAnonDeviceMinor, 1), + } +} + +// newAnonDevice allocates a new anonymous device with a unique minor device +// number, and registers it with r. +func (r *Registry) newAnonDevice() *Device { + r.mu.Lock() + defer r.mu.Unlock() + d := &Device{ + ID: r.newAnonID(), + } + r.devices[d.ID] = d + return d +} + +// LoadFrom initializes the internal state of all devices in r from other. The +// set of devices in both registries must match. Devices may not be created or +// destroyed across save/restore. +func (r *Registry) LoadFrom(other *Registry) { + r.mu.Lock() + defer r.mu.Unlock() + other.mu.Lock() + defer other.mu.Unlock() + if len(r.devices) != len(other.devices) { + panic(fmt.Sprintf("Devices were added or removed when restoring the registry:\nnew:\n%+v\nold:\n%+v", r.devices, other.devices)) + } + for id, otherD := range other.devices { + ourD, ok := r.devices[id] + if !ok { + panic(fmt.Sprintf("Device %+v could not be restored as it wasn't defined in the new registry", otherD)) + } + ourD.loadFrom(otherD) + } + atomic.StoreUint64(&r.lastAnonDeviceMinor, atomic.LoadUint64(&other.lastAnonDeviceMinor)) +} + // ID identifies a device. +// +// +stateify savable type ID struct { Major uint64 Minor uint64 @@ -41,18 +112,12 @@ func (i *ID) DeviceID() uint64 { return uint64(linux.MakeDeviceID(uint16(i.Major), uint32(i.Minor))) } -// nextAnonDeviceMinor is the next minor number for a new anonymous device. -// Must be accessed atomically. -var nextAnonDeviceMinor uint64 - // NewAnonDevice creates a new anonymous device. Packages that require an anonymous // device should initialize the device in a global variable in a file called device.go: // // var myDevice = device.NewAnonDevice() func NewAnonDevice() *Device { - return &Device{ - ID: newAnonID(), - } + return SimpleDevices.newAnonDevice() } // NewAnonMultiDevice creates a new multi-keyed anonymous device. Packages that require @@ -62,21 +127,13 @@ func NewAnonDevice() *Device { // var myDevice = device.NewAnonMultiDevice() func NewAnonMultiDevice() *MultiDevice { return &MultiDevice{ - ID: newAnonID(), - } -} - -// newAnonID assigns a major and minor number to an anonymous device ID. -func newAnonID() ID { - return ID{ - // Anon devices always have a major number of 0. - Major: 0, - // Use the next minor number. - Minor: atomic.AddUint64(&nextAnonDeviceMinor, 1), + ID: SimpleDevices.newAnonID(), } } // Device is a simple virtual kernel device. +// +// +stateify savable type Device struct { ID @@ -84,6 +141,14 @@ type Device struct { last uint64 } +// loadFrom initializes d from other. The IDs of both devices must match. +func (d *Device) loadFrom(other *Device) { + if d.ID != other.ID { + panic(fmt.Sprintf("Attempting to initialize a device %+v from %+v, but device IDs don't match", d, other)) + } + atomic.StoreUint64(&d.last, atomic.LoadUint64(&other.last)) +} + // NextIno generates a new inode number func (d *Device) NextIno() uint64 { return atomic.AddUint64(&d.last, 1) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 4d34bc733..99a2fd964 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -137,6 +137,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", imports = [ "gvisor.googlesource.com/gvisor/pkg/bpf", + "gvisor.googlesource.com/gvisor/pkg/sentry/device", "gvisor.googlesource.com/gvisor/pkg/tcpip", ], visibility = ["//:sandbox"], @@ -156,6 +157,7 @@ go_library( "//pkg/secio", "//pkg/sentry/arch", "//pkg/sentry/context", + "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index f5cbd6c23..f7f471aaa 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -185,6 +185,9 @@ type Kernel struct { // socketTable is used to track all sockets on the system. Protected by // extMu. socketTable map[int]map[*refs.WeakRef]struct{} + + // deviceRegistry is used to save/restore device.SimpleDevices. + deviceRegistry struct{} `state:".(*device.Registry)"` } // InitKernelArgs holds arguments to Init. diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go index a0a69b498..aae6f9ad2 100644 --- a/pkg/sentry/kernel/kernel_state.go +++ b/pkg/sentry/kernel/kernel_state.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" "gvisor.googlesource.com/gvisor/pkg/tcpip" ) @@ -29,3 +30,13 @@ func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { tcpip.AddDanglingEndpoint(e) } } + +// saveDeviceRegistry is invoked by stateify. +func (k *Kernel) saveDeviceRegistry() *device.Registry { + return device.SimpleDevices +} + +// loadDeviceRegistry is invoked by stateify. +func (k *Kernel) loadDeviceRegistry(r *device.Registry) { + device.SimpleDevices.LoadFrom(r) +} diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index f96da5706..553fb7e56 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -429,6 +429,39 @@ TEST_F(StatTest, LstatELOOPPath) { ASSERT_THAT(lstat(path.c_str(), &s), SyscallFailsWithErrno(ELOOP)); } +// Ensure that inode allocation for anonymous devices work correctly across +// save/restore. In particular, inode numbers should be unique across S/R. +TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) { + // Use sockets as a convenient way to create inodes on an anonymous device. + int fd; + ASSERT_THAT(fd = socket(AF_UNIX, SOCK_STREAM, 0), SyscallSucceeds()); + FileDescriptor fd1(fd); + MaybeSave(); + ASSERT_THAT(fd = socket(AF_UNIX, SOCK_STREAM, 0), SyscallSucceeds()); + FileDescriptor fd2(fd); + + struct stat st1; + struct stat st2; + ASSERT_THAT(fstat(fd1.get(), &st1), SyscallSucceeds()); + ASSERT_THAT(fstat(fd2.get(), &st2), SyscallSucceeds()); + + // The two fds should have different inode numbers. Specifically, since fd2 + // was created later, it should have a higher inode number. + EXPECT_GT(st2.st_ino, st1.st_ino); + + // Verify again after another S/R cycle. The inode numbers should remain the + // same. + MaybeSave(); + + struct stat st1_after; + struct stat st2_after; + ASSERT_THAT(fstat(fd1.get(), &st1_after), SyscallSucceeds()); + ASSERT_THAT(fstat(fd2.get(), &st2_after), SyscallSucceeds()); + + EXPECT_EQ(st1_after.st_ino, st1.st_ino); + EXPECT_EQ(st2_after.st_ino, st2.st_ino); +} + } // namespace } // namespace testing |