diff options
Diffstat (limited to 'pkg/sentry')
80 files changed, 342 insertions, 6025 deletions
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 1d88db12f..de7a0f3ab 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -404,3 +404,16 @@ func ttyName(tty *kernel.TTY) string { } return fmt.Sprintf("pts/%d", tty.Index) } + +// ContainerUsage retrieves per-container CPU usage. +func ContainerUsage(kr *kernel.Kernel) map[string]uint64 { + cusage := make(map[string]uint64) + for _, tg := range kr.TaskSet().Root.ThreadGroups() { + // We want each tg's usage including reaped children. + cid := tg.Leader().ContainerID() + stats := tg.CPUStats() + stats.Accumulate(tg.JoinedChildCPUStats()) + cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds()) + } + return cusage +} diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index a2f3d5918..07b4fb70f 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -257,7 +257,7 @@ func (c *ConnectedEndpoint) Passcred() bool { } // GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. -func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 089955a96..ae972fcb5 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -299,10 +299,15 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off src = src.TakeFirst64(limit) } - // Do a buffered write. See rationale in PRead. if d.cachedMetadataAuthoritative() { - d.touchCMtime() + if fd.isRegularFile { + d.touchCMtimeLocked() + } else { + d.touchCMtime() + } } + + // Do a buffered write. See rationale in PRead. buf := make([]byte, src.NumBytes()) copied, copyErr := src.CopyIn(ctx, buf) if copied == 0 && copyErr != nil { diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 60acc367f..72aa535f8 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -201,7 +201,7 @@ func (c *ConnectedEndpoint) Passcred() bool { } // GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. -func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index eac578f25..8139bff76 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -371,6 +371,8 @@ type OrderedChildrenOptions struct { // OrderedChildren may modify the tracked children. This applies to // operations related to rename, unlink and rmdir. If an OrderedChildren is // not writable, these operations all fail with EPERM. + // + // Note that writable users must implement the sticky bit (I_SVTX). Writable bool } @@ -556,7 +558,6 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) return err } - // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. o.removeLocked(name) return nil } @@ -603,8 +604,8 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c if err := o.checkExistingLocked(oldname, child); err != nil { return err } + o.removeLocked(oldname) - // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. dst.replaceChildLocked(ctx, newname, child) return nil } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 3b6336e94..09c0ccaf2 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -368,17 +368,15 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst }) } -// CopyOutFrom implements usermem.IO.CopyOutFrom. +// CopyOutFrom implements usermem.IO.CopyOutFrom. Note that it is the caller's +// responsibility to call fd.pipe.Notify(waiter.EventIn) after the write is +// completed. // // Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { - n, err := fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { + return fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { return src.ReadToBlocks(dsts) }) - if n > 0 { - fd.pipe.Notify(waiter.EventIn) - } - return n, err } // SwapUint32 implements usermem.IO.SwapUint32. diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 8ce411102..b3290917e 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -45,14 +45,14 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/procid", + "//pkg/ring0", + "//pkg/ring0/pagetables", "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/ring0", - "//pkg/sentry/platform/ring0/pagetables", "//pkg/sentry/time", "//pkg/sync", "//pkg/usermem", @@ -75,11 +75,11 @@ go_test( "requires-kvm", ], deps = [ + "//pkg/ring0", + "//pkg/ring0/pagetables", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", - "//pkg/sentry/platform/ring0", - "//pkg/sentry/platform/ring0/pagetables", "//pkg/sentry/time", "//pkg/usermem", ], @@ -89,6 +89,6 @@ genrule( name = "bluepill_impl_amd64", srcs = ["bluepill_amd64.s"], outs = ["bluepill_impl_amd64.s"], - cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", - tools = ["//pkg/sentry/platform/ring0/gen_offsets"], + cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/ring0/gen_offsets"], ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index af5c5e191..25c21e843 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -18,9 +18,9 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 4b23f7803..2c970162e 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -19,9 +19,9 @@ import ( "reflect" "syscall" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) // bluepill enters guest mode. diff --git a/pkg/sentry/platform/kvm/bluepill_allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go index 9485e1301..1825edc3a 100644 --- a/pkg/sentry/platform/kvm/bluepill_allocator.go +++ b/pkg/sentry/platform/kvm/bluepill_allocator.go @@ -17,7 +17,7 @@ package kvm import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/ring0/pagetables" ) type allocator struct { diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index ddc1554d5..83a4766fb 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -19,8 +19,8 @@ package kvm import ( "syscall" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) var ( diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index f8ccb7430..0063e947b 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -20,8 +20,8 @@ import ( "syscall" "unsafe" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) // dieArchSetup initializes the state for dieTrampoline. diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index 1f09813ba..35298135a 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -19,8 +19,8 @@ package kvm import ( "syscall" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) var ( diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index 4d912769a..dbbf2a897 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -20,8 +20,8 @@ import ( "syscall" "unsafe" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) // fpsimdPtr returns a fpsimd64 for the given address. diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 17268d127..aeae01dbd 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -18,10 +18,10 @@ import ( "sync/atomic" pkgcontext "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 5979aef97..7bdf57436 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -20,9 +20,9 @@ import ( "os" "syscall" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 093497bc4..b9ed4a706 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -18,7 +18,7 @@ package kvm import ( "gvisor.dev/gvisor/pkg/cpuid" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" + "gvisor.dev/gvisor/pkg/ring0" ) // userRegs represents KVM user registers. diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index c0b4fd374..76fc594a0 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -19,11 +19,11 @@ package kvm import ( "testing" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ) func TestSegments(t *testing.T) { diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index 9db1db4e9..b73340f0e 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -17,8 +17,8 @@ package kvm import ( + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) type kvmOneReg struct { diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index a650877d6..11ca1f0ea 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -22,11 +22,11 @@ import ( "testing" "time" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index e2fffc99b..1ece1b8d8 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 8e03c310d..59c752d73 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -24,10 +24,10 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index aa2d21748..7d7857067 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -17,10 +17,10 @@ package kvm import ( + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index a466acf4d..dca0cdb60 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -23,10 +23,10 @@ import ( "syscall" "unsafe" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index f7fa2f98d..8bdec93ae 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -20,7 +20,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" + "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/usermem" ) diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD deleted file mode 100644 index 2852b7387..000000000 --- a/pkg/sentry/platform/ring0/BUILD +++ /dev/null @@ -1,85 +0,0 @@ -load("//tools:defs.bzl", "arch_genrule", "go_library") -load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") - -package(licenses = ["notice"]) - -go_template( - name = "defs_amd64", - srcs = [ - "defs.go", - "defs_amd64.go", - "offsets_amd64.go", - "x86.go", - ], - visibility = [":__subpackages__"], -) - -go_template( - name = "defs_arm64", - srcs = [ - "aarch64.go", - "defs.go", - "defs_arm64.go", - "offsets_arm64.go", - ], - visibility = [":__subpackages__"], -) - -go_template_instance( - name = "defs_impl_amd64", - out = "defs_impl_amd64.go", - package = "ring0", - template = ":defs_amd64", -) - -go_template_instance( - name = "defs_impl_arm64", - out = "defs_impl_arm64.go", - package = "ring0", - template = ":defs_arm64", -) - -arch_genrule( - name = "entry_impl_amd64", - srcs = ["entry_amd64.s"], - outs = ["entry_impl_amd64.s"], - cmd = "(echo -e '// build +amd64\\n' && QEMU $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(location entry_amd64.s)) > $@", - tools = ["//pkg/sentry/platform/ring0/gen_offsets"], -) - -arch_genrule( - name = "entry_impl_arm64", - srcs = ["entry_arm64.s"], - outs = ["entry_impl_arm64.s"], - cmd = "(echo -e '// build +arm64\\n' && QEMU $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(location entry_arm64.s)) > $@", - tools = ["//pkg/sentry/platform/ring0/gen_offsets"], -) - -go_library( - name = "ring0", - srcs = [ - "defs_impl_amd64.go", - "defs_impl_arm64.go", - "entry_amd64.go", - "entry_arm64.go", - "entry_impl_amd64.s", - "entry_impl_arm64.s", - "kernel.go", - "kernel_amd64.go", - "kernel_arm64.go", - "kernel_unsafe.go", - "lib_amd64.go", - "lib_amd64.s", - "lib_arm64.go", - "lib_arm64.s", - "ring0.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/cpuid", - "//pkg/safecopy", - "//pkg/sentry/arch", - "//pkg/sentry/platform/ring0/pagetables", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go deleted file mode 100644 index 3bda594f9..000000000 --- a/pkg/sentry/platform/ring0/aarch64.go +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -// Useful bits. -const ( - _PGD_PGT_BASE = 0x1000 - _PGD_PGT_SIZE = 0x1000 - _PUD_PGT_BASE = 0x2000 - _PUD_PGT_SIZE = 0x1000 - _PMD_PGT_BASE = 0x3000 - _PMD_PGT_SIZE = 0x4000 - _PTE_PGT_BASE = 0x7000 - _PTE_PGT_SIZE = 0x1000 -) - -const ( - // DAIF bits:debug, sError, IRQ, FIQ. - _PSR_D_BIT = 0x00000200 - _PSR_A_BIT = 0x00000100 - _PSR_I_BIT = 0x00000080 - _PSR_F_BIT = 0x00000040 - _PSR_DAIF_SHIFT = 6 - _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT - - // PSR bits. - _PSR_MODE_EL0t = 0x00000000 - _PSR_MODE_EL1t = 0x00000004 - _PSR_MODE_EL1h = 0x00000005 - _PSR_MODE_MASK = 0x0000000f - - PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK - PsrModeMask = _PSR_MODE_MASK - - // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT - - // UserFlagsSet are always set in userspace. - UserFlagsSet = _PSR_MODE_EL0t -) - -// Vector is an exception vector. -type Vector uintptr - -// Exception vectors. -const ( - El1InvSync = iota - El1InvIrq - El1InvFiq - El1InvError - - El1Sync - El1Irq - El1Fiq - El1Err - - El0Sync - El0Irq - El0Fiq - El0Err - - El0InvSync - El0InvIrq - El0InvFiq - El0InvErr - - El1SyncDa - El1SyncIa - El1SyncSpPc - El1SyncUndef - El1SyncDbg - El1SyncInv - - El0SyncSVC - El0SyncDa - El0SyncIa - El0SyncFpsimdAcc - El0SyncSveAcc - El0SyncFpsimdExc - El0SyncSys - El0SyncSpPc - El0SyncUndef - El0SyncDbg - El0SyncWfx - El0SyncInv - - El0ErrNMI - El0ErrBounce - - _NR_INTERRUPTS -) - -// System call vectors. -const ( - Syscall Vector = El0SyncSVC - PageFault Vector = El0SyncDa - VirtualizationException Vector = El0ErrBounce -) - -// VirtualAddressBits returns the number bits available for virtual addresses. -func VirtualAddressBits() uint32 { - return 48 -} - -// PhysicalAddressBits returns the number of bits available for physical addresses. -func PhysicalAddressBits() uint32 { - return 40 -} diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go deleted file mode 100644 index f9765771e..000000000 --- a/pkg/sentry/platform/ring0/defs.go +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" -) - -// Kernel is a global kernel object. -// -// This contains global state, shared by multiple CPUs. -type Kernel struct { - // PageTables are the kernel pagetables; this must be provided. - PageTables *pagetables.PageTables - - KernelArchState -} - -// Hooks are hooks for kernel functions. -type Hooks interface { - // KernelSyscall is called for kernel system calls. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelSyscall() - - // KernelException handles an exception during kernel execution. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelException(Vector) -} - -// CPU is the per-CPU struct. -type CPU struct { - // self is a self reference. - // - // This is always guaranteed to be at offset zero. - self *CPU - - // kernel is reference to the kernel that this CPU was initialized - // with. This reference is kept for garbage collection purposes: CPU - // registers may refer to objects within the Kernel object that cannot - // be safely freed. - kernel *Kernel - - // CPUArchState is architecture-specific state. - CPUArchState - - // registers is a set of registers; these may be used on kernel system - // calls and exceptions via the Registers function. - registers arch.Registers - - // hooks are kernel hooks. - hooks Hooks -} - -// Registers returns a modifiable-copy of the kernel registers. -// -// This is explicitly safe to call during KernelException and KernelSyscall. -// -//go:nosplit -func (c *CPU) Registers() *arch.Registers { - return &c.registers -} - -// SwitchOpts are passed to the Switch function. -type SwitchOpts struct { - // Registers are the user register state. - Registers *arch.Registers - - // FloatingPointState is a byte pointer where floating point state is - // saved and restored. - FloatingPointState *byte - - // PageTables are the application page tables. - PageTables *pagetables.PageTables - - // Flush indicates that a TLB flush should be forced on switch. - Flush bool - - // FullRestore indicates that an iret-based restore should be used. - FullRestore bool - - // SwitchArchOpts are architecture-specific options. - SwitchArchOpts -} diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go deleted file mode 100644 index 7a2275558..000000000 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/usermem" -) - -var ( - // UserspaceSize is the total size of userspace. - UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) - - // MaximumUserAddress is the largest possible user address. - MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) - - // KernelStartAddress is the starting kernel address. - KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) -) - -// Segment indices and Selectors. -const ( - // Index into GDT array. - _ = iota // Null descriptor first. - _ // Reserved (Linux is kernel 32). - segKcode // Kernel code (64-bit). - segKdata // Kernel data. - segUcode32 // User code (32-bit). - segUdata // User data. - segUcode64 // User code (64-bit). - segTss // Task segment descriptor. - segTssHi // Upper bits for TSS. - segLast // Last segment (terminal, not included). -) - -// Selectors. -const ( - Kcode Selector = segKcode << 3 - Kdata Selector = segKdata << 3 - Ucode32 Selector = (segUcode32 << 3) | 3 - Udata Selector = (segUdata << 3) | 3 - Ucode64 Selector = (segUcode64 << 3) | 3 - Tss Selector = segTss << 3 -) - -// Standard segments. -var ( - UserCodeSegment32 SegmentDescriptor - UserDataSegment SegmentDescriptor - UserCodeSegment64 SegmentDescriptor - KernelCodeSegment SegmentDescriptor - KernelDataSegment SegmentDescriptor -) - -// KernelArchState contains architecture-specific state. -type KernelArchState struct { - // cpuEntries is array of kernelEntry for all cpus. - cpuEntries []kernelEntry - - // globalIDT is our set of interrupt gates. - globalIDT *idt64 -} - -// kernelEntry contains minimal CPU-specific arch state -// that can be mapped at the upper of the address space. -// Malicious APP might steal info from it via CPU bugs. -type kernelEntry struct { - // stack is the stack used for interrupts on this CPU. - stack [256]byte - - // scratch space for temporary usage. - scratch0 uint64 - - // stackTop is the top of the stack. - stackTop uint64 - - // cpuSelf is back reference to CPU. - cpuSelf *CPU - - // kernelCR3 is the cr3 used for sentry kernel. - kernelCR3 uintptr - - // gdt is the CPU's descriptor table. - gdt descriptorTable - - // tss is the CPU's task state. - tss TaskState64 -} - -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { - // errorCode is the error code from the last exception. - errorCode uintptr - - // errorType indicates the type of error code here, it is always set - // along with the errorCode value above. - // - // It will either by 1, which indicates a user error, or 0 indicating a - // kernel error. If the error code below returns false (kernel error), - // then it cannot provide relevant information about the last - // exception. - errorType uintptr - - *kernelEntry -} - -// ErrorCode returns the last error code. -// -// The returned boolean indicates whether the error code corresponds to the -// last user error or not. If it does not, then fault information must be -// ignored. This is generally the result of a kernel fault while servicing a -// user fault. -// -//go:nosplit -func (c *CPU) ErrorCode() (value uintptr, user bool) { - return c.errorCode, c.errorType != 0 -} - -// ClearErrorCode resets the error code. -// -//go:nosplit -func (c *CPU) ClearErrorCode() { - c.errorCode = 0 // No code. - c.errorType = 1 // User mode. -} - -// SwitchArchOpts are embedded in SwitchOpts. -type SwitchArchOpts struct { - // UserPCID indicates that the application PCID to be used on switch, - // assuming that PCIDs are supported. - // - // Per pagetables_x86.go, a zero PCID implies a flush. - UserPCID uint16 - - // KernelPCID indicates that the kernel PCID to be used on return, - // assuming that PCIDs are supported. - // - // Per pagetables_x86.go, a zero PCID implies a flush. - KernelPCID uint16 -} - -func init() { - KernelCodeSegment.setCode64(0, 0, 0) - KernelDataSegment.setData(0, 0xffffffff, 0) - UserCodeSegment32.setCode64(0, 0, 3) - UserDataSegment.setData(0, 0xffffffff, 3) - UserCodeSegment64.setCode64(0, 0, 3) -} diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go deleted file mode 100644 index a014dcbc0..000000000 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/usermem" -) - -var ( - // UserspaceSize is the total size of userspace. - UserspaceSize = uintptr(1) << (VirtualAddressBits()) - - // MaximumUserAddress is the largest possible user address. - MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) - - // KernelStartAddress is the starting kernel address. - KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) -) - -// KernelArchState contains architecture-specific state. -type KernelArchState struct { -} - -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { - // stack is the stack used for interrupts on this CPU. - stack [512]byte - - // errorCode is the error code from the last exception. - errorCode uintptr - - // errorType indicates the type of error code here, it is always set - // along with the errorCode value above. - // - // It will either by 1, which indicates a user error, or 0 indicating a - // kernel error. If the error code below returns false (kernel error), - // then it cannot provide relevant information about the last - // exception. - errorType uintptr - - // faultAddr is the value of far_el1. - faultAddr uintptr - - // ttbr0Kvm is the value of ttbr0_el1 for sentry. - ttbr0Kvm uintptr - - // ttbr0App is the value of ttbr0_el1 for applicaton. - ttbr0App uintptr - - // exception vector. - vecCode Vector - - // application context pointer. - appAddr uintptr - - // lazyVFP is the value of cpacr_el1. - lazyVFP uintptr - - // appASID is the asid value of guest application. - appASID uintptr -} - -// ErrorCode returns the last error code. -// -// The returned boolean indicates whether the error code corresponds to the -// last user error or not. If it does not, then fault information must be -// ignored. This is generally the result of a kernel fault while servicing a -// user fault. -// -//go:nosplit -func (c *CPU) ErrorCode() (value uintptr, user bool) { - return c.errorCode, c.errorType != 0 -} - -// ClearErrorCode resets the error code. -// -//go:nosplit -func (c *CPU) ClearErrorCode() { - c.errorCode = 0 // No code. - c.errorType = 1 // User mode. -} - -//go:nosplit -func (c *CPU) GetFaultAddr() (value uintptr) { - return c.faultAddr -} - -//go:nosplit -func (c *CPU) SetTtbr0Kvm(value uintptr) { - c.ttbr0Kvm = value -} - -//go:nosplit -func (c *CPU) SetTtbr0App(value uintptr) { - c.ttbr0App = value -} - -//go:nosplit -func (c *CPU) GetVector() (value Vector) { - return c.vecCode -} - -//go:nosplit -func (c *CPU) SetAppAddr(value uintptr) { - c.appAddr = value -} - -// GetLazyVFP returns the value of cpacr_el1. -//go:nosplit -func (c *CPU) GetLazyVFP() (value uintptr) { - return c.lazyVFP -} - -// SwitchArchOpts are embedded in SwitchOpts. -type SwitchArchOpts struct { - // UserASID indicates that the application ASID to be used on switch, - UserASID uint16 - - // KernelASID indicates that the kernel ASID to be used on return, - KernelASID uint16 -} - -func init() { -} diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go deleted file mode 100644 index d87b1fd00..000000000 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" -) - -// This is an assembly function. -// -// The sysenter function is invoked in two situations: -// -// (1) The guest kernel has executed a system call. -// (2) The guest application has executed a system call. -// -// The interrupt flag is examined to determine whether the system call was -// executed from kernel mode or not and the appropriate stub is called. -func sysenter() - -// swapgs swaps the current GS value. -// -// This must be called prior to sysret/iret. -func swapgs() - -// jumpToKernel jumps to the kernel version of the current RIP. -func jumpToKernel() - -// sysret returns to userspace from a system call. -// -// The return code is the vector that interrupted execution. -// -// See stubs.go for a note regarding the frame size of this function. -func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector - -// "iret is the cadillac of CPL switching." -// -// -- Neel Natu -// -// iret is nearly identical to sysret, except an iret is used to fully restore -// all user state. This must be called in cases where all registers need to be -// restored. -func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector - -// exception is the generic exception entry. -// -// This is called by the individual stub definitions. -func exception() - -// resume is a stub that restores the CPU kernel registers. -// -// This is used when processing kernel exceptions and syscalls. -func resume() - -// Start is the CPU entrypoint. -// -// The following start conditions must be satisfied: -// -// * AX should contain the CPU pointer. -// * c.GDT() should be loaded as the GDT. -// * c.IDT() should be loaded as the IDT. -// * c.CR0() should be the current CR0 value. -// * c.CR3() should be set to the kernel PageTables. -// * c.CR4() should be the current CR4 value. -// * c.EFER() should be the current EFER value. -// -// The CPU state will be set to c.Registers(). -func Start() - -// Exception stubs. -func divideByZero() -func debug() -func nmi() -func breakpoint() -func overflow() -func boundRangeExceeded() -func invalidOpcode() -func deviceNotAvailable() -func doubleFault() -func coprocessorSegmentOverrun() -func invalidTSS() -func segmentNotPresent() -func stackSegmentFault() -func generalProtectionFault() -func pageFault() -func x87FloatingPointException() -func alignmentCheck() -func machineCheck() -func simdFloatingPointException() -func virtualizationException() -func securityException() -func syscallInt80() - -// Exception handler index. -var handlers = map[Vector]func(){ - DivideByZero: divideByZero, - Debug: debug, - NMI: nmi, - Breakpoint: breakpoint, - Overflow: overflow, - BoundRangeExceeded: boundRangeExceeded, - InvalidOpcode: invalidOpcode, - DeviceNotAvailable: deviceNotAvailable, - DoubleFault: doubleFault, - CoprocessorSegmentOverrun: coprocessorSegmentOverrun, - InvalidTSS: invalidTSS, - SegmentNotPresent: segmentNotPresent, - StackSegmentFault: stackSegmentFault, - GeneralProtectionFault: generalProtectionFault, - PageFault: pageFault, - X87FloatingPointException: x87FloatingPointException, - AlignmentCheck: alignmentCheck, - MachineCheck: machineCheck, - SIMDFloatingPointException: simdFloatingPointException, - VirtualizationException: virtualizationException, - SecurityException: securityException, - SyscallInt80: syscallInt80, -} diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s deleted file mode 100644 index f59747df3..000000000 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "funcdata.h" -#include "textflag.h" - -// NB: Offsets are programmatically generated (see BUILD). -// -// This file is concatenated with the definitions. - -// Saves a register set. -// -// This is a macro because it may need to executed in contents where a stack is -// not available for calls. -// -// The following registers are not saved: AX, SP, IP, FLAGS, all segments. -#define REGISTERS_SAVE(reg, offset) \ - MOVQ R15, offset+PTRACE_R15(reg); \ - MOVQ R14, offset+PTRACE_R14(reg); \ - MOVQ R13, offset+PTRACE_R13(reg); \ - MOVQ R12, offset+PTRACE_R12(reg); \ - MOVQ BP, offset+PTRACE_RBP(reg); \ - MOVQ BX, offset+PTRACE_RBX(reg); \ - MOVQ CX, offset+PTRACE_RCX(reg); \ - MOVQ DX, offset+PTRACE_RDX(reg); \ - MOVQ R11, offset+PTRACE_R11(reg); \ - MOVQ R10, offset+PTRACE_R10(reg); \ - MOVQ R9, offset+PTRACE_R9(reg); \ - MOVQ R8, offset+PTRACE_R8(reg); \ - MOVQ SI, offset+PTRACE_RSI(reg); \ - MOVQ DI, offset+PTRACE_RDI(reg); - -// Loads a register set. -// -// This is a macro because it may need to executed in contents where a stack is -// not available for calls. -// -// The following registers are not loaded: AX, SP, IP, FLAGS, all segments. -#define REGISTERS_LOAD(reg, offset) \ - MOVQ offset+PTRACE_R15(reg), R15; \ - MOVQ offset+PTRACE_R14(reg), R14; \ - MOVQ offset+PTRACE_R13(reg), R13; \ - MOVQ offset+PTRACE_R12(reg), R12; \ - MOVQ offset+PTRACE_RBP(reg), BP; \ - MOVQ offset+PTRACE_RBX(reg), BX; \ - MOVQ offset+PTRACE_RCX(reg), CX; \ - MOVQ offset+PTRACE_RDX(reg), DX; \ - MOVQ offset+PTRACE_R11(reg), R11; \ - MOVQ offset+PTRACE_R10(reg), R10; \ - MOVQ offset+PTRACE_R9(reg), R9; \ - MOVQ offset+PTRACE_R8(reg), R8; \ - MOVQ offset+PTRACE_RSI(reg), SI; \ - MOVQ offset+PTRACE_RDI(reg), DI; - -// WRITE_CR3() writes the given CR3 value. -// -// The code corresponds to: -// -// mov %rax, %cr3 -// -#define WRITE_CR3() \ - BYTE $0x0f; BYTE $0x22; BYTE $0xd8; - -// SWAP_GS swaps the kernel GS (CPU). -#define SWAP_GS() \ - BYTE $0x0F; BYTE $0x01; BYTE $0xf8; - -// IRET returns from an interrupt frame. -#define IRET() \ - BYTE $0x48; BYTE $0xcf; - -// SYSRET64 executes the sysret instruction. -#define SYSRET64() \ - BYTE $0x48; BYTE $0x0f; BYTE $0x07; - -// LOAD_KERNEL_STACK loads the kernel stack. -#define LOAD_KERNEL_STACK(entry) \ - MOVQ ENTRY_STACK_TOP(entry), SP; - -// See kernel.go. -TEXT ·Halt(SB),NOSPLIT,$0 - HLT - RET - -// See entry_amd64.go. -TEXT ·swapgs(SB),NOSPLIT,$0 - SWAP_GS() - RET - -// jumpToKernel changes execution to the kernel address space. -// -// This works by changing the return value to the kernel version. -TEXT ·jumpToKernel(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - ORQ ·KernelStartAddress(SB), AX // Future return value. - MOVQ AX, 0(SP) - RET - -// See entry_amd64.go. -TEXT ·sysret(SB),NOSPLIT,$0-24 - CALL ·jumpToKernel(SB) - // Save original state and stack. sysenter() or exception() - // from APP(gr3) will switch to this stack, set the return - // value (vector: 32(SP)) and then do RET, which will also - // automatically return to the lower half. - MOVQ cpu+0(FP), BX - MOVQ regs+8(FP), AX - MOVQ userCR3+16(FP), CX - MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) - MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) - - // save SP AX userCR3 on the kernel stack. - MOVQ CPU_ENTRY(BX), BX - LOAD_KERNEL_STACK(BX) - PUSHQ PTRACE_RSP(AX) - PUSHQ PTRACE_RAX(AX) - PUSHQ CX - - // Restore user register state. - REGISTERS_LOAD(AX, 0) - MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. - MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. - - // restore userCR3, AX, SP. - POPQ AX // Get userCR3. - WRITE_CR3() // Switch to userCR3. - POPQ AX // Restore AX. - POPQ SP // Restore SP. - SYSRET64() - -// See entry_amd64.go. -TEXT ·iret(SB),NOSPLIT,$0-24 - CALL ·jumpToKernel(SB) - // Save original state and stack. sysenter() or exception() - // from APP(gr3) will switch to this stack, set the return - // value (vector: 32(SP)) and then do RET, which will also - // automatically return to the lower half. - MOVQ cpu+0(FP), BX - MOVQ regs+8(FP), AX - MOVQ userCR3+16(FP), CX - MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) - MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) - - // Build an IRET frame & restore state. - MOVQ CPU_ENTRY(BX), BX - LOAD_KERNEL_STACK(BX) - PUSHQ PTRACE_SS(AX) - PUSHQ PTRACE_RSP(AX) - PUSHQ PTRACE_FLAGS(AX) - PUSHQ PTRACE_CS(AX) - PUSHQ PTRACE_RIP(AX) - PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack. - PUSHQ CX // Save userCR3 on kernel stack. - REGISTERS_LOAD(AX, 0) // Restore most registers. - POPQ AX // Get userCR3. - WRITE_CR3() // Switch to userCR3. - POPQ AX // Restore AX. - IRET() - -// See entry_amd64.go. -TEXT ·resume(SB),NOSPLIT,$0 - // See iret, above. - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - PUSHQ CPU_REGISTERS+PTRACE_SS(AX) - PUSHQ CPU_REGISTERS+PTRACE_RSP(AX) - PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX) - PUSHQ CPU_REGISTERS+PTRACE_CS(AX) - PUSHQ CPU_REGISTERS+PTRACE_RIP(AX) - REGISTERS_LOAD(AX, CPU_REGISTERS) - MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX - IRET() - -// See entry_amd64.go. -TEXT ·Start(SB),NOSPLIT,$0 - PUSHQ $0x0 // Previous frame pointer. - MOVQ SP, BP // Set frame pointer. - PUSHQ AX // First argument (CPU). - CALL ·start(SB) // Call Go hook. - JMP ·resume(SB) // Restore to registers. - -// See entry_amd64.go. -TEXT ·sysenter(SB),NOSPLIT,$0 - // _RFLAGS_IOPL0 is always set in the user mode and it is never set in - // the kernel mode. See the comment of UserFlagsSet for more details. - TESTL $_RFLAGS_IOPL0, R11 - JZ kernel -user: - SWAP_GS() - MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. - MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. - WRITE_CR3() // Switch to kernel cr3. - - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. - REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CX, PTRACE_RIP(AX) - MOVQ R11, PTRACE_FLAGS(AX) - MOVQ SP, PTRACE_RSP(AX) - MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value. - MOVQ CX, PTRACE_RAX(AX) // Save everything else. - MOVQ CX, PTRACE_ORIGRAX(AX) - - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks. - MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. - MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. - - // Return to the kernel, where the frame is: - // - // vector (sp+32) - // userCR3 (sp+24) - // regs (sp+16) - // cpu (sp+8) - // vcpu.Switch (sp+0) - // - MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. - MOVQ $Syscall, 32(SP) // Output vector. - RET - -kernel: - // We can't restore the original stack, but we can access the registers - // in the CPU state directly. No need for temporary juggling. - MOVQ AX, ENTRY_SCRATCH0(GS) - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - REGISTERS_SAVE(AX, CPU_REGISTERS) - MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX) - MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX) - MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX) - MOVQ ENTRY_SCRATCH0(GS), BX - MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) - MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) - MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. - MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. - - // Call the syscall trampoline. - LOAD_KERNEL_STACK(GS) - PUSHQ AX // First argument (vCPU). - CALL ·kernelSyscall(SB) // Call the trampoline. - POPQ AX // Pop vCPU. - JMP ·resume(SB) - -// exception is a generic exception handler. -// -// There are two cases handled: -// -// 1) An exception in kernel mode: this results in saving the state at the time -// of the exception and calling the defined hook. -// -// 2) An exception in guest mode: the original kernel frame is restored, and -// the vector & error codes are pushed as return values. -// -// See below for the stubs that call exception. -TEXT ·exception(SB),NOSPLIT,$0 - // Determine whether the exception occurred in kernel mode or user - // mode, based on the flags. We expect the following stack: - // - // SS (sp+48) - // SP (sp+40) - // FLAGS (sp+32) - // CS (sp+24) - // IP (sp+16) - // ERROR_CODE (sp+8) - // VECTOR (sp+0) - // - TESTL $_RFLAGS_IOPL0, 32(SP) - JZ kernel - -user: - SWAP_GS() - ADDQ $-8, SP // Adjust for flags. - MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). - PUSHQ AX // Save user AX on stack. - MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. - WRITE_CR3() // Switch to kernel cr3. - - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. - REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - POPQ BX // Restore original AX. - MOVQ BX, PTRACE_RAX(AX) // Save it. - MOVQ BX, PTRACE_ORIGRAX(AX) - MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) - MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX) - MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX) - MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) - MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) - - // Copy out and return. - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - MOVQ 0(SP), BX // Load vector. - MOVQ 8(SP), CX // Load error code. - MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version). - MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. - MOVQ CX, CPU_ERROR_CODE(AX) // Set error code. - MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. - MOVQ BX, 32(SP) // Output vector. - RET - -kernel: - // As per above, we can save directly. - PUSHQ AX - MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. - REGISTERS_SAVE(AX, CPU_REGISTERS) - POPQ BX - MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) - MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) - MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX) - MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX) - MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX) - - // Set the error code and adjust the stack. - MOVQ 8(SP), BX // Load the error code. - MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. - MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. - MOVQ 0(SP), BX // BX contains the vector. - - // Call the exception trampoline. - LOAD_KERNEL_STACK(GS) - PUSHQ BX // Second argument (vector). - PUSHQ AX // First argument (vCPU). - CALL ·kernelException(SB) // Call the trampoline. - POPQ BX // Pop vector. - POPQ AX // Pop vCPU. - JMP ·resume(SB) - -#define EXCEPTION_WITH_ERROR(value, symbol) \ -TEXT symbol,NOSPLIT,$0; \ - PUSHQ $value; \ - JMP ·exception(SB); - -#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ -TEXT symbol,NOSPLIT,$0; \ - PUSHQ $0x0; \ - PUSHQ $value; \ - JMP ·exception(SB); - -EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) -EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) -EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) -EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) -EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) -EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) -EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) -EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) -EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) -EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) -EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) -EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) -EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) -EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) -EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) -EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) -EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) -EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) -EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) -EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) -EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) -EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) diff --git a/pkg/sentry/platform/ring0/entry_arm64.go b/pkg/sentry/platform/ring0/entry_arm64.go deleted file mode 100644 index 62a93f3d6..000000000 --- a/pkg/sentry/platform/ring0/entry_arm64.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -// This is an assembly function. -// -// The sysenter function is invoked in two situations: -// -// (1) The guest kernel has executed a system call. -// (2) The guest application has executed a system call. -// -// The interrupt flag is examined to determine whether the system call was -// executed from kernel mode or not and the appropriate stub is called. - -func El1_sync_invalid() -func El1_irq_invalid() -func El1_fiq_invalid() -func El1_error_invalid() - -func El1_sync() -func El1_irq() -func El1_fiq() -func El1_error() - -func El0_sync() -func El0_irq() -func El0_fiq() -func El0_error() - -func El0_sync_invalid() -func El0_irq_invalid() -func El0_fiq_invalid() -func El0_error_invalid() - -func Vectors() - -// Start is the CPU entrypoint. -// -// The CPU state will be set to c.Registers(). -func Start() -func kernelExitToEl1() - -func kernelExitToEl0() - -// Shutdown execution -func Shutdown() diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s deleted file mode 100644 index b2bb18257..000000000 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ /dev/null @@ -1,769 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "funcdata.h" -#include "textflag.h" - -// NB: Offsets are programatically generated (see BUILD). -// -// This file is concatenated with the definitions. - -// Saves a register set. -// -// This is a macro because it may need to executed in contents where a stack is -// not available for calls. -// - -// ERET returns using the ELR and SPSR for the current exception level. -#define ERET() \ - WORD $0xd69f03e0; \ - DSB $7; \ - ISB $15; - -// RSV_REG is a register that holds el1 information temporarily. -#define RSV_REG R18_PLATFORM - -// RSV_REG_APP is a register that holds el0 information temporarily. -#define RSV_REG_APP R9 - -#define FPEN_NOTRAP 0x3 -#define FPEN_SHIFT 20 - -#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) - -// sctlr_el1: system control register el1. -#define SCTLR_M 1 << 0 -#define SCTLR_C 1 << 2 -#define SCTLR_I 1 << 12 -#define SCTLR_DZE 1 << 14 -#define SCTLR_UCT 1 << 15 -#define SCTLR_UCI 1 << 26 - -#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI | SCTLR_DZE) - -// cntkctl_el1: counter-timer kernel control register el1. -#define CNTKCTL_EL0PCTEN 1 << 0 -#define CNTKCTL_EL0VCTEN 1 << 1 - -#define CNTKCTL_EL1_DEFAULT (CNTKCTL_EL0PCTEN | CNTKCTL_EL0VCTEN) - -// Saves a register set. -// -// This is a macro because it may need to executed in contents where a stack is -// not available for calls. -// -// The following registers are not saved: R9, R18. -#define REGISTERS_SAVE(reg, offset) \ - MOVD R0, offset+PTRACE_R0(reg); \ - MOVD R1, offset+PTRACE_R1(reg); \ - MOVD R2, offset+PTRACE_R2(reg); \ - MOVD R3, offset+PTRACE_R3(reg); \ - MOVD R4, offset+PTRACE_R4(reg); \ - MOVD R5, offset+PTRACE_R5(reg); \ - MOVD R6, offset+PTRACE_R6(reg); \ - MOVD R7, offset+PTRACE_R7(reg); \ - MOVD R8, offset+PTRACE_R8(reg); \ - MOVD R10, offset+PTRACE_R10(reg); \ - MOVD R11, offset+PTRACE_R11(reg); \ - MOVD R12, offset+PTRACE_R12(reg); \ - MOVD R13, offset+PTRACE_R13(reg); \ - MOVD R14, offset+PTRACE_R14(reg); \ - MOVD R15, offset+PTRACE_R15(reg); \ - MOVD R16, offset+PTRACE_R16(reg); \ - MOVD R17, offset+PTRACE_R17(reg); \ - MOVD R19, offset+PTRACE_R19(reg); \ - MOVD R20, offset+PTRACE_R20(reg); \ - MOVD R21, offset+PTRACE_R21(reg); \ - MOVD R22, offset+PTRACE_R22(reg); \ - MOVD R23, offset+PTRACE_R23(reg); \ - MOVD R24, offset+PTRACE_R24(reg); \ - MOVD R25, offset+PTRACE_R25(reg); \ - MOVD R26, offset+PTRACE_R26(reg); \ - MOVD R27, offset+PTRACE_R27(reg); \ - MOVD g, offset+PTRACE_R28(reg); \ - MOVD R29, offset+PTRACE_R29(reg); \ - MOVD R30, offset+PTRACE_R30(reg); - -// Loads a register set. -// -// This is a macro because it may need to executed in contents where a stack is -// not available for calls. -// -// The following registers are not loaded: R9, R18. -#define REGISTERS_LOAD(reg, offset) \ - MOVD offset+PTRACE_R0(reg), R0; \ - MOVD offset+PTRACE_R1(reg), R1; \ - MOVD offset+PTRACE_R2(reg), R2; \ - MOVD offset+PTRACE_R3(reg), R3; \ - MOVD offset+PTRACE_R4(reg), R4; \ - MOVD offset+PTRACE_R5(reg), R5; \ - MOVD offset+PTRACE_R6(reg), R6; \ - MOVD offset+PTRACE_R7(reg), R7; \ - MOVD offset+PTRACE_R8(reg), R8; \ - MOVD offset+PTRACE_R10(reg), R10; \ - MOVD offset+PTRACE_R11(reg), R11; \ - MOVD offset+PTRACE_R12(reg), R12; \ - MOVD offset+PTRACE_R13(reg), R13; \ - MOVD offset+PTRACE_R14(reg), R14; \ - MOVD offset+PTRACE_R15(reg), R15; \ - MOVD offset+PTRACE_R16(reg), R16; \ - MOVD offset+PTRACE_R17(reg), R17; \ - MOVD offset+PTRACE_R19(reg), R19; \ - MOVD offset+PTRACE_R20(reg), R20; \ - MOVD offset+PTRACE_R21(reg), R21; \ - MOVD offset+PTRACE_R22(reg), R22; \ - MOVD offset+PTRACE_R23(reg), R23; \ - MOVD offset+PTRACE_R24(reg), R24; \ - MOVD offset+PTRACE_R25(reg), R25; \ - MOVD offset+PTRACE_R26(reg), R26; \ - MOVD offset+PTRACE_R27(reg), R27; \ - MOVD offset+PTRACE_R28(reg), g; \ - MOVD offset+PTRACE_R29(reg), R29; \ - MOVD offset+PTRACE_R30(reg), R30; - -#define ESR_ELx_EC_UNKNOWN (0x00) -#define ESR_ELx_EC_WFx (0x01) -/* Unallocated EC: 0x02 */ -#define ESR_ELx_EC_CP15_32 (0x03) -#define ESR_ELx_EC_CP15_64 (0x04) -#define ESR_ELx_EC_CP14_MR (0x05) -#define ESR_ELx_EC_CP14_LS (0x06) -#define ESR_ELx_EC_FP_ASIMD (0x07) -#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ -#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ -/* Unallocated EC: 0x0A - 0x0B */ -#define ESR_ELx_EC_CP14_64 (0x0C) -/* Unallocated EC: 0x0d */ -#define ESR_ELx_EC_ILL (0x0E) -/* Unallocated EC: 0x0F - 0x10 */ -#define ESR_ELx_EC_SVC32 (0x11) -#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ -#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ -/* Unallocated EC: 0x14 */ -#define ESR_ELx_EC_SVC64 (0x15) -#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ -#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ -#define ESR_ELx_EC_SYS64 (0x18) -#define ESR_ELx_EC_SVE (0x19) -/* Unallocated EC: 0x1A - 0x1E */ -#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ -#define ESR_ELx_EC_IABT_LOW (0x20) -#define ESR_ELx_EC_IABT_CUR (0x21) -#define ESR_ELx_EC_PC_ALIGN (0x22) -/* Unallocated EC: 0x23 */ -#define ESR_ELx_EC_DABT_LOW (0x24) -#define ESR_ELx_EC_DABT_CUR (0x25) -#define ESR_ELx_EC_SP_ALIGN (0x26) -/* Unallocated EC: 0x27 */ -#define ESR_ELx_EC_FP_EXC32 (0x28) -/* Unallocated EC: 0x29 - 0x2B */ -#define ESR_ELx_EC_FP_EXC64 (0x2C) -/* Unallocated EC: 0x2D - 0x2E */ -#define ESR_ELx_EC_SERROR (0x2F) -#define ESR_ELx_EC_BREAKPT_LOW (0x30) -#define ESR_ELx_EC_BREAKPT_CUR (0x31) -#define ESR_ELx_EC_SOFTSTP_LOW (0x32) -#define ESR_ELx_EC_SOFTSTP_CUR (0x33) -#define ESR_ELx_EC_WATCHPT_LOW (0x34) -#define ESR_ELx_EC_WATCHPT_CUR (0x35) -/* Unallocated EC: 0x36 - 0x37 */ -#define ESR_ELx_EC_BKPT32 (0x38) -/* Unallocated EC: 0x39 */ -#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ -/* Unallocted EC: 0x3B */ -#define ESR_ELx_EC_BRK64 (0x3C) -/* Unallocated EC: 0x3D - 0x3F */ -#define ESR_ELx_EC_MAX (0x3F) - -#define ESR_ELx_EC_SHIFT (26) -#define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) -#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) - -#define ESR_ELx_IL_SHIFT (25) -#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) -#define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) - -/* ISS field definitions shared by different classes */ -#define ESR_ELx_WNR_SHIFT (6) -#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) - -/* Asynchronous Error Type */ -#define ESR_ELx_IDS_SHIFT (24) -#define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) -#define ESR_ELx_AET_SHIFT (10) -#define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) - -#define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) -#define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) -#define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) -#define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) -#define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) - -/* Shared ISS field definitions for Data/Instruction aborts */ -#define ESR_ELx_SET_SHIFT (11) -#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) -#define ESR_ELx_FnV_SHIFT (10) -#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) -#define ESR_ELx_EA_SHIFT (9) -#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) -#define ESR_ELx_S1PTW_SHIFT (7) -#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) - -/* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ -#define ESR_ELx_FSC (0x3F) -#define ESR_ELx_FSC_TYPE (0x3C) -#define ESR_ELx_FSC_EXTABT (0x10) -#define ESR_ELx_FSC_SERROR (0x11) -#define ESR_ELx_FSC_ACCESS (0x08) -#define ESR_ELx_FSC_FAULT (0x04) -#define ESR_ELx_FSC_PERM (0x0C) - -/* ISS field definitions for Data Aborts */ -#define ESR_ELx_ISV_SHIFT (24) -#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) -#define ESR_ELx_SAS_SHIFT (22) -#define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) -#define ESR_ELx_SSE_SHIFT (21) -#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) -#define ESR_ELx_SRT_SHIFT (16) -#define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) -#define ESR_ELx_SF_SHIFT (15) -#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) -#define ESR_ELx_AR_SHIFT (14) -#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) -#define ESR_ELx_CM_SHIFT (8) -#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) - -/* ISS field definitions for exceptions taken in to Hyp */ -#define ESR_ELx_CV (UL(1) << 24) -#define ESR_ELx_COND_SHIFT (20) -#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) -#define ESR_ELx_WFx_ISS_TI (UL(1) << 0) -#define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) -#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) -#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) - -/* ISS field definitions for system error */ -#define ESR_ELx_SERR_MASK (0x1) -#define ESR_ELx_SERR_NMI (0x1) - -// LOAD_KERNEL_ADDRESS loads a kernel address. -#define LOAD_KERNEL_ADDRESS(from, to) \ - MOVD from, to; \ - ORR $0xffff000000000000, to, to; - -// LOAD_KERNEL_STACK loads the kernel temporary stack. -#define LOAD_KERNEL_STACK(from) \ - LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ - MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ - MOVD RSV_REG, RSP; \ - WORD $0xd538d092; //MRS TPIDR_EL1, R18 - -// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application. -#define SWITCH_TO_APP_PAGETABLE() \ - MOVD CPU_APP_ASID(RSV_REG), RSV_REG_APP; \ - MOVD CPU_TTBR0_APP(RSV_REG), RSV_REG; \ - BFI $48, RSV_REG_APP, $16, RSV_REG; \ - MSR RSV_REG, TTBR0_EL1; \ - ISB $15; - -// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable. -#define SWITCH_TO_KVM_PAGETABLE() \ - MOVD CPU_TTBR0_KVM(RSV_REG), RSV_REG; \ - MOVD $1, RSV_REG_APP; \ - BFI $48, RSV_REG_APP, $16, RSV_REG; \ - MSR RSV_REG, TTBR0_EL1; \ - ISB $15; - -TEXT ·EnableVFP(SB),NOSPLIT,$0 - MOVD $FPEN_ENABLE, R0 - WORD $0xd5181040 //MSR R0, CPACR_EL1 - ISB $15 - RET - -TEXT ·DisableVFP(SB),NOSPLIT,$0 - MOVD $0, R0 - WORD $0xd5181040 //MSR R0, CPACR_EL1 - ISB $15 - RET - -#define VFP_ENABLE \ - MOVD $FPEN_ENABLE, R0; \ - WORD $0xd5181040; \ //MSR R0, CPACR_EL1 - ISB $15; - -#define VFP_DISABLE \ - MOVD $0x0, R0; \ - WORD $0xd5181040; \ //MSR R0, CPACR_EL1 - ISB $15; - -// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1. -#define KERNEL_ENTRY_FROM_EL0 \ - SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. - STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ - WORD $0xd538d092; \ // MRS TPIDR_EL1, R18 - MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer. - REGISTERS_SAVE(RSV_REG_APP, 0); \ // step3, save app context. - MOVD RSV_REG_APP, R20; \ - LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \ - ADD $16, RSP, RSP; \ - MOVD RSV_REG, PTRACE_R18(R20); \ - MOVD RSV_REG_APP, PTRACE_R9(R20); \ - MRS TPIDR_EL0, R3; \ - MOVD R3, PTRACE_TLS(R20); \ - WORD $0xd5384003; \ // MRS SPSR_EL1, R3 - MOVD R3, PTRACE_PSTATE(R20); \ - MRS ELR_EL1, R3; \ - MOVD R3, PTRACE_PC(R20); \ - WORD $0xd5384103; \ // MRS SP_EL0, R3 - MOVD R3, PTRACE_SP(R20); - -// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1. -#define KERNEL_ENTRY_FROM_EL1 \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 - REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context. - MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \ - MRS TPIDR_EL0, R4; \ - MOVD R4, CPU_REGISTERS+PTRACE_TLS(RSV_REG); \ - WORD $0xd5384004; \ // MRS SPSR_EL1, R4 - MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \ - MRS ELR_EL1, R4; \ - MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \ - MOVD RSP, R4; \ - MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ - LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. - -// EXCEPTION_EL0 is a common el0 exception handler function. -#define EXCEPTION_EL0(vector) \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 - WORD $0xd538601a; \ //MRS FAR_EL1, R26 - MOVD R26, CPU_FAULT_ADDR(RSV_REG); \ - MOVD $1, R3; \ - MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user. - MOVD $vector, R3; \ - MOVD R3, CPU_VECTOR_CODE(RSV_REG); \ - MRS ESR_EL1, R3; \ - MOVD R3, CPU_ERROR_CODE(RSV_REG); \ - B ·kernelExitToEl1(SB); - -// EXCEPTION_EL1 is a common el1 exception handler function. -#define EXCEPTION_EL1(vector) \ - MOVD $vector, R3; \ - MOVD R3, 8(RSP); \ - B ·HaltEl1ExceptionAndResume(SB); - -// storeAppASID writes the application's asid value. -TEXT ·storeAppASID(SB),NOSPLIT,$0-8 - MOVD asid+0(FP), R1 - MRS TPIDR_EL1, RSV_REG - MOVD R1, CPU_APP_ASID(RSV_REG) - RET - -// Halt halts execution. -TEXT ·Halt(SB),NOSPLIT,$0 - // Clear bluepill. - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - CMP RSV_REG, R9 - BNE mmio_exit - MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG) - -mmio_exit: - // Disable fpsimd. - WORD $0xd5381041 // MRS CPACR_EL1, R1 - MOVD R1, CPU_LAZY_VFP(RSV_REG) - VFP_DISABLE - - // Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT. - // - // To keep it simple, I used the address of exception table as the - // MMIO base address, so that I can trigger a MMIO-EXIT by forcibly writing - // a read-only space. - // Also, the length is engough to match a sufficient number of hypercall ID. - // Then, in host user space, I can calculate this address to find out - // which hypercall. - MRS VBAR_EL1, R9 - MOVD R0, 0x0(R9) - - RET - -// HaltAndResume halts execution and point the pointer to the resume function. -TEXT ·HaltAndResume(SB),NOSPLIT,$0 - BL ·Halt(SB) - B ·kernelExitToEl1(SB) // Resume. - -// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume. -TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 - WORD $0xd538d092 // MRS TPIDR_EL1, R18 - MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. - MOVD R3, 8(RSP) // First argument (vCPU). - CALL ·kernelSyscall(SB) // Call the trampoline. - B ·kernelExitToEl1(SB) // Resume. - -// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. -TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0-8 - WORD $0xd538d092 // MRS TPIDR_EL1, R18 - MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. - MOVD R3, 8(RSP) // First argument (vCPU). - MOVD vector+0(FP), R3 - MOVD R3, 16(RSP) // Second argument (vector). - CALL ·kernelException(SB) // Call the trampoline. - B ·kernelExitToEl1(SB) // Resume. - -// Shutdown stops the guest. -TEXT ·Shutdown(SB),NOSPLIT,$0 - // PSCI EVENT. - MOVD $0x84000009, R0 - HVC $0 - -// See kernel.go. -TEXT ·Current(SB),NOSPLIT,$0-8 - MOVD CPU_SELF(RSV_REG), R8 - MOVD R8, ret+0(FP) - RET - -#define STACK_FRAME_SIZE 32 - -// kernelExitToEl0 is the entrypoint for application in guest_el0. -// Prepare the vcpu environment for container application. -TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 - // Step1, save sentry context into memory. - MRS TPIDR_EL1, RSV_REG - REGISTERS_SAVE(RSV_REG, CPU_REGISTERS) - MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG) - MRS TPIDR_EL0, R3 - MOVD R3, CPU_REGISTERS+PTRACE_TLS(RSV_REG) - - WORD $0xd5384003 // MRS SPSR_EL1, R3 - MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG) - MOVD R30, CPU_REGISTERS+PTRACE_PC(RSV_REG) - MOVD RSP, R3 - MOVD R3, CPU_REGISTERS+PTRACE_SP(RSV_REG) - - MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3 - - // Step2, switch to temporary stack. - LOAD_KERNEL_STACK(RSV_REG) - - // Step3, load app context pointer. - MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP - - // Step4, prepare the environment for container application. - // set sp_el0. - MOVD PTRACE_SP(RSV_REG_APP), R1 - WORD $0xd5184101 //MSR R1, SP_EL0 - // set pc. - MOVD PTRACE_PC(RSV_REG_APP), R1 - MSR R1, ELR_EL1 - // set pstate. - MOVD PTRACE_PSTATE(RSV_REG_APP), R1 - WORD $0xd5184001 //MSR R1, SPSR_EL1 - - // need use kernel space address to excute below code, since - // after SWITCH_TO_APP_PAGETABLE the ASID is changed to app's - // ASID. - WORD $0x10000061 // ADR R1, do_exit_to_el0 - ORR $0xffff000000000000, R1, R1 - JMP (R1) - -do_exit_to_el0: - // RSV_REG & RSV_REG_APP will be loaded at the end. - REGISTERS_LOAD(RSV_REG_APP, 0) - MOVD PTRACE_TLS(RSV_REG_APP), RSV_REG - MSR RSV_REG, TPIDR_EL0 - - // switch to user pagetable. - MOVD PTRACE_R18(RSV_REG_APP), RSV_REG - MOVD PTRACE_R9(RSV_REG_APP), RSV_REG_APP - - SUB $STACK_FRAME_SIZE, RSP, RSP - STP (RSV_REG, RSV_REG_APP), 16*0(RSP) - STP (R0, R1), 16*1(RSP) - - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - - SWITCH_TO_APP_PAGETABLE() - - LDP 16*1(RSP), (R0, R1) - LDP 16*0(RSP), (RSV_REG, RSV_REG_APP) - ADD $STACK_FRAME_SIZE, RSP, RSP - - ERET() - -// kernelExitToEl1 is the entrypoint for sentry in guest_el1. -// Prepare the vcpu environment for sentry. -TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1 - WORD $0xd5184001 //MSR R1, SPSR_EL1 - - MOVD CPU_REGISTERS+PTRACE_PC(RSV_REG), R1 - MSR R1, ELR_EL1 - - // restore sentry's tls. - MOVD CPU_REGISTERS+PTRACE_TLS(RSV_REG), R1 - MSR R1, TPIDR_EL0 - - MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1 - MOVD R1, RSP - - REGISTERS_LOAD(RSV_REG, CPU_REGISTERS) - SWITCH_TO_KVM_PAGETABLE() - MRS TPIDR_EL1, RSV_REG - - MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP - - ERET() - -// Start is the CPU entrypoint. -TEXT ·Start(SB),NOSPLIT,$0 - // Init. - WORD $0xd508871f // __tlbi(vmalle1) - DSB $7 // dsb(nsh) - - MOVD $1<<12, R1 // Reset mdscr_el1 and disable - MSR R1, MDSCR_EL1 // access to the DCC from EL0 - ISB $15 - - MRS TTBR1_EL1, R1 - MSR R1, TTBR0_EL1 - ISB $15 - - MOVD $CNTKCTL_EL1_DEFAULT, R1 - MSR R1, CNTKCTL_EL1 - - MOVD R8, RSV_REG - ORR $0xffff000000000000, RSV_REG, RSV_REG - WORD $0xd518d092 //MSR R18, TPIDR_EL1 - - // Init. - MOVD $SCTLR_EL1_DEFAULT, R1 // re-enable the mmu. - MSR R1, SCTLR_EL1 - ISB $15 - WORD $0xd508751f // ic iallu - - DSB $7 // dsb(nsh) - ISB $15 - - B ·kernelExitToEl1(SB) - -// El1_sync_invalid is the handler for an invalid EL1_sync. -TEXT ·El1_sync_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El1_irq_invalid is the handler for an invalid El1_irq. -TEXT ·El1_irq_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El1_fiq_invalid is the handler for an invalid El1_fiq. -TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El1_error_invalid is the handler for an invalid El1_error. -TEXT ·El1_error_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El1_sync is the handler for El1_sync. -TEXT ·El1_sync(SB),NOSPLIT,$0 - KERNEL_ENTRY_FROM_EL1 - MRS ESR_EL1, R25 // read the syndrome register - LSR $ESR_ELx_EC_SHIFT, R25, R24 // exception class - CMP $ESR_ELx_EC_DABT_CUR, R24 - BEQ el1_da // data abort in EL1 - CMP $ESR_ELx_EC_IABT_CUR, R24 - BEQ el1_ia // instruction abort in EL1 - CMP $ESR_ELx_EC_SP_ALIGN, R24 - BEQ el1_sp_pc // stack alignment exception - CMP $ESR_ELx_EC_PC_ALIGN, R24 - BEQ el1_sp_pc // pc alignment exception - CMP $ESR_ELx_EC_UNKNOWN, R24 - BEQ el1_undef // unknown exception in EL1 - CMP $ESR_ELx_EC_SVC64, R24 - BEQ el1_svc // SVC in 64-bit state - CMP $ESR_ELx_EC_BREAKPT_CUR, R24 - BEQ el1_dbg // debug exception in EL1 - CMP $ESR_ELx_EC_FP_ASIMD, R24 - BEQ el1_fpsimd_acc // FP/ASIMD access - CMP $ESR_ELx_EC_SVE, R24 - BEQ el1_sve_acc // SVE access - B el1_invalid - -el1_da: - EXCEPTION_EL1(El1SyncDa) -el1_ia: - EXCEPTION_EL1(El1SyncIa) -el1_sp_pc: - EXCEPTION_EL1(El1SyncSpPc) -el1_undef: - EXCEPTION_EL1(El1SyncUndef) -el1_svc: - B ·HaltEl1SvcAndResume(SB) -el1_dbg: - EXCEPTION_EL1(El1SyncDbg) -el1_fpsimd_acc: -el1_sve_acc: - VFP_ENABLE - B ·kernelExitToEl1(SB) // Resume. -el1_invalid: - EXCEPTION_EL1(El1SyncInv) - -// El1_irq is the handler for El1_irq. -TEXT ·El1_irq(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El1_fiq is the handler for El1_fiq. -TEXT ·El1_fiq(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El1_error is the handler for El1_error. -TEXT ·El1_error(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// El0_sync is the handler for El0_sync. -TEXT ·El0_sync(SB),NOSPLIT,$0 - KERNEL_ENTRY_FROM_EL0 - MRS ESR_EL1, R25 // read the syndrome register - LSR $ESR_ELx_EC_SHIFT, R25, R24 // exception class - CMP $ESR_ELx_EC_SVC64, R24 - BEQ el0_svc // SVC in 64-bit state - CMP $ESR_ELx_EC_DABT_LOW, R24 - BEQ el0_da // data abort in EL0 - CMP $ESR_ELx_EC_IABT_LOW, R24 - BEQ el0_ia // instruction abort in EL0 - CMP $ESR_ELx_EC_FP_ASIMD, R24 - BEQ el0_fpsimd_acc // FP/ASIMD access - CMP $ESR_ELx_EC_SVE, R24 - BEQ el0_sve_acc // SVE access - CMP $ESR_ELx_EC_FP_EXC64, R24 - BEQ el0_fpsimd_exc // FP/ASIMD exception - CMP $ESR_ELx_EC_SP_ALIGN, R24 - BEQ el0_sp_pc // stack alignment exception - CMP $ESR_ELx_EC_PC_ALIGN, R24 - BEQ el0_sp_pc // pc alignment exception - CMP $ESR_ELx_EC_UNKNOWN, R24 - BEQ el0_undef // unknown exception in EL0 - CMP $ESR_ELx_EC_BREAKPT_LOW, R24 - BEQ el0_dbg // debug exception in EL0 - CMP $ESR_ELx_EC_SYS64, R24 - BEQ el0_sys // configurable trap - CMP $ESR_ELx_EC_WFx, R24 - BEQ el0_wfx // WFX trap - B el0_invalid - -el0_svc: - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - - MOVD $0, CPU_ERROR_CODE(RSV_REG) // Clear error code. - - MOVD $1, R3 - MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. - - MOVD $Syscall, R3 - MOVD R3, CPU_VECTOR_CODE(RSV_REG) - - B ·kernelExitToEl1(SB) - -el0_da: -el0_ia: - EXCEPTION_EL0(PageFault) -el0_fpsimd_acc: - EXCEPTION_EL0(El0SyncFpsimdAcc) -el0_sve_acc: - EXCEPTION_EL0(El0SyncSveAcc) -el0_fpsimd_exc: - EXCEPTION_EL0(El0SyncFpsimdExc) -el0_sp_pc: - EXCEPTION_EL0(El0SyncSpPc) -el0_undef: - EXCEPTION_EL0(El0SyncUndef) -el0_dbg: - EXCEPTION_EL0(El0SyncDbg) -el0_sys: - EXCEPTION_EL0(El0SyncSys) -el0_wfx: - EXCEPTION_EL0(El0SyncWfx) -el0_invalid: - EXCEPTION_EL0(El0SyncInv) - -TEXT ·El0_irq(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -TEXT ·El0_fiq(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -TEXT ·El0_error(SB),NOSPLIT,$0 - KERNEL_ENTRY_FROM_EL0 - WORD $0xd5385219 // MRS ESR_EL1, R25 - AND $ESR_ELx_SERR_MASK, R25, R24 - CMP $ESR_ELx_SERR_NMI, R24 - BEQ el0_nmi - B el0_bounce - -el0_nmi: - EXCEPTION_EL0(El0ErrNMI) -el0_bounce: - EXCEPTION_EL0(VirtualizationException) - -TEXT ·El0_sync_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -TEXT ·El0_irq_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -TEXT ·El0_error_invalid(SB),NOSPLIT,$0 - B ·Shutdown(SB) - -// Vectors implements exception vector table. -// The start address of exception vector table should be 11-bits aligned. -// For detail, please refer to arm developer document: -// https://developer.arm.com/documentation/100933/0100/AArch64-exception-vector-table -// Also can refer to the code in linux kernel: arch/arm64/kernel/entry.S -TEXT ·Vectors(SB),NOSPLIT,$0 - PCALIGN $2048 - B ·El1_sync_invalid(SB) - PCALIGN $128 - B ·El1_irq_invalid(SB) - PCALIGN $128 - B ·El1_fiq_invalid(SB) - PCALIGN $128 - B ·El1_error_invalid(SB) - - PCALIGN $128 - B ·El1_sync(SB) - PCALIGN $128 - B ·El1_irq(SB) - PCALIGN $128 - B ·El1_fiq(SB) - PCALIGN $128 - B ·El1_error(SB) - - PCALIGN $128 - B ·El0_sync(SB) - PCALIGN $128 - B ·El0_irq(SB) - PCALIGN $128 - B ·El0_fiq(SB) - PCALIGN $128 - B ·El0_error(SB) - - PCALIGN $128 - B ·El0_sync_invalid(SB) - PCALIGN $128 - B ·El0_irq_invalid(SB) - PCALIGN $128 - B ·El0_fiq_invalid(SB) - PCALIGN $128 - B ·El0_error_invalid(SB) diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD deleted file mode 100644 index a9703baf6..000000000 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ /dev/null @@ -1,40 +0,0 @@ -load("//tools:defs.bzl", "go_binary") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "defs_impl_arm64", - out = "defs_impl_arm64.go", - package = "main", - template = "//pkg/sentry/platform/ring0:defs_arm64", -) - -go_template_instance( - name = "defs_impl_amd64", - out = "defs_impl_amd64.go", - package = "main", - template = "//pkg/sentry/platform/ring0:defs_amd64", -) - -go_binary( - name = "gen_offsets", - srcs = [ - "defs_impl_amd64.go", - "defs_impl_arm64.go", - "main.go", - ], - # Use the libc malloc to avoid any extra dependencies. This is required to - # pass the sentry deps test. - system_malloc = True, - visibility = [ - "//pkg/sentry/platform/kvm:__pkg__", - "//pkg/sentry/platform/ring0:__pkg__", - ], - deps = [ - "//pkg/cpuid", - "//pkg/sentry/arch", - "//pkg/sentry/platform/ring0/pagetables", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go deleted file mode 100644 index a4927da2f..000000000 --- a/pkg/sentry/platform/ring0/gen_offsets/main.go +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary gen_offsets is a helper for generating offset headers. -package main - -import ( - "os" -) - -func main() { - Emit(os.Stdout) -} diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go deleted file mode 100644 index 292f9d0cc..000000000 --- a/pkg/sentry/platform/ring0/kernel.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ring0 - -// Init initializes a new kernel. -// -//go:nosplit -func (k *Kernel) Init(maxCPUs int) { - k.init(maxCPUs) -} - -// Halt halts execution. -func Halt() - -// defaultHooks implements hooks. -type defaultHooks struct{} - -// KernelSyscall implements Hooks.KernelSyscall. -// -// +checkescape:all -// -//go:nosplit -func (defaultHooks) KernelSyscall() { - Halt() -} - -// KernelException implements Hooks.KernelException. -// -// +checkescape:all -// -//go:nosplit -func (defaultHooks) KernelException(Vector) { - Halt() -} - -// kernelSyscall is a trampoline. -// -// When in amd64, it is called with %rip on the upper half, so it can -// NOT access to any global data which is not mapped on upper and must -// call to function pointers or interfaces to switch to the lower half -// so that callee can access to global data. -// -// +checkescape:hard,stack -// -//go:nosplit -func kernelSyscall(c *CPU) { - c.hooks.KernelSyscall() -} - -// kernelException is a trampoline. -// -// When in amd64, it is called with %rip on the upper half, so it can -// NOT access to any global data which is not mapped on upper and must -// call to function pointers or interfaces to switch to the lower half -// so that callee can access to global data. -// -// +checkescape:hard,stack -// -//go:nosplit -func kernelException(c *CPU, vector Vector) { - c.hooks.KernelException(vector) -} - -// Init initializes a new CPU. -// -// Init allows embedding in other objects. -func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) { - c.self = c // Set self reference. - c.kernel = k // Set kernel reference. - c.init(cpuID) // Perform architectural init. - - // Require hooks. - if hooks != nil { - c.hooks = hooks - } else { - c.hooks = defaultHooks{} - } -} diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go deleted file mode 100644 index 36a60700e..000000000 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ /dev/null @@ -1,323 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "encoding/binary" - "reflect" - - "gvisor.dev/gvisor/pkg/usermem" -) - -// init initializes architecture-specific state. -func (k *Kernel) init(maxCPUs int) { - entrySize := reflect.TypeOf(kernelEntry{}).Size() - var ( - entries []kernelEntry - padding = 1 - ) - for { - entries = make([]kernelEntry, maxCPUs+padding-1) - totalSize := entrySize * uintptr(maxCPUs+padding-1) - addr := reflect.ValueOf(&entries[0]).Pointer() - if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize { - // The runtime forces power-of-2 alignment for allocations, and we are therefore - // safe once the first address is aligned and the chunk is at least a full page. - break - } - padding = padding << 1 - } - k.cpuEntries = entries - - k.globalIDT = &idt64{} - if reflect.TypeOf(idt64{}).Size() != usermem.PageSize { - panic("Size of globalIDT should be PageSize") - } - if reflect.ValueOf(k.globalIDT).Pointer()&(usermem.PageSize-1) != 0 { - panic("Allocated globalIDT should be page aligned") - } - - // Setup the IDT, which is uniform. - for v, handler := range handlers { - // Allow Breakpoint and Overflow to be called from all - // privilege levels. - dpl := 0 - if v == Breakpoint || v == Overflow { - dpl = 3 - } - // Note that we set all traps to use the interrupt stack, this - // is defined below when setting up the TSS. - k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */) - } -} - -// EntryRegions returns the set of kernel entry regions (must be mapped). -func (k *Kernel) EntryRegions() map[uintptr]uintptr { - regions := make(map[uintptr]uintptr) - - addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer() - size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries)) - end, _ := usermem.Addr(addr + size).RoundUp() - regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end) - - addr = reflect.ValueOf(k.globalIDT).Pointer() - size = reflect.TypeOf(idt64{}).Size() - end, _ = usermem.Addr(addr + size).RoundUp() - regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end) - - return regions -} - -// init initializes architecture-specific state. -func (c *CPU) init(cpuID int) { - c.kernelEntry = &c.kernel.cpuEntries[cpuID] - c.cpuSelf = c - // Null segment. - c.gdt[0].setNull() - - // Kernel & user segments. - c.gdt[segKcode] = KernelCodeSegment - c.gdt[segKdata] = KernelDataSegment - c.gdt[segUcode32] = UserCodeSegment32 - c.gdt[segUdata] = UserDataSegment - c.gdt[segUcode64] = UserCodeSegment64 - - // The task segment, this spans two entries. - tssBase, tssLimit, _ := c.TSS() - c.gdt[segTss].set( - uint32(tssBase), - uint32(tssLimit), - 0, // Privilege level zero. - SegmentDescriptorPresent| - SegmentDescriptorAccess| - SegmentDescriptorWrite| - SegmentDescriptorExecute) - c.gdt[segTssHi].setHi(uint32((tssBase) >> 32)) - - // Set the kernel stack pointer in the TSS (virtual address). - stackAddr := c.StackTop() - c.stackTop = stackAddr - c.tss.rsp0Lo = uint32(stackAddr) - c.tss.rsp0Hi = uint32(stackAddr >> 32) - c.tss.ist1Lo = uint32(stackAddr) - c.tss.ist1Hi = uint32(stackAddr >> 32) - - // Set the I/O bitmap base address beyond the last byte in the TSS - // to block access to the entire I/O address range. - // - // From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1: - // I/O addresses not spanned by the map are treated as if they had set - // bits in the map. - c.tss.ioPerm = tssLimit + 1 - - // Permanently set the kernel segments. - c.registers.Cs = uint64(Kcode) - c.registers.Ds = uint64(Kdata) - c.registers.Es = uint64(Kdata) - c.registers.Ss = uint64(Kdata) - c.registers.Fs = uint64(Kdata) - c.registers.Gs = uint64(Kdata) - - // Set mandatory flags. - c.registers.Eflags = KernelFlagsSet -} - -// StackTop returns the kernel's stack address. -// -//go:nosplit -func (c *CPU) StackTop() uint64 { - return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) -} - -// IDT returns the CPU's IDT base and limit. -// -//go:nosplit -func (c *CPU) IDT() (uint64, uint16) { - return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1) -} - -// GDT returns the CPU's GDT base and limit. -// -//go:nosplit -func (c *CPU) GDT() (uint64, uint16) { - return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1) -} - -// TSS returns the CPU's TSS base, limit and value. -// -//go:nosplit -func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { - return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss] -} - -// CR0 returns the CPU's CR0 value. -// -//go:nosplit -func (c *CPU) CR0() uint64 { - return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET -} - -// CR4 returns the CPU's CR4 value. -// -//go:nosplit -func (c *CPU) CR4() uint64 { - cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT) - if hasPCID { - cr4 |= _CR4_PCIDE - } - if hasXSAVE { - cr4 |= _CR4_OSXSAVE - } - if hasSMEP { - cr4 |= _CR4_SMEP - } - if hasFSGSBASE { - cr4 |= _CR4_FSGSBASE - } - return cr4 -} - -// EFER returns the CPU's EFER value. -// -//go:nosplit -func (c *CPU) EFER() uint64 { - return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX -} - -// IsCanonical indicates whether addr is canonical per the amd64 spec. -// -//go:nosplit -func IsCanonical(addr uint64) bool { - return addr <= 0x00007fffffffffff || addr > 0xffff800000000000 -} - -// SwitchToUser performs either a sysret or an iret. -// -// The return value is the vector that interrupted execution. -// -// This function will not split the stack. Callers will probably want to call -// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to -// calling this function. -// -// When this is done, this region is quite sensitive to things like system -// calls. After calling entersyscall, any memory used must have been allocated -// and no function calls without go:nosplit are permitted. Any calls made here -// are protected appropriately (e.g. IsCanonical and CR3). -// -// Also note that this function transitively depends on the compiler generating -// code that uses IP-relative addressing inside of absolute addresses. That's -// the case for amd64, but may not be the case for other architectures. -// -// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. -// -// +checkescape:all -// -//go:nosplit -func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { - userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) - c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) - - // Sanitize registers. - regs := switchOpts.Registers - regs.Eflags &= ^uint64(UserFlagsClear) - regs.Eflags |= UserFlagsSet - regs.Cs = uint64(Ucode64) // Required for iret. - regs.Ss = uint64(Udata) // Ditto. - - // Perform the switch. - swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. - WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. - LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. - if switchOpts.FullRestore { - vector = iret(c, regs, uintptr(userCR3)) - } else { - vector = sysret(c, regs, uintptr(userCR3)) - } - SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. - return -} - -// start is the CPU entrypoint. -// -// This is called from the Start asm stub (see entry_amd64.go); on return the -// registers in c.registers will be restored (not segments). -// -//go:nosplit -func start(c *CPU) { - // Save per-cpu & FS segment. - WriteGS(kernelAddr(c.kernelEntry)) - WriteFS(uintptr(c.registers.Fs_base)) - - // Initialize floating point. - // - // Note that on skylake, the valid XCR0 mask reported seems to be 0xff. - // This breaks down as: - // - // bit0 - x87 - // bit1 - SSE - // bit2 - AVX - // bit3-4 - MPX - // bit5-7 - AVX512 - // - // For some reason, enabled MPX & AVX512 on platforms that report them - // seems to be cause a general protection fault. (Maybe there are some - // virtualization issues and these aren't exported to the guest cpuid.) - // This needs further investigation, but we can limit the floating - // point operations to x87, SSE & AVX for now. - fninit() - xsetbv(0, validXCR0Mask&0x7) - - // Set the syscall target. - wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) - wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) - - // NOTE: This depends on having the 64-bit segments immediately - // following the 32-bit user segments. This is simply the way the - // sysret instruction is designed to work (it assumes they follow). - wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) - wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) -} - -// SetCPUIDFaulting sets CPUID faulting per the boolean value. -// -// True is returned if faulting could be set. -// -//go:nosplit -func SetCPUIDFaulting(on bool) bool { - // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support - // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR. - if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 { - features := rdmsr(_MSR_MISC_FEATURES) - if on { - features |= _MISC_FEATURE_CPUID_TRAP - } else { - features &^= _MISC_FEATURE_CPUID_TRAP - } - wrmsr(_MSR_MISC_FEATURES, features) - return true // Setting successful. - } - return false -} - -// ReadCR2 reads the current CR2 value. -// -//go:nosplit -func ReadCR2() uintptr { - return readCR2() -} diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go deleted file mode 100644 index c05284641..000000000 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -// HaltAndResume halts execution and point the pointer to the resume function. -//go:nosplit -func HaltAndResume() - -// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume. -//go:nosplit -func HaltEl1SvcAndResume() - -// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. -//go:nosplit -func HaltEl1ExceptionAndResume() - -// init initializes architecture-specific state. -func (k *Kernel) init(maxCPUs int) { -} - -// init initializes architecture-specific state. -func (c *CPU) init(cpuID int) { - // Set the kernel stack pointer(virtual address). - c.registers.Sp = uint64(c.StackTop()) - -} - -// StackTop returns the kernel's stack address. -// -//go:nosplit -func (c *CPU) StackTop() uint64 { - return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) -} - -// IsCanonical indicates whether addr is canonical per the arm64 spec. -// -//go:nosplit -func IsCanonical(addr uint64) bool { - return addr <= 0x0000ffffffffffff || addr > 0xffff000000000000 -} - -// SwitchToUser performs an eret. -// -// The return value is the exception vector. -// -// +checkescape:all -// -//go:nosplit -func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { - storeAppASID(uintptr(switchOpts.UserASID)) - if switchOpts.Flush { - FlushTlbByASID(uintptr(switchOpts.UserASID)) - } - - regs := switchOpts.Registers - - regs.Pstate &= ^uint64(PsrFlagsClear) - regs.Pstate |= UserFlagsSet - - EnableVFP() - LoadFloatingPoint(switchOpts.FloatingPointState) - - kernelExitToEl0() - - SaveFloatingPoint(switchOpts.FloatingPointState) - DisableVFP() - - vector = c.vecCode - - return -} diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go deleted file mode 100644 index 16955ad91..000000000 --- a/pkg/sentry/platform/ring0/kernel_unsafe.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ring0 - -import ( - "unsafe" -) - -// eface mirrors runtime.eface. -type eface struct { - typ uintptr - data unsafe.Pointer -} - -// kernelAddr returns the kernel virtual address for the given object. -// -//go:nosplit -func kernelAddr(obj interface{}) uintptr { - e := (*eface)(unsafe.Pointer(&obj)) - return KernelStartAddress | uintptr(e.data) -} - -// kernelFunc returns the address of the given function. -// -//go:nosplit -func kernelFunc(fn func()) uintptr { - fnptr := (**uintptr)(unsafe.Pointer(&fn)) - return KernelStartAddress | **fnptr -} diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go deleted file mode 100644 index 0ec5c3bc5..000000000 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/cpuid" -) - -// LoadFloatingPoint loads floating point state by the most efficient mechanism -// available (set by Init). -var LoadFloatingPoint func(*byte) - -// SaveFloatingPoint saves floating point state by the most efficient mechanism -// available (set by Init). -var SaveFloatingPoint func(*byte) - -// fxrstor uses fxrstor64 to load floating point state. -func fxrstor(*byte) - -// xrstor uses xrstor to load floating point state. -func xrstor(*byte) - -// fxsave uses fxsave64 to save floating point state. -func fxsave(*byte) - -// xsave uses xsave to save floating point state. -func xsave(*byte) - -// xsaveopt uses xsaveopt to save floating point state. -func xsaveopt(*byte) - -// WriteFS sets the GS address (set by init). -var WriteFS func(addr uintptr) - -// wrfsbase writes to the GS base address. -func wrfsbase(addr uintptr) - -// wrfsmsr writes to the GS_BASE MSR. -func wrfsmsr(addr uintptr) - -// WriteGS sets the GS address (set by init). -var WriteGS func(addr uintptr) - -// wrgsbase writes to the GS base address. -func wrgsbase(addr uintptr) - -// wrgsmsr writes to the GS_BASE MSR. -func wrgsmsr(addr uintptr) - -// readCR2 reads the current CR2 value. -func readCR2() uintptr - -// fninit initializes the floating point unit. -func fninit() - -// xsetbv writes to an extended control register. -func xsetbv(reg, value uintptr) - -// xgetbv reads an extended control register. -func xgetbv(reg uintptr) uintptr - -// wrmsr reads to the given MSR. -func wrmsr(reg, value uintptr) - -// rdmsr reads the given MSR. -func rdmsr(reg uintptr) uintptr - -// Mostly-constants set by Init. -var ( - hasSMEP bool - hasPCID bool - hasXSAVEOPT bool - hasXSAVE bool - hasFSGSBASE bool - validXCR0Mask uintptr -) - -// Init sets function pointers based on architectural features. -// -// This must be called prior to using ring0. -func Init(featureSet *cpuid.FeatureSet) { - hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP) - hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID) - hasXSAVEOPT = featureSet.UseXsaveopt() - hasXSAVE = featureSet.UseXsave() - hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase) - validXCR0Mask = uintptr(featureSet.ValidXCR0Mask()) - if hasXSAVEOPT { - SaveFloatingPoint = xsaveopt - LoadFloatingPoint = xrstor - } else if hasXSAVE { - SaveFloatingPoint = xsave - LoadFloatingPoint = xrstor - } else { - SaveFloatingPoint = fxsave - LoadFloatingPoint = fxrstor - } - if hasFSGSBASE { - WriteFS = wrfsbase - WriteGS = wrgsbase - } else { - WriteFS = wrfsmsr - WriteGS = wrgsmsr - } -} diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s deleted file mode 100644 index 2fe83568a..000000000 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "funcdata.h" -#include "textflag.h" - -// fxrstor loads floating point state. -// -// The code corresponds to: -// -// fxrstor64 (%rbx) -// -TEXT ·fxrstor(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), BX - MOVL $0xffffffff, AX - MOVL $0xffffffff, DX - BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b; - RET - -// xrstor loads floating point state. -// -// The code corresponds to: -// -// xrstor (%rdi) -// -TEXT ·xrstor(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), DI - MOVL $0xffffffff, AX - MOVL $0xffffffff, DX - BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f; - RET - -// fxsave saves floating point state. -// -// The code corresponds to: -// -// fxsave64 (%rbx) -// -TEXT ·fxsave(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), BX - MOVL $0xffffffff, AX - MOVL $0xffffffff, DX - BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03; - RET - -// xsave saves floating point state. -// -// The code corresponds to: -// -// xsave (%rdi) -// -TEXT ·xsave(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), DI - MOVL $0xffffffff, AX - MOVL $0xffffffff, DX - BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; - RET - -// xsaveopt saves floating point state. -// -// The code corresponds to: -// -// xsaveopt (%rdi) -// -TEXT ·xsaveopt(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), DI - MOVL $0xffffffff, AX - MOVL $0xffffffff, DX - BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; - RET - -// wrfsbase writes to the FS base. -// -// The code corresponds to: -// -// wrfsbase %rax -// -TEXT ·wrfsbase(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0; - RET - -// wrfsmsr writes to the FSBASE MSR. -// -// The code corresponds to: -// -// wrmsr (writes EDX:EAX to the MSR in ECX) -// -TEXT ·wrfsmsr(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - MOVQ AX, DX - SHRQ $32, DX - MOVQ $0xc0000100, CX // MSR_FS_BASE - BYTE $0x0f; BYTE $0x30; - RET - -// wrgsbase writes to the GS base. -// -// The code corresponds to: -// -// wrgsbase %rax -// -TEXT ·wrgsbase(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8; - RET - -// wrgsmsr writes to the GSBASE MSR. -// -// See wrfsmsr. -TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - MOVQ AX, DX - SHRQ $32, DX - MOVQ $0xc0000101, CX // MSR_GS_BASE - BYTE $0x0f; BYTE $0x30; // WRMSR - RET - -// readCR2 reads the current CR2 value. -// -// The code corresponds to: -// -// mov %cr2, %rax -// -TEXT ·readCR2(SB),NOSPLIT,$0-8 - BYTE $0x0f; BYTE $0x20; BYTE $0xd0; - MOVQ AX, ret+0(FP) - RET - -// fninit initializes the floating point unit. -// -// The code corresponds to: -// -// fninit -TEXT ·fninit(SB),NOSPLIT,$0 - BYTE $0xdb; BYTE $0xe3; - RET - -// xsetbv writes to an extended control register. -// -// The code corresponds to: -// -// xsetbv -// -TEXT ·xsetbv(SB),NOSPLIT,$0-16 - MOVL reg+0(FP), CX - MOVL value+8(FP), AX - MOVL value+12(FP), DX - BYTE $0x0f; BYTE $0x01; BYTE $0xd1; - RET - -// xgetbv reads an extended control register. -// -// The code corresponds to: -// -// xgetbv -// -TEXT ·xgetbv(SB),NOSPLIT,$0-16 - MOVL reg+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0; - MOVL AX, ret+8(FP) - MOVL DX, ret+12(FP) - RET - -// wrmsr writes to a control register. -// -// The code corresponds to: -// -// wrmsr -// -TEXT ·wrmsr(SB),NOSPLIT,$0-16 - MOVL reg+0(FP), CX - MOVL value+8(FP), AX - MOVL value+12(FP), DX - BYTE $0x0f; BYTE $0x30; - RET - -// rdmsr reads a control register. -// -// The code corresponds to: -// -// rdmsr -// -TEXT ·rdmsr(SB),NOSPLIT,$0-16 - MOVL reg+0(FP), CX - BYTE $0x0f; BYTE $0x32; - MOVL AX, ret+8(FP) - MOVL DX, ret+12(FP) - RET diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go deleted file mode 100644 index a490bf3af..000000000 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -// storeAppASID writes the application's asid value. -func storeAppASID(asid uintptr) - -// LocalFlushTlbAll same as FlushTlbAll, but only applies to the calling CPU. -func LocalFlushTlbAll() - -// FlushTlbByVA invalidates tlb by VA/Last-level/Inner-Shareable. -func FlushTlbByVA(addr uintptr) - -// FlushTlbByASID invalidates tlb by ASID/Inner-Shareable. -func FlushTlbByASID(asid uintptr) - -// FlushTlbAll invalidates all tlb. -func FlushTlbAll() - -// CPACREL1 returns the value of the CPACR_EL1 register. -func CPACREL1() (value uintptr) - -// GetFPCR returns the value of FPCR register. -func GetFPCR() (value uintptr) - -// SetFPCR writes the FPCR value. -func SetFPCR(value uintptr) - -// GetFPSR returns the value of FPSR register. -func GetFPSR() (value uintptr) - -// SetFPSR writes the FPSR value. -func SetFPSR(value uintptr) - -// SaveVRegs saves V0-V31 registers. -// V0-V31: 32 128-bit registers for floating point and simd. -func SaveVRegs(*byte) - -// LoadVRegs loads V0-V31 registers. -func LoadVRegs(*byte) - -// LoadFloatingPoint loads floating point state. -func LoadFloatingPoint(*byte) - -// SaveFloatingPoint saves floating point state. -func SaveFloatingPoint(*byte) - -// EnableVFP enables fpsimd. -func EnableVFP() - -// DisableVFP disables fpsimd. -func DisableVFP() - -// Init sets function pointers based on architectural features. -// -// This must be called prior to using ring0. -func Init() {} diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s deleted file mode 100644 index e39b32841..000000000 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "funcdata.h" -#include "textflag.h" - -#define TLBI_ASID_SHIFT 48 - -TEXT ·FlushTlbByVA(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R1 - DSB $10 // dsb(ishst) - WORD $0xd50883a1 // tlbi vale1is, x1 - DSB $11 // dsb(ish) - RET - -TEXT ·FlushTlbByASID(SB),NOSPLIT,$0-8 - MOVD asid+0(FP), R1 - LSL $TLBI_ASID_SHIFT, R1, R1 - DSB $10 // dsb(ishst) - WORD $0xd5088341 // tlbi aside1is, x1 - DSB $11 // dsb(ish) - RET - -TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0 - DSB $6 // dsb(nshst) - WORD $0xd508871f // __tlbi(vmalle1) - DSB $7 // dsb(nsh) - ISB $15 - RET - -TEXT ·FlushTlbAll(SB),NOSPLIT,$0 - DSB $10 // dsb(ishst) - WORD $0xd508831f // __tlbi(vmalle1is) - DSB $11 // dsb(ish) - ISB $15 - RET - -TEXT ·CPACREL1(SB),NOSPLIT,$0-8 - WORD $0xd5381041 // MRS CPACR_EL1, R1 - MOVD R1, ret+0(FP) - RET - -TEXT ·GetFPCR(SB),NOSPLIT,$0-8 - MOVD FPCR, R1 - MOVD R1, ret+0(FP) - RET - -TEXT ·GetFPSR(SB),NOSPLIT,$0-8 - MOVD FPSR, R1 - MOVD R1, ret+0(FP) - RET - -TEXT ·SetFPCR(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R1 - MOVD R1, FPCR - RET - -TEXT ·SetFPSR(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R1 - MOVD R1, FPSR - RET - -TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R0 - - // Skip aarch64_ctx, fpsr, fpcr. - ADD $16, R0, R0 - - WORD $0xad000400 // stp q0, q1, [x0] - WORD $0xad010c02 // stp q2, q3, [x0, #32] - WORD $0xad021404 // stp q4, q5, [x0, #64] - WORD $0xad031c06 // stp q6, q7, [x0, #96] - WORD $0xad042408 // stp q8, q9, [x0, #128] - WORD $0xad052c0a // stp q10, q11, [x0, #160] - WORD $0xad06340c // stp q12, q13, [x0, #192] - WORD $0xad073c0e // stp q14, q15, [x0, #224] - WORD $0xad084410 // stp q16, q17, [x0, #256] - WORD $0xad094c12 // stp q18, q19, [x0, #288] - WORD $0xad0a5414 // stp q20, q21, [x0, #320] - WORD $0xad0b5c16 // stp q22, q23, [x0, #352] - WORD $0xad0c6418 // stp q24, q25, [x0, #384] - WORD $0xad0d6c1a // stp q26, q27, [x0, #416] - WORD $0xad0e741c // stp q28, q29, [x0, #448] - WORD $0xad0f7c1e // stp q30, q31, [x0, #480] - - RET - -TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R0 - - // Skip aarch64_ctx, fpsr, fpcr. - ADD $16, R0, R0 - - WORD $0xad400400 // ldp q0, q1, [x0] - WORD $0xad410c02 // ldp q2, q3, [x0, #32] - WORD $0xad421404 // ldp q4, q5, [x0, #64] - WORD $0xad431c06 // ldp q6, q7, [x0, #96] - WORD $0xad442408 // ldp q8, q9, [x0, #128] - WORD $0xad452c0a // ldp q10, q11, [x0, #160] - WORD $0xad46340c // ldp q12, q13, [x0, #192] - WORD $0xad473c0e // ldp q14, q15, [x0, #224] - WORD $0xad484410 // ldp q16, q17, [x0, #256] - WORD $0xad494c12 // ldp q18, q19, [x0, #288] - WORD $0xad4a5414 // ldp q20, q21, [x0, #320] - WORD $0xad4b5c16 // ldp q22, q23, [x0, #352] - WORD $0xad4c6418 // ldp q24, q25, [x0, #384] - WORD $0xad4d6c1a // ldp q26, q27, [x0, #416] - WORD $0xad4e741c // ldp q28, q29, [x0, #448] - WORD $0xad4f7c1e // ldp q30, q31, [x0, #480] - - RET - -TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R0 - - MOVD 0(R0), R1 - MOVD R1, FPSR - MOVD 8(R0), R1 - MOVD R1, FPCR - - ADD $16, R0, R0 - - WORD $0xad400400 // ldp q0, q1, [x0] - WORD $0xad410c02 // ldp q2, q3, [x0, #32] - WORD $0xad421404 // ldp q4, q5, [x0, #64] - WORD $0xad431c06 // ldp q6, q7, [x0, #96] - WORD $0xad442408 // ldp q8, q9, [x0, #128] - WORD $0xad452c0a // ldp q10, q11, [x0, #160] - WORD $0xad46340c // ldp q12, q13, [x0, #192] - WORD $0xad473c0e // ldp q14, q15, [x0, #224] - WORD $0xad484410 // ldp q16, q17, [x0, #256] - WORD $0xad494c12 // ldp q18, q19, [x0, #288] - WORD $0xad4a5414 // ldp q20, q21, [x0, #320] - WORD $0xad4b5c16 // ldp q22, q23, [x0, #352] - WORD $0xad4c6418 // ldp q24, q25, [x0, #384] - WORD $0xad4d6c1a // ldp q26, q27, [x0, #416] - WORD $0xad4e741c // ldp q28, q29, [x0, #448] - WORD $0xad4f7c1e // ldp q30, q31, [x0, #480] - - RET - -TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R0 - - MOVD FPSR, R1 - MOVD R1, 0(R0) - MOVD FPCR, R1 - MOVD R1, 8(R0) - - ADD $16, R0, R0 - - WORD $0xad000400 // stp q0, q1, [x0] - WORD $0xad010c02 // stp q2, q3, [x0, #32] - WORD $0xad021404 // stp q4, q5, [x0, #64] - WORD $0xad031c06 // stp q6, q7, [x0, #96] - WORD $0xad042408 // stp q8, q9, [x0, #128] - WORD $0xad052c0a // stp q10, q11, [x0, #160] - WORD $0xad06340c // stp q12, q13, [x0, #192] - WORD $0xad073c0e // stp q14, q15, [x0, #224] - WORD $0xad084410 // stp q16, q17, [x0, #256] - WORD $0xad094c12 // stp q18, q19, [x0, #288] - WORD $0xad0a5414 // stp q20, q21, [x0, #320] - WORD $0xad0b5c16 // stp q22, q23, [x0, #352] - WORD $0xad0c6418 // stp q24, q25, [x0, #384] - WORD $0xad0d6c1a // stp q26, q27, [x0, #416] - WORD $0xad0e741c // stp q28, q29, [x0, #448] - WORD $0xad0f7c1e // stp q30, q31, [x0, #480] - - RET diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go deleted file mode 100644 index ca4075b09..000000000 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "fmt" - "io" - "reflect" - - "gvisor.dev/gvisor/pkg/sentry/arch" -) - -// Emit prints architecture-specific offsets. -func Emit(w io.Writer) { - fmt.Fprintf(w, "// Automatically generated, do not edit.\n") - - c := &CPU{} - fmt.Fprintf(w, "\n// CPU offsets.\n") - fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) - - e := &kernelEntry{} - fmt.Fprintf(w, "\n// CPU entry offsets.\n") - fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer()) - fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer()) - fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer()) - fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer()) - - fmt.Fprintf(w, "\n// Bits.\n") - fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) - fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0) - fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) - - fmt.Fprintf(w, "\n// Vectors.\n") - fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) - fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug) - fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI) - fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint) - fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow) - fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded) - fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode) - fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable) - fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault) - fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun) - fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS) - fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent) - fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault) - fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault) - fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) - fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException) - fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck) - fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck) - fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException) - fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) - fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException) - fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) - fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) - - p := &arch.Registers{} - fmt.Fprintf(w, "\n// Ptrace registers.\n") - fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) -} diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go deleted file mode 100644 index 164db6d5a..000000000 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -import ( - "fmt" - "io" - "reflect" - - "gvisor.dev/gvisor/pkg/sentry/arch" -) - -// Emit prints architecture-specific offsets. -func Emit(w io.Writer) { - fmt.Fprintf(w, "// Automatically generated, do not edit.\n") - - c := &CPU{} - fmt.Fprintf(w, "\n// CPU offsets.\n") - fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) - fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_FAULT_ADDR 0x%02x\n", reflect.ValueOf(&c.faultAddr).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_TTBR0_KVM 0x%02x\n", reflect.ValueOf(&c.ttbr0Kvm).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_TTBR0_APP 0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_APP_ASID 0x%02x\n", reflect.ValueOf(&c.appASID).Pointer()-reflect.ValueOf(c).Pointer()) - - fmt.Fprintf(w, "\n// Bits.\n") - fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) - - fmt.Fprintf(w, "\n// Vectors.\n") - - fmt.Fprintf(w, "#define El1Sync 0x%02x\n", El1Sync) - fmt.Fprintf(w, "#define El1Irq 0x%02x\n", El1Irq) - fmt.Fprintf(w, "#define El1Fiq 0x%02x\n", El1Fiq) - fmt.Fprintf(w, "#define El1Err 0x%02x\n", El1Err) - - fmt.Fprintf(w, "#define El0Sync 0x%02x\n", El0Sync) - fmt.Fprintf(w, "#define El0Irq 0x%02x\n", El0Irq) - fmt.Fprintf(w, "#define El0Fiq 0x%02x\n", El0Fiq) - fmt.Fprintf(w, "#define El0Err 0x%02x\n", El0Err) - - fmt.Fprintf(w, "#define El1SyncDa 0x%02x\n", El1SyncDa) - fmt.Fprintf(w, "#define El1SyncIa 0x%02x\n", El1SyncIa) - fmt.Fprintf(w, "#define El1SyncSpPc 0x%02x\n", El1SyncSpPc) - fmt.Fprintf(w, "#define El1SyncUndef 0x%02x\n", El1SyncUndef) - fmt.Fprintf(w, "#define El1SyncDbg 0x%02x\n", El1SyncDbg) - fmt.Fprintf(w, "#define El1SyncInv 0x%02x\n", El1SyncInv) - - fmt.Fprintf(w, "#define El0SyncSVC 0x%02x\n", El0SyncSVC) - fmt.Fprintf(w, "#define El0SyncDa 0x%02x\n", El0SyncDa) - fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa) - fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc) - fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc) - fmt.Fprintf(w, "#define El0SyncFpsimdExc 0x%02x\n", El0SyncFpsimdExc) - fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys) - fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc) - fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef) - fmt.Fprintf(w, "#define El0SyncDbg 0x%02x\n", El0SyncDbg) - fmt.Fprintf(w, "#define El0SyncWfx 0x%02x\n", El0SyncWfx) - fmt.Fprintf(w, "#define El0SyncInv 0x%02x\n", El0SyncInv) - - fmt.Fprintf(w, "#define El0ErrNMI 0x%02x\n", El0ErrNMI) - - fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) - fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) - fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) - - p := &arch.Registers{} - fmt.Fprintf(w, "\n// Ptrace registers.\n") - fmt.Fprintf(w, "#define PTRACE_R0 0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R1 0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R2 0x%02x\n", reflect.ValueOf(&p.Regs[2]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R3 0x%02x\n", reflect.ValueOf(&p.Regs[3]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R4 0x%02x\n", reflect.ValueOf(&p.Regs[4]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R5 0x%02x\n", reflect.ValueOf(&p.Regs[5]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R6 0x%02x\n", reflect.ValueOf(&p.Regs[6]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R7 0x%02x\n", reflect.ValueOf(&p.Regs[7]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.Regs[8]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.Regs[9]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.Regs[10]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.Regs[11]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.Regs[12]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.Regs[13]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.Regs[14]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.Regs[15]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R16 0x%02x\n", reflect.ValueOf(&p.Regs[16]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R17 0x%02x\n", reflect.ValueOf(&p.Regs[17]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R18 0x%02x\n", reflect.ValueOf(&p.Regs[18]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R19 0x%02x\n", reflect.ValueOf(&p.Regs[19]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R20 0x%02x\n", reflect.ValueOf(&p.Regs[20]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R21 0x%02x\n", reflect.ValueOf(&p.Regs[21]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R22 0x%02x\n", reflect.ValueOf(&p.Regs[22]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R23 0x%02x\n", reflect.ValueOf(&p.Regs[23]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R24 0x%02x\n", reflect.ValueOf(&p.Regs[24]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R25 0x%02x\n", reflect.ValueOf(&p.Regs[25]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R26 0x%02x\n", reflect.ValueOf(&p.Regs[26]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R27 0x%02x\n", reflect.ValueOf(&p.Regs[27]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R28 0x%02x\n", reflect.ValueOf(&p.Regs[28]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R29 0x%02x\n", reflect.ValueOf(&p.Regs[29]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R30 0x%02x\n", reflect.ValueOf(&p.Regs[30]).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_TLS 0x%02x\n", reflect.ValueOf(&p.TPIDR_EL0).Pointer()-reflect.ValueOf(p).Pointer()) -} diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD deleted file mode 100644 index 9e3539e4c..000000000 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ /dev/null @@ -1,84 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") - -package(licenses = ["notice"]) - -[ - # These files are tagged with relevant build architectures. We can always - # build all the input files, which will be included only in the relevant - # architecture builds. - go_template( - name = "generic_walker_%s" % arch, - srcs = ["walker_%s.go" % arch], - opt_types = [ - "Visitor", - ], - visibility = [":__pkg__"], - ) - for arch in ("amd64", "arm64") -] - -[ - # See above. - go_template_instance( - name = "walker_%s_%s" % (op, arch), - out = "walker_%s_%s.go" % (op, arch), - package = "pagetables", - prefix = op, - template = ":generic_walker_%s" % arch, - types = { - "Visitor": "%sVisitor" % op, - }, - ) - for op in ("map", "unmap", "lookup", "empty", "check") - for arch in ("amd64", "arm64") -] - -go_library( - name = "pagetables", - srcs = [ - "allocator.go", - "allocator_unsafe.go", - "pagetables.go", - "pagetables_aarch64.go", - "pagetables_amd64.go", - "pagetables_arm64.go", - "pagetables_x86.go", - "pcids.go", - "pcids_aarch64.go", - "pcids_aarch64.s", - "pcids_x86.go", - "walker_amd64.go", - "walker_arm64.go", - ":walker_empty_amd64", - ":walker_empty_arm64", - ":walker_lookup_amd64", - ":walker_lookup_arm64", - ":walker_map_amd64", - ":walker_map_arm64", - ":walker_unmap_amd64", - ":walker_unmap_arm64", - ], - visibility = [ - "//pkg/sentry/platform/kvm:__subpackages__", - "//pkg/sentry/platform/ring0:__subpackages__", - ], - deps = [ - "//pkg/sync", - "//pkg/usermem", - ], -) - -go_test( - name = "pagetables_test", - size = "small", - srcs = [ - "pagetables_amd64_test.go", - "pagetables_arm64_test.go", - "pagetables_test.go", - ":walker_check_amd64", - ":walker_check_arm64", - ], - library = ":pagetables", - deps = ["//pkg/usermem"], -) diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go deleted file mode 100644 index 8d75b7599..000000000 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -// Allocator is used to allocate and map PTEs. -// -// Note that allocators may be called concurrently. -type Allocator interface { - // NewPTEs returns a new set of PTEs and their physical address. - NewPTEs() *PTEs - - // PhysicalFor gives the physical address for a set of PTEs. - PhysicalFor(ptes *PTEs) uintptr - - // LookupPTEs looks up PTEs by physical address. - LookupPTEs(physical uintptr) *PTEs - - // FreePTEs marks a set of PTEs a freed, although they may not be available - // for use again until Recycle is called, below. - FreePTEs(ptes *PTEs) - - // Recycle makes freed PTEs available for use again. - Recycle() -} - -// RuntimeAllocator is a trivial allocator. -type RuntimeAllocator struct { - // used is the set of PTEs that have been allocated. This includes any - // PTEs that may be in the pool below. PTEs are only freed from this - // map by the Drain call. - // - // This exists to prevent accidental garbage collection. - used map[*PTEs]struct{} - - // pool is the set of free-to-use PTEs. - pool []*PTEs - - // freed is the set of recently-freed PTEs. - freed []*PTEs -} - -// NewRuntimeAllocator returns an allocator that uses runtime allocation. -func NewRuntimeAllocator() *RuntimeAllocator { - r := new(RuntimeAllocator) - r.Init() - return r -} - -// Init initializes a RuntimeAllocator. -func (r *RuntimeAllocator) Init() { - r.used = make(map[*PTEs]struct{}) -} - -// Recycle returns freed pages to the pool. -func (r *RuntimeAllocator) Recycle() { - r.pool = append(r.pool, r.freed...) - r.freed = r.freed[:0] -} - -// Drain empties the pool. -func (r *RuntimeAllocator) Drain() { - r.Recycle() - for i, ptes := range r.pool { - // Zap the entry in the underlying array to ensure that it can - // be properly garbage collected. - r.pool[i] = nil - // Similarly, free the reference held by the used map (these - // also apply for the pool entries). - delete(r.used, ptes) - } - r.pool = r.pool[:0] -} - -// NewPTEs implements Allocator.NewPTEs. -// -// Note that the "physical" address here is actually the virtual address of the -// PTEs structure. The entries are tracked only to avoid garbage collection. -// -// This is guaranteed not to split as long as the pool is sufficiently full. -// -//go:nosplit -func (r *RuntimeAllocator) NewPTEs() *PTEs { - // Pull from the pool if we can. - if len(r.pool) > 0 { - ptes := r.pool[len(r.pool)-1] - r.pool = r.pool[:len(r.pool)-1] - return ptes - } - - // Allocate a new entry. - ptes := newAlignedPTEs() - r.used[ptes] = struct{}{} - return ptes -} - -// PhysicalFor returns the physical address for the given PTEs. -// -//go:nosplit -func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr { - return physicalFor(ptes) -} - -// LookupPTEs implements Allocator.LookupPTEs. -// -//go:nosplit -func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs { - return fromPhysical(physical) -} - -// FreePTEs implements Allocator.FreePTEs. -// -//go:nosplit -func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) { - r.freed = append(r.freed, ptes) -} diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go deleted file mode 100644 index d08bfdeb3..000000000 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -import ( - "unsafe" - - "gvisor.dev/gvisor/pkg/usermem" -) - -// newAlignedPTEs returns a set of aligned PTEs. -func newAlignedPTEs() *PTEs { - ptes := new(PTEs) - offset := physicalFor(ptes) & (usermem.PageSize - 1) - if offset == 0 { - // Already aligned. - return ptes - } - - // Need to force an aligned allocation. - unaligned := make([]byte, (2*usermem.PageSize)-1) - offset = uintptr(unsafe.Pointer(&unaligned[0])) & (usermem.PageSize - 1) - if offset != 0 { - offset = usermem.PageSize - offset - } - return (*PTEs)(unsafe.Pointer(&unaligned[offset])) -} - -// physicalFor returns the "physical" address for PTEs. -// -//go:nosplit -func physicalFor(ptes *PTEs) uintptr { - return uintptr(unsafe.Pointer(ptes)) -} - -// fromPhysical returns the PTEs from the "physical" address. -// -//go:nosplit -func fromPhysical(physical uintptr) *PTEs { - return (*PTEs)(unsafe.Pointer(physical)) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go deleted file mode 100644 index 7605d0cb2..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package pagetables provides a generic implementation of pagetables. -// -// The core functions must be safe to call from a nosplit context. Furthermore, -// this pagetables implementation goes to lengths to ensure that all functions -// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made -// during walks, but these can be cached elsewhere if required. -package pagetables - -import ( - "gvisor.dev/gvisor/pkg/usermem" -) - -// PageTables is a set of page tables. -type PageTables struct { - // Allocator is used to allocate nodes. - Allocator Allocator - - // root is the pagetable root. - // - // For same archs such as amd64, the upper of the PTEs is cloned - // from and owned by upperSharedPageTables which are shared among - // many PageTables if upperSharedPageTables is not nil. - root *PTEs - - // rootPhysical is the cached physical address of the root. - // - // This is saved only to prevent constant translation. - rootPhysical uintptr - - // archPageTables includes architecture-specific features. - archPageTables - - // upperSharedPageTables represents a read-only shared upper - // of the Pagetable. When it is not nil, the upper is not - // allowed to be modified. - upperSharedPageTables *PageTables - - // upperStart is the start address of the upper portion that - // are shared from upperSharedPageTables - upperStart uintptr - - // readOnlyShared indicates the Pagetables are read-only and - // own the ranges that are shared with other Pagetables. - readOnlyShared bool -} - -// Init initializes a set of PageTables. -// -//go:nosplit -func (p *PageTables) Init(allocator Allocator) { - p.Allocator = allocator - p.root = p.Allocator.NewPTEs() - p.rootPhysical = p.Allocator.PhysicalFor(p.root) -} - -// NewWithUpper returns new PageTables. -// -// upperSharedPageTables are used for mapping the upper of addresses, -// starting at upperStart. These pageTables should not be touched (as -// invalidations may be incorrect) after they are passed as an -// upperSharedPageTables. Only when all dependent PageTables are gone -// may they be used. The intenteded use case is for kernel page tables, -// which are static and fixed. -// -// Precondition: upperStart must be between canonical ranges. -// Precondition: upperStart must be pgdSize aligned. -// precondition: upperSharedPageTables must be marked read-only shared. -func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables { - p := new(PageTables) - p.Init(a) - - if upperSharedPageTables != nil { - if !upperSharedPageTables.readOnlyShared { - panic("Only read-only shared pagetables can be used as upper") - } - p.upperSharedPageTables = upperSharedPageTables - p.upperStart = upperStart - } - - p.InitArch(a) - - return p -} - -// New returns new PageTables. -func New(a Allocator) *PageTables { - return NewWithUpper(a, nil, 0) -} - -// mapVisitor is used for map. -type mapVisitor struct { - target uintptr // Input. - physical uintptr // Input. - opts MapOpts // Input. - prev bool // Output. -} - -// visit is used for map. -// -//go:nosplit -func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) { - p := v.physical + (start - uintptr(v.target)) - if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { - v.prev = true - } - if p&align != 0 { - // We will install entries at a smaller granulaity if we don't - // install a valid entry here, however we must zap any existing - // entry to ensure this happens. - pte.Clear() - return - } - pte.Set(p, v.opts) -} - -//go:nosplit -func (*mapVisitor) requiresAlloc() bool { return true } - -//go:nosplit -func (*mapVisitor) requiresSplit() bool { return true } - -// Map installs a mapping with the given physical address. -// -// True is returned iff there was a previous mapping in the range. -// -// Precondition: addr & length must be page-aligned, their sum must not overflow. -// -// +checkescape:hard,stack -// -//go:nosplit -func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { - if p.readOnlyShared { - panic("Should not modify read-only shared pagetables.") - } - if uintptr(addr)+length < uintptr(addr) { - panic("addr & length overflow") - } - if p.upperSharedPageTables != nil { - // ignore change to the read-only upper shared portion. - if uintptr(addr) >= p.upperStart { - return false - } - if uintptr(addr)+length > p.upperStart { - length = p.upperStart - uintptr(addr) - } - } - if !opts.AccessType.Any() { - return p.Unmap(addr, length) - } - w := mapWalker{ - pageTables: p, - visitor: mapVisitor{ - target: uintptr(addr), - physical: physical, - opts: opts, - }, - } - w.iterateRange(uintptr(addr), uintptr(addr)+length) - return w.visitor.prev -} - -// unmapVisitor is used for unmap. -type unmapVisitor struct { - count int -} - -//go:nosplit -func (*unmapVisitor) requiresAlloc() bool { return false } - -//go:nosplit -func (*unmapVisitor) requiresSplit() bool { return true } - -// visit unmaps the given entry. -// -//go:nosplit -func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) { - pte.Clear() - v.count++ -} - -// Unmap unmaps the given range. -// -// True is returned iff there was a previous mapping in the range. -// -// Precondition: addr & length must be page-aligned, their sum must not overflow. -// -// +checkescape:hard,stack -// -//go:nosplit -func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { - if p.readOnlyShared { - panic("Should not modify read-only shared pagetables.") - } - if uintptr(addr)+length < uintptr(addr) { - panic("addr & length overflow") - } - if p.upperSharedPageTables != nil { - // ignore change to the read-only upper shared portion. - if uintptr(addr) >= p.upperStart { - return false - } - if uintptr(addr)+length > p.upperStart { - length = p.upperStart - uintptr(addr) - } - } - w := unmapWalker{ - pageTables: p, - visitor: unmapVisitor{ - count: 0, - }, - } - w.iterateRange(uintptr(addr), uintptr(addr)+length) - return w.visitor.count > 0 -} - -// emptyVisitor is used for emptiness checks. -type emptyVisitor struct { - count int -} - -//go:nosplit -func (*emptyVisitor) requiresAlloc() bool { return false } - -//go:nosplit -func (*emptyVisitor) requiresSplit() bool { return false } - -// visit unmaps the given entry. -// -//go:nosplit -func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) { - v.count++ -} - -// IsEmpty checks if the given range is empty. -// -// Precondition: addr & length must be page-aligned. -// -// +checkescape:hard,stack -// -//go:nosplit -func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool { - w := emptyWalker{ - pageTables: p, - } - w.iterateRange(uintptr(addr), uintptr(addr)+length) - return w.visitor.count == 0 -} - -// lookupVisitor is used for lookup. -type lookupVisitor struct { - target uintptr // Input. - physical uintptr // Output. - opts MapOpts // Output. -} - -// visit matches the given address. -// -//go:nosplit -func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) { - if !pte.Valid() { - return - } - v.physical = pte.Address() + (start - uintptr(v.target)) - v.opts = pte.Opts() -} - -//go:nosplit -func (*lookupVisitor) requiresAlloc() bool { return false } - -//go:nosplit -func (*lookupVisitor) requiresSplit() bool { return false } - -// Lookup returns the physical address for the given virtual address. -// -// +checkescape:hard,stack -// -//go:nosplit -func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { - mask := uintptr(usermem.PageSize - 1) - offset := uintptr(addr) & mask - w := lookupWalker{ - pageTables: p, - visitor: lookupVisitor{ - target: uintptr(addr &^ usermem.Addr(mask)), - }, - } - w.iterateRange(uintptr(addr), uintptr(addr)+1) - return w.visitor.physical + offset, w.visitor.opts -} - -// MarkReadOnlyShared marks the pagetables read-only and can be shared. -// -// It is usually used on the pagetables that are used as the upper -func (p *PageTables) MarkReadOnlyShared() { - p.readOnlyShared = true -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go deleted file mode 100644 index 520161755..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package pagetables - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/usermem" -) - -// archPageTables is architecture-specific data. -type archPageTables struct { - // root is the pagetable root for kernel space. - root *PTEs - - // rootPhysical is the cached physical address of the root. - // - // This is saved only to prevent constant translation. - rootPhysical uintptr - - asid uint16 -} - -// TTBR0_EL1 returns the translation table base register 0. -// -//go:nosplit -func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 { - return uint64(p.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset -} - -// TTBR1_EL1 returns the translation table base register 1. -// -//go:nosplit -func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 { - return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset -} - -// Bits in page table entries. -const ( - typeTable = 0x3 << 0 - typeSect = 0x1 << 0 - typePage = 0x3 << 0 - pteValid = 0x1 << 0 - pteTableBit = 0x1 << 1 - pteTypeMask = 0x3 << 0 - present = pteValid | pteTableBit - user = 0x1 << 6 /* AP[1] */ - readOnly = 0x1 << 7 /* AP[2] */ - accessed = 0x1 << 10 - dbm = 0x1 << 51 - writable = dbm - cont = 0x1 << 52 - pxn = 0x1 << 53 - xn = 0x1 << 54 - dirty = 0x1 << 55 - nG = 0x1 << 11 - shared = 0x3 << 8 -) - -const ( - mtDevicenGnRE = 0x1 << 2 - mtNormal = 0x4 << 2 -) - -const ( - executeDisable = xn - optionMask = 0xfff | 0xffff<<48 - protDefault = accessed | shared -) - -// MapOpts are x86 options. -type MapOpts struct { - // AccessType defines permissions. - AccessType usermem.AccessType - - // Global indicates the page is globally accessible. - Global bool - - // User indicates the page is a user page. - User bool -} - -// PTE is a page table entry. -type PTE uintptr - -// Clear clears this PTE, including sect page information. -// -//go:nosplit -func (p *PTE) Clear() { - atomic.StoreUintptr((*uintptr)(p), 0) -} - -// Valid returns true iff this entry is valid. -// -//go:nosplit -func (p *PTE) Valid() bool { - return atomic.LoadUintptr((*uintptr)(p))&present != 0 -} - -// Opts returns the PTE options. -// -// These are all options except Valid and Sect. -// -//go:nosplit -func (p *PTE) Opts() MapOpts { - v := atomic.LoadUintptr((*uintptr)(p)) - - return MapOpts{ - AccessType: usermem.AccessType{ - Read: true, - Write: v&readOnly == 0, - Execute: v&xn == 0, - }, - Global: v&nG == 0, - User: v&user != 0, - } -} - -// SetSect sets this page as a sect page. -// -// The page must not be valid or a panic will result. -// -//go:nosplit -func (p *PTE) SetSect() { - if p.Valid() { - // This is not allowed. - panic("SetSect called on valid page!") - } - atomic.StoreUintptr((*uintptr)(p), typeSect) -} - -// IsSect returns true iff this page is a sect page. -// -//go:nosplit -func (p *PTE) IsSect() bool { - return atomic.LoadUintptr((*uintptr)(p))&pteTypeMask == typeSect -} - -// Set sets this PTE value. -// -// This does not change the sect page property. -// -//go:nosplit -func (p *PTE) Set(addr uintptr, opts MapOpts) { - if !opts.AccessType.Any() { - p.Clear() - return - } - v := (addr &^ optionMask) | protDefault | nG | readOnly - - if p.IsSect() { - // Note that this is inherited from the previous instance. Set - // does not change the value of Sect. See above. - v |= typeSect - } else { - v |= typePage - } - - if opts.Global { - v = v &^ nG - } - - if opts.AccessType.Execute { - v = v &^ executeDisable - } else { - v |= executeDisable - } - if opts.AccessType.Write { - v = v &^ readOnly - } - - if opts.User { - v |= user - v |= mtNormal - } else { - v = v &^ user - v |= mtNormal - } - atomic.StoreUintptr((*uintptr)(p), v) -} - -// setPageTable sets this PTE value and forces the write bit and sect bit to -// be cleared. This is used explicitly for breaking sect pages. -// -//go:nosplit -func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { - addr := pt.Allocator.PhysicalFor(ptes) - if addr&^optionMask != addr { - // This should never happen. - panic("unaligned physical address!") - } - v := addr | typeTable | protDefault | mtNormal - atomic.StoreUintptr((*uintptr)(p), v) -} - -// Address extracts the address. This should only be used if Valid returns true. -// -//go:nosplit -func (p *PTE) Address() uintptr { - return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go deleted file mode 100644 index 4bdde8448..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -// Address constraints. -// -// The lowerTop and upperBottom currently apply to four-level pagetables; -// additional refactoring would be necessary to support five-level pagetables. -const ( - lowerTop = 0x00007fffffffffff - upperBottom = 0xffff800000000000 - - pteShift = 12 - pmdShift = 21 - pudShift = 30 - pgdShift = 39 - - pteMask = 0x1ff << pteShift - pmdMask = 0x1ff << pmdShift - pudMask = 0x1ff << pudShift - pgdMask = 0x1ff << pgdShift - - pteSize = 1 << pteShift - pmdSize = 1 << pmdShift - pudSize = 1 << pudShift - pgdSize = 1 << pgdShift - - executeDisable = 1 << 63 - entriesPerPage = 512 -) - -// InitArch does some additional initialization related to the architecture. -// -//go:nosplit -func (p *PageTables) InitArch(allocator Allocator) { - if p.upperSharedPageTables != nil { - p.cloneUpperShared() - } -} - -func pgdIndex(upperStart uintptr) uintptr { - if upperStart&(pgdSize-1) != 0 { - panic("upperStart should be pgd size aligned") - } - if upperStart >= upperBottom { - return entriesPerPage/2 + (upperStart-upperBottom)/pgdSize - } - if upperStart < lowerTop { - return upperStart / pgdSize - } - panic("upperStart should be in canonical range") -} - -// cloneUpperShared clone the upper from the upper shared page tables. -// -//go:nosplit -func (p *PageTables) cloneUpperShared() { - start := pgdIndex(p.upperStart) - copy(p.root[start:entriesPerPage], p.upperSharedPageTables.root[start:entriesPerPage]) -} - -// PTEs is a collection of entries. -type PTEs [entriesPerPage]PTE diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go deleted file mode 100644 index 54e8e554f..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package pagetables - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -func Test2MAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a huge page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x00007f0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read}}, - }) -} - -func Test1GAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a super page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x00007f0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestSplit1GPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a super page and knock out the middle. - pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*42) - pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x00007f0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read}}, - {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestSplit2MPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a huge page and knock out the middle. - pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*42) - pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x00007f0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read}}, - {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read}}, - }) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go deleted file mode 100644 index ad0e30c88..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -// Address constraints. -// -// The lowerTop and upperBottom currently apply to four-level pagetables; -// additional refactoring would be necessary to support five-level pagetables. -const ( - lowerTop = 0x0000ffffffffffff - upperBottom = 0xffff000000000000 - pteShift = 12 - pmdShift = 21 - pudShift = 30 - pgdShift = 39 - - pteMask = 0x1ff << pteShift - pmdMask = 0x1ff << pmdShift - pudMask = 0x1ff << pudShift - pgdMask = 0x1ff << pgdShift - - pteSize = 1 << pteShift - pmdSize = 1 << pmdShift - pudSize = 1 << pudShift - pgdSize = 1 << pgdShift - - ttbrASIDOffset = 48 - ttbrASIDMask = 0xff - - entriesPerPage = 512 -) - -// InitArch does some additional initialization related to the architecture. -// -//go:nosplit -func (p *PageTables) InitArch(allocator Allocator) { - if p.upperSharedPageTables != nil { - p.cloneUpperShared() - } else { - p.archPageTables.root = p.Allocator.NewPTEs() - p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root) - } -} - -// cloneUpperShared clone the upper from the upper shared page tables. -// -//go:nosplit -func (p *PageTables) cloneUpperShared() { - if p.upperStart != upperBottom { - panic("upperStart should be the same as upperBottom") - } - - p.archPageTables.root = p.upperSharedPageTables.archPageTables.root - p.archPageTables.rootPhysical = p.upperSharedPageTables.archPageTables.rootPhysical -} - -// PTEs is a collection of entries. -type PTEs [entriesPerPage]PTE diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go deleted file mode 100644 index 2f73d424f..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package pagetables - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -func Test2MAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a huge page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42) - pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*47) - - pt.Map(0xffff000000400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: false}, pteSize*42) - pt.Map(0xffffff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: false}, pmdSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}}, - {0x0000ff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: true}}, - {0xffff000000400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: false}}, - {0xffffff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: false}}, - }) -} - -func Test1GAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a super page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42) - pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}}, - {0x0000ff0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read, User: true}}, - }) -} - -func TestSplit1GPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a super page and knock out the middle. - pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*42) - pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pudSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x0000ff0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read, User: true}}, - {0x0000ff0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}}, - }) -} - -func TestSplit2MPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a huge page and knock out the middle. - pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*42) - pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pmdSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x0000ff0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read, User: true}}, - {0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}}, - }) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go deleted file mode 100644 index 5c88d087d..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -type mapping struct { - start uintptr - length uintptr - addr uintptr - opts MapOpts -} - -type checkVisitor struct { - expected []mapping // Input. - current int // Temporary. - found []mapping // Output. - failed string // Output. -} - -func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) { - v.found = append(v.found, mapping{ - start: start, - length: align + 1, - addr: pte.Address(), - opts: pte.Opts(), - }) - if v.failed != "" { - // Don't keep looking for errors. - return - } - - if v.current >= len(v.expected) { - v.failed = "more mappings than expected" - } else if v.expected[v.current].start != start { - v.failed = "start didn't match expected" - } else if v.expected[v.current].length != (align + 1) { - v.failed = "end didn't match expected" - } else if v.expected[v.current].addr != pte.Address() { - v.failed = "address didn't match expected" - } else if v.expected[v.current].opts != pte.Opts() { - v.failed = "opts didn't match" - } - v.current++ -} - -func (*checkVisitor) requiresAlloc() bool { return false } - -func (*checkVisitor) requiresSplit() bool { return false } - -func checkMappings(t *testing.T, pt *PageTables, m []mapping) { - // Iterate over all the mappings. - w := checkWalker{ - pageTables: pt, - visitor: checkVisitor{ - expected: m, - }, - } - w.iterateRange(0, ^uintptr(0)) - - // Were we expected additional mappings? - if w.visitor.failed == "" && w.visitor.current != len(w.visitor.expected) { - w.visitor.failed = "insufficient mappings found" - } - - // Emit a meaningful error message on failure. - if w.visitor.failed != "" { - t.Errorf("%s; got %#v, wanted %#v", w.visitor.failed, w.visitor.found, w.visitor.expected) - } -} - -func TestUnmap(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map and unmap one entry. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Unmap(0x400000, pteSize) - - checkMappings(t, pt, nil) -} - -func TestReadOnly(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map one entry. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestReadWrite(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map one entry. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - }) -} - -func TestSerialEntries(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map two sequential entries. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x401000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x401000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.ReadWrite}}, - }) -} - -func TestSpanningEntries(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Span a pgd with two pages. - pt.Map(0x00007efffffff000, 2*pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) - - checkMappings(t, pt, []mapping{ - {0x00007efffffff000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, - {0x00007f0000000000, pteSize, pteSize * 43, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestSparseEntries(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map two entries in different pgds. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x00007f0000000000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x00007f0000000000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.Read}}, - }) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go deleted file mode 100644 index 157438d9b..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build 386 amd64 - -package pagetables - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/usermem" -) - -// archPageTables is architecture-specific data. -type archPageTables struct { - // pcid is the value assigned by PCIDs.Assign. - // - // Note that zero is a valid PCID. - pcid uint16 -} - -// CR3 returns the CR3 value for these tables. -// -// This may be called in interrupt contexts. A PCID of zero always implies a -// flush and should be passed when PCIDs are not enabled. See pcids_x86.go for -// more information. -// -//go:nosplit -func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 { - // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). - const noFlushBit uint64 = 0x8000000000000000 - if noFlush && pcid != 0 { - return noFlushBit | uint64(p.rootPhysical) | uint64(pcid) - } - return uint64(p.rootPhysical) | uint64(pcid) -} - -// Bits in page table entries. -const ( - present = 0x001 - writable = 0x002 - user = 0x004 - writeThrough = 0x008 - cacheDisable = 0x010 - accessed = 0x020 - dirty = 0x040 - super = 0x080 - global = 0x100 - optionMask = executeDisable | 0xfff -) - -// MapOpts are x86 options. -type MapOpts struct { - // AccessType defines permissions. - AccessType usermem.AccessType - - // Global indicates the page is globally accessible. - Global bool - - // User indicates the page is a user page. - User bool -} - -// PTE is a page table entry. -type PTE uintptr - -// Clear clears this PTE, including super page information. -// -//go:nosplit -func (p *PTE) Clear() { - atomic.StoreUintptr((*uintptr)(p), 0) -} - -// Valid returns true iff this entry is valid. -// -//go:nosplit -func (p *PTE) Valid() bool { - return atomic.LoadUintptr((*uintptr)(p))&present != 0 -} - -// Opts returns the PTE options. -// -// These are all options except Valid and Super. -// -//go:nosplit -func (p *PTE) Opts() MapOpts { - v := atomic.LoadUintptr((*uintptr)(p)) - return MapOpts{ - AccessType: usermem.AccessType{ - Read: v&present != 0, - Write: v&writable != 0, - Execute: v&executeDisable == 0, - }, - Global: v&global != 0, - User: v&user != 0, - } -} - -// SetSuper sets this page as a super page. -// -// The page must not be valid or a panic will result. -// -//go:nosplit -func (p *PTE) SetSuper() { - if p.Valid() { - // This is not allowed. - panic("SetSuper called on valid page!") - } - atomic.StoreUintptr((*uintptr)(p), super) -} - -// IsSuper returns true iff this page is a super page. -// -//go:nosplit -func (p *PTE) IsSuper() bool { - return atomic.LoadUintptr((*uintptr)(p))&super != 0 -} - -// Set sets this PTE value. -// -// This does not change the super page property. -// -//go:nosplit -func (p *PTE) Set(addr uintptr, opts MapOpts) { - if !opts.AccessType.Any() { - p.Clear() - return - } - v := (addr &^ optionMask) | present | accessed - if opts.User { - v |= user - } - if opts.Global { - v |= global - } - if !opts.AccessType.Execute { - v |= executeDisable - } - if opts.AccessType.Write { - v |= writable | dirty - } - if p.IsSuper() { - // Note that this is inherited from the previous instance. Set - // does not change the value of Super. See above. - v |= super - } - atomic.StoreUintptr((*uintptr)(p), v) -} - -// setPageTable sets this PTE value and forces the write bit and super bit to -// be cleared. This is used explicitly for breaking super pages. -// -//go:nosplit -func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { - addr := pt.Allocator.PhysicalFor(ptes) - if addr&^optionMask != addr { - // This should never happen. - panic("unaligned physical address!") - } - v := addr | present | user | writable | accessed | dirty - atomic.StoreUintptr((*uintptr)(p), v) -} - -// Address extracts the address. This should only be used if Valid returns true. -// -//go:nosplit -func (p *PTE) Address() uintptr { - return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask -} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids.go b/pkg/sentry/platform/ring0/pagetables/pcids.go deleted file mode 100644 index 964496aac..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pcids.go +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -import ( - "gvisor.dev/gvisor/pkg/sync" -) - -// PCIDs is a simple PCID database. -// -// This is not protected by locks and is thus suitable for use only with a -// single CPU at a time. -type PCIDs struct { - // mu protects below. - mu sync.Mutex - - // cache are the assigned page tables. - cache map[*PageTables]uint16 - - // avail are available PCIDs. - avail []uint16 -} - -// NewPCIDs returns a new PCID database. -// -// start is the first index to assign. Typically this will be one, as the zero -// pcid will always be flushed on transition (see pagetables_x86.go). This may -// be more than one if specific PCIDs are reserved. -// -// Nil is returned iff the start and size are out of range. -func NewPCIDs(start, size uint16) *PCIDs { - if start+uint16(size) > limitPCID { - return nil // See comment. - } - p := &PCIDs{ - cache: make(map[*PageTables]uint16), - } - for pcid := start; pcid < start+size; pcid++ { - p.avail = append(p.avail, pcid) - } - return p -} - -// Assign assigns a PCID to the given PageTables. -// -// This may overwrite any previous assignment provided. If this in the case, -// true is returned to indicate that the PCID should be flushed. -func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { - p.mu.Lock() - if pcid, ok := p.cache[pt]; ok { - p.mu.Unlock() - return pcid, false // No flush. - } - - // Is there something available? - if len(p.avail) > 0 { - pcid := p.avail[len(p.avail)-1] - p.avail = p.avail[:len(p.avail)-1] - p.cache[pt] = pcid - - // We need to flush because while this is in the available - // pool, it may have been used previously. - p.mu.Unlock() - return pcid, true - } - - // Evict an existing table. - for old, pcid := range p.cache { - delete(p.cache, old) - p.cache[pt] = pcid - - // A flush is definitely required in this case, these page - // tables may still be active. (They will just be assigned some - // other PCID if and when they hit the given CPU again.) - p.mu.Unlock() - return pcid, true - } - - // No PCID. - p.mu.Unlock() - return 0, false -} - -// Drop drops references to a set of page tables. -func (p *PCIDs) Drop(pt *PageTables) { - p.mu.Lock() - if pcid, ok := p.cache[pt]; ok { - delete(p.cache, pt) - p.avail = append(p.avail, pcid) - } - p.mu.Unlock() -} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go deleted file mode 100644 index fbfd41d83..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package pagetables - -// limitPCID is the maximum value of PCIDs. -// -// In VMSAv8-64, the PCID(ASID) size is an IMPLEMENTATION DEFINED choice -// of 8 bits or 16 bits, and ID_AA64MMFR0_EL1.ASIDBits identifies the -// supported size. When an implementation supports a 16-bit ASID, TCR_ELx.AS -// selects whether the top 8 bits of the ASID are used. -var limitPCID uint16 - -// GetASIDBits return the system ASID bits, 8 or 16 bits. -func GetASIDBits() uint8 - -func init() { - limitPCID = uint16(1)<<GetASIDBits() - 1 -} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s deleted file mode 100644 index e9d62d768..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -#include "funcdata.h" -#include "textflag.h" - -#define ID_AA64MMFR0_ASIDBITS_SHIFT 4 -#define ID_AA64MMFR0_ASIDBITS_16 2 -#define TCR_EL1_AS_BIT 36 - -// GetASIDBits return the system ASID bits, 8 or 16 bits. -// -// func GetASIDBits() uint8 -TEXT ·GetASIDBits(SB),NOSPLIT,$0-1 - // First, check whether 16bits ASID is supported. - // ID_AA64MMFR0_EL1.ASIDBITS[7:4] == 0010. - WORD $0xd5380700 // MRS ID_AA64MMFR0_EL1, R0 - UBFX $ID_AA64MMFR0_ASIDBITS_SHIFT, R0, $4, R0 - CMPW $ID_AA64MMFR0_ASIDBITS_16, R0 - BNE bits_8 - - // Second, check whether 16bits ASID is enabled. - // TCR_EL1.AS[36] == 1. - WORD $0xd5382040 // MRS TCR_EL1, R0 - TBZ $TCR_EL1_AS_BIT, R0, bits_8 - MOVD $16, R0 - B done -bits_8: - MOVD $8, R0 -done: - MOVB R0, ret+0(FP) - RET diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go deleted file mode 100644 index 91fc5e8dd..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build i386 amd64 - -package pagetables - -// limitPCID is the maximum value of valid PCIDs. -const limitPCID = 4095 diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go deleted file mode 100644 index 8f9dacd93..000000000 --- a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package pagetables - -// Visitor is a generic type. -type Visitor interface { - // visit is called on each PTE. - visit(start uintptr, pte *PTE, align uintptr) - - // requiresAlloc indicates that new entries should be allocated within - // the walked range. - requiresAlloc() bool - - // requiresSplit indicates that entries in the given range should be - // split if they are huge or jumbo pages. - requiresSplit() bool -} - -// Walker walks page tables. -type Walker struct { - // pageTables are the tables to walk. - pageTables *PageTables - - // Visitor is the set of arguments. - visitor Visitor -} - -// iterateRange iterates over all appropriate levels of page tables for the given range. -// -// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The -// exception is super pages. If a valid super page (huge or jumbo) cannot be -// installed, then the walk will continue to individual entries. -// -// This algorithm will attempt to maximize the use of super pages whenever -// possible. Whether a super page is provided will be clear through the range -// provided in the callback. -// -// Note that if requiresAlloc is true, then no gaps will be present. However, -// if alloc is not set, then the iteration will likely be full of gaps. -// -// Note that this function should generally be avoided in favor of Map, Unmap, -// etc. when not necessary. -// -// Precondition: start must be page-aligned. -// -// Precondition: start must be less than end. -// -// Precondition: If requiresAlloc is true, then start and end should not span -// non-canonical ranges. If they do, a panic will result. -// -//go:nosplit -func (w *Walker) iterateRange(start, end uintptr) { - if start%pteSize != 0 { - panic("unaligned start") - } - if end < start { - panic("start > end") - } - if start < lowerTop { - if end <= lowerTop { - w.iterateRangeCanonical(start, end) - } else if end > lowerTop && end <= upperBottom { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - w.iterateRangeCanonical(start, lowerTop) - } else { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - w.iterateRangeCanonical(start, lowerTop) - w.iterateRangeCanonical(upperBottom, end) - } - } else if start < upperBottom { - if end <= upperBottom { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - } else { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - w.iterateRangeCanonical(upperBottom, end) - } - } else { - w.iterateRangeCanonical(start, end) - } -} - -// next returns the next address quantized by the given size. -// -//go:nosplit -func next(start uintptr, size uintptr) uintptr { - start &= ^(size - 1) - start += size - return start -} - -// iterateRangeCanonical walks a canonical range. -// -//go:nosplit -func (w *Walker) iterateRangeCanonical(start, end uintptr) { - for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { - var ( - pgdEntry = &w.pageTables.root[pgdIndex] - pudEntries *PTEs - ) - if !pgdEntry.Valid() { - if !w.visitor.requiresAlloc() { - // Skip over this entry. - start = next(start, pgdSize) - continue - } - - // Allocate a new pgd. - pudEntries = w.pageTables.Allocator.NewPTEs() - pgdEntry.setPageTable(w.pageTables, pudEntries) - } else { - pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) - } - - // Map the next level. - clearPUDEntries := uint16(0) - - for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { - var ( - pudEntry = &pudEntries[pudIndex] - pmdEntries *PTEs - ) - if !pudEntry.Valid() { - if !w.visitor.requiresAlloc() { - // Skip over this entry. - clearPUDEntries++ - start = next(start, pudSize) - continue - } - - // This level has 1-GB super pages. Is this - // entire region at least as large as a single - // PUD entry? If so, we can skip allocating a - // new page for the pmd. - if start&(pudSize-1) == 0 && end-start >= pudSize { - pudEntry.SetSuper() - w.visitor.visit(uintptr(start), pudEntry, pudSize-1) - if pudEntry.Valid() { - start = next(start, pudSize) - continue - } - } - - // Allocate a new pud. - pmdEntries = w.pageTables.Allocator.NewPTEs() - pudEntry.setPageTable(w.pageTables, pmdEntries) - - } else if pudEntry.IsSuper() { - // Does this page need to be split? - if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { - // Install the relevant entries. - pmdEntries = w.pageTables.Allocator.NewPTEs() - for index := uint16(0); index < entriesPerPage; index++ { - pmdEntries[index].SetSuper() - pmdEntries[index].Set( - pudEntry.Address()+(pmdSize*uintptr(index)), - pudEntry.Opts()) - } - pudEntry.setPageTable(w.pageTables, pmdEntries) - } else { - // A super page to be checked directly. - w.visitor.visit(uintptr(start), pudEntry, pudSize-1) - - // Might have been cleared. - if !pudEntry.Valid() { - clearPUDEntries++ - } - - // Note that the super page was changed. - start = next(start, pudSize) - continue - } - } else { - pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) - } - - // Map the next level, since this is valid. - clearPMDEntries := uint16(0) - - for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { - var ( - pmdEntry = &pmdEntries[pmdIndex] - pteEntries *PTEs - ) - if !pmdEntry.Valid() { - if !w.visitor.requiresAlloc() { - // Skip over this entry. - clearPMDEntries++ - start = next(start, pmdSize) - continue - } - - // This level has 2-MB huge pages. If this - // region is contined in a single PMD entry? - // As above, we can skip allocating a new page. - if start&(pmdSize-1) == 0 && end-start >= pmdSize { - pmdEntry.SetSuper() - w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) - if pmdEntry.Valid() { - start = next(start, pmdSize) - continue - } - } - - // Allocate a new pmd. - pteEntries = w.pageTables.Allocator.NewPTEs() - pmdEntry.setPageTable(w.pageTables, pteEntries) - - } else if pmdEntry.IsSuper() { - // Does this page need to be split? - if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { - // Install the relevant entries. - pteEntries = w.pageTables.Allocator.NewPTEs() - for index := uint16(0); index < entriesPerPage; index++ { - pteEntries[index].Set( - pmdEntry.Address()+(pteSize*uintptr(index)), - pmdEntry.Opts()) - } - pmdEntry.setPageTable(w.pageTables, pteEntries) - } else { - // A huge page to be checked directly. - w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) - - // Might have been cleared. - if !pmdEntry.Valid() { - clearPMDEntries++ - } - - // Note that the huge page was changed. - start = next(start, pmdSize) - continue - } - } else { - pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) - } - - // Map the next level, since this is valid. - clearPTEEntries := uint16(0) - - for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { - var ( - pteEntry = &pteEntries[pteIndex] - ) - if !pteEntry.Valid() && !w.visitor.requiresAlloc() { - clearPTEEntries++ - start += pteSize - continue - } - - // At this point, we are guaranteed that start%pteSize == 0. - w.visitor.visit(uintptr(start), pteEntry, pteSize-1) - if !pteEntry.Valid() { - if w.visitor.requiresAlloc() { - panic("PTE not set after iteration with requiresAlloc!") - } - clearPTEEntries++ - } - - // Note that the pte was changed. - start += pteSize - continue - } - - // Check if we no longer need this page. - if clearPTEEntries == entriesPerPage { - pmdEntry.Clear() - w.pageTables.Allocator.FreePTEs(pteEntries) - clearPMDEntries++ - } - } - - // Check if we no longer need this page. - if clearPMDEntries == entriesPerPage { - pudEntry.Clear() - w.pageTables.Allocator.FreePTEs(pmdEntries) - clearPUDEntries++ - } - } - - // Check if we no longer need this page. - if clearPUDEntries == entriesPerPage { - pgdEntry.Clear() - w.pageTables.Allocator.FreePTEs(pudEntries) - } - } -} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go deleted file mode 100644 index c261d393a..000000000 --- a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go +++ /dev/null @@ -1,314 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package pagetables - -// Visitor is a generic type. -type Visitor interface { - // visit is called on each PTE. - visit(start uintptr, pte *PTE, align uintptr) - - // requiresAlloc indicates that new entries should be allocated within - // the walked range. - requiresAlloc() bool - - // requiresSplit indicates that entries in the given range should be - // split if they are huge or jumbo pages. - requiresSplit() bool -} - -// Walker walks page tables. -type Walker struct { - // pageTables are the tables to walk. - pageTables *PageTables - - // Visitor is the set of arguments. - visitor Visitor -} - -// iterateRange iterates over all appropriate levels of page tables for the given range. -// -// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The -// exception is sect pages. If a valid sect page (huge or jumbo) cannot be -// installed, then the walk will continue to individual entries. -// -// This algorithm will attempt to maximize the use of sect pages whenever -// possible. Whether a sect page is provided will be clear through the range -// provided in the callback. -// -// Note that if requiresAlloc is true, then no gaps will be present. However, -// if alloc is not set, then the iteration will likely be full of gaps. -// -// Note that this function should generally be avoided in favor of Map, Unmap, -// etc. when not necessary. -// -// Precondition: start must be page-aligned. -// -// Precondition: start must be less than end. -// -// Precondition: If requiresAlloc is true, then start and end should not span -// non-canonical ranges. If they do, a panic will result. -// -//go:nosplit -func (w *Walker) iterateRange(start, end uintptr) { - if start%pteSize != 0 { - panic("unaligned start") - } - if end < start { - panic("start > end") - } - if start < lowerTop { - if end <= lowerTop { - w.iterateRangeCanonical(start, end) - } else if end > lowerTop && end <= upperBottom { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - w.iterateRangeCanonical(start, lowerTop) - } else { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - w.iterateRangeCanonical(start, lowerTop) - w.iterateRangeCanonical(upperBottom, end) - } - } else if start < upperBottom { - if end <= upperBottom { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - } else { - if w.visitor.requiresAlloc() { - panic("alloc spans non-canonical range") - } - w.iterateRangeCanonical(upperBottom, end) - } - } else { - w.iterateRangeCanonical(start, end) - } -} - -// next returns the next address quantized by the given size. -// -//go:nosplit -func next(start uintptr, size uintptr) uintptr { - start &= ^(size - 1) - start += size - return start -} - -// iterateRangeCanonical walks a canonical range. -// -//go:nosplit -func (w *Walker) iterateRangeCanonical(start, end uintptr) { - pgdEntryIndex := w.pageTables.root - if start >= upperBottom { - pgdEntryIndex = w.pageTables.archPageTables.root - } - - for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { - var ( - pgdEntry = &pgdEntryIndex[pgdIndex] - pudEntries *PTEs - ) - if !pgdEntry.Valid() { - if !w.visitor.requiresAlloc() { - // Skip over this entry. - start = next(start, pgdSize) - continue - } - - // Allocate a new pgd. - pudEntries = w.pageTables.Allocator.NewPTEs() - pgdEntry.setPageTable(w.pageTables, pudEntries) - } else { - pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) - } - - // Map the next level. - clearPUDEntries := uint16(0) - - for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { - var ( - pudEntry = &pudEntries[pudIndex] - pmdEntries *PTEs - ) - if !pudEntry.Valid() { - if !w.visitor.requiresAlloc() { - // Skip over this entry. - clearPUDEntries++ - start = next(start, pudSize) - continue - } - - // This level has 1-GB sect pages. Is this - // entire region at least as large as a single - // PUD entry? If so, we can skip allocating a - // new page for the pmd. - if start&(pudSize-1) == 0 && end-start >= pudSize { - pudEntry.SetSect() - w.visitor.visit(uintptr(start), pudEntry, pudSize-1) - if pudEntry.Valid() { - start = next(start, pudSize) - continue - } - } - - // Allocate a new pud. - pmdEntries = w.pageTables.Allocator.NewPTEs() - pudEntry.setPageTable(w.pageTables, pmdEntries) - - } else if pudEntry.IsSect() { - // Does this page need to be split? - if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { - // Install the relevant entries. - pmdEntries = w.pageTables.Allocator.NewPTEs() - for index := uint16(0); index < entriesPerPage; index++ { - pmdEntries[index].SetSect() - pmdEntries[index].Set( - pudEntry.Address()+(pmdSize*uintptr(index)), - pudEntry.Opts()) - } - pudEntry.setPageTable(w.pageTables, pmdEntries) - } else { - // A sect page to be checked directly. - w.visitor.visit(uintptr(start), pudEntry, pudSize-1) - - // Might have been cleared. - if !pudEntry.Valid() { - clearPUDEntries++ - } - - // Note that the sect page was changed. - start = next(start, pudSize) - continue - } - - } else { - pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) - } - - // Map the next level, since this is valid. - clearPMDEntries := uint16(0) - - for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { - var ( - pmdEntry = &pmdEntries[pmdIndex] - pteEntries *PTEs - ) - if !pmdEntry.Valid() { - if !w.visitor.requiresAlloc() { - // Skip over this entry. - clearPMDEntries++ - start = next(start, pmdSize) - continue - } - - // This level has 2-MB huge pages. If this - // region is contined in a single PMD entry? - // As above, we can skip allocating a new page. - if start&(pmdSize-1) == 0 && end-start >= pmdSize { - pmdEntry.SetSect() - w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) - if pmdEntry.Valid() { - start = next(start, pmdSize) - continue - } - } - - // Allocate a new pmd. - pteEntries = w.pageTables.Allocator.NewPTEs() - pmdEntry.setPageTable(w.pageTables, pteEntries) - - } else if pmdEntry.IsSect() { - // Does this page need to be split? - if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { - // Install the relevant entries. - pteEntries = w.pageTables.Allocator.NewPTEs() - for index := uint16(0); index < entriesPerPage; index++ { - pteEntries[index].Set( - pmdEntry.Address()+(pteSize*uintptr(index)), - pmdEntry.Opts()) - } - pmdEntry.setPageTable(w.pageTables, pteEntries) - } else { - // A huge page to be checked directly. - w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) - - // Might have been cleared. - if !pmdEntry.Valid() { - clearPMDEntries++ - } - - // Note that the huge page was changed. - start = next(start, pmdSize) - continue - } - - } else { - pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) - } - - // Map the next level, since this is valid. - clearPTEEntries := uint16(0) - - for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { - var ( - pteEntry = &pteEntries[pteIndex] - ) - if !pteEntry.Valid() && !w.visitor.requiresAlloc() { - clearPTEEntries++ - start += pteSize - continue - } - - // At this point, we are guaranteed that start%pteSize == 0. - w.visitor.visit(uintptr(start), pteEntry, pteSize-1) - if !pteEntry.Valid() { - if w.visitor.requiresAlloc() { - panic("PTE not set after iteration with requiresAlloc!") - } - clearPTEEntries++ - } - - // Note that the pte was changed. - start += pteSize - continue - } - - // Check if we no longer need this page. - if clearPTEEntries == entriesPerPage { - pmdEntry.Clear() - w.pageTables.Allocator.FreePTEs(pteEntries) - clearPMDEntries++ - } - } - - // Check if we no longer need this page. - if clearPMDEntries == entriesPerPage { - pudEntry.Clear() - w.pageTables.Allocator.FreePTEs(pmdEntries) - clearPUDEntries++ - } - } - - // Check if we no longer need this page. - if clearPUDEntries == entriesPerPage { - pgdEntry.Clear() - w.pageTables.Allocator.FreePTEs(pudEntries) - } - } -} diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go deleted file mode 100644 index cdeb1b43a..000000000 --- a/pkg/sentry/platform/ring0/ring0.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ring0 provides basic operating system-level stubs. -package ring0 diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go deleted file mode 100644 index 34fbc1c35..000000000 --- a/pkg/sentry/platform/ring0/x86.go +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build 386 amd64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/cpuid" -) - -// Useful bits. -const ( - _CR0_PE = 1 << 0 - _CR0_ET = 1 << 4 - _CR0_AM = 1 << 18 - _CR0_PG = 1 << 31 - - _CR4_PSE = 1 << 4 - _CR4_PAE = 1 << 5 - _CR4_PGE = 1 << 7 - _CR4_OSFXSR = 1 << 9 - _CR4_OSXMMEXCPT = 1 << 10 - _CR4_FSGSBASE = 1 << 16 - _CR4_PCIDE = 1 << 17 - _CR4_OSXSAVE = 1 << 18 - _CR4_SMEP = 1 << 20 - - _RFLAGS_AC = 1 << 18 - _RFLAGS_NT = 1 << 14 - _RFLAGS_IOPL0 = 1 << 12 - _RFLAGS_IOPL1 = 1 << 13 - _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1 - _RFLAGS_DF = 1 << 10 - _RFLAGS_IF = 1 << 9 - _RFLAGS_STEP = 1 << 8 - _RFLAGS_RESERVED = 1 << 1 - - _EFER_SCE = 0x001 - _EFER_LME = 0x100 - _EFER_LMA = 0x400 - _EFER_NX = 0x800 - - _MSR_STAR = 0xc0000081 - _MSR_LSTAR = 0xc0000082 - _MSR_CSTAR = 0xc0000083 - _MSR_SYSCALL_MASK = 0xc0000084 - _MSR_PLATFORM_INFO = 0xce - _MSR_MISC_FEATURES = 0x140 - - _PLATFORM_INFO_CPUID_FAULT = 1 << 31 - - _MISC_FEATURE_CPUID_TRAP = 0x1 -) - -const ( - // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = _RFLAGS_RESERVED - - // UserFlagsSet are always set in userspace. - // - // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege - // level. The Current Privilege Level (CPL) of the task must be less - // than or equal to the IOPL in order for the task or program to access - // I/O ports. - // - // Here, _RFLAGS_IOPL0 is used only to determine whether the task is - // running in the kernel or userspace mode. In the user mode, the CPL is - // always 3 and it doesn't matter what IOPL is set if it is bellow CPL. - // - // We need to have one bit which will be always different in user and - // kernel modes. And we have to remember that even though we have - // KernelFlagsClear, we still can see some of these flags in the kernel - // mode. This can happen when the goruntime switches on a goroutine - // which has been saved in the host mode. On restore, the popf - // instruction is used to restore flags and this means that all flags - // what the goroutine has in the host mode will be restored in the - // kernel mode. - // - // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set - // it in the user mode. So if this flag is set, the task is running in - // the user mode and if it isn't set, the task is running in the kernel - // mode. - UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0 - - // KernelFlagsClear should always be clear in the kernel. - KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT - - // UserFlagsClear are always cleared in userspace. - UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1 -) - -// IsKernelFlags returns true if rflags coresponds to the kernel mode. -// -// go:nosplit -func IsKernelFlags(rflags uint64) bool { - return rflags&_RFLAGS_IOPL0 == 0 -} - -// Vector is an exception vector. -type Vector uintptr - -// Exception vectors. -const ( - DivideByZero Vector = iota - Debug - NMI - Breakpoint - Overflow - BoundRangeExceeded - InvalidOpcode - DeviceNotAvailable - DoubleFault - CoprocessorSegmentOverrun - InvalidTSS - SegmentNotPresent - StackSegmentFault - GeneralProtectionFault - PageFault - _ - X87FloatingPointException - AlignmentCheck - MachineCheck - SIMDFloatingPointException - VirtualizationException - SecurityException = 0x1e - SyscallInt80 = 0x80 - _NR_INTERRUPTS = 0x100 -) - -// System call vectors. -const ( - Syscall Vector = _NR_INTERRUPTS -) - -// VirtualAddressBits returns the number bits available for virtual addresses. -// -// Note that sign-extension semantics apply to the highest order bit. -// -// FIXME(b/69382326): This should use the cpuid passed to Init. -func VirtualAddressBits() uint32 { - ax, _, _, _ := cpuid.HostID(0x80000008, 0) - return (ax >> 8) & 0xff -} - -// PhysicalAddressBits returns the number of bits available for physical addresses. -// -// FIXME(b/69382326): This should use the cpuid passed to Init. -func PhysicalAddressBits() uint32 { - ax, _, _, _ := cpuid.HostID(0x80000008, 0) - return ax & 0xff -} - -// Selector is a segment Selector. -type Selector uint16 - -// SegmentDescriptor is a segment descriptor. -type SegmentDescriptor struct { - bits [2]uint32 -} - -// descriptorTable is a collection of descriptors. -type descriptorTable [32]SegmentDescriptor - -// SegmentDescriptorFlags are typed flags within a descriptor. -type SegmentDescriptorFlags uint32 - -// SegmentDescriptorFlag declarations. -const ( - SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set). - SegmentDescriptorWrite = 1 << 9 // Write permission. - SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used. - SegmentDescriptorExecute = 1 << 11 // Execute permission. - SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data. - SegmentDescriptorPresent = 1 << 15 // Present. - SegmentDescriptorAVL = 1 << 20 // Available. - SegmentDescriptorLong = 1 << 21 // Long mode. - SegmentDescriptorDB = 1 << 22 // 16 or 32-bit. - SegmentDescriptorG = 1 << 23 // Granularity: page or byte. -) - -// Base returns the descriptor's base linear address. -func (d *SegmentDescriptor) Base() uint32 { - return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16 -} - -// Limit returns the descriptor size. -func (d *SegmentDescriptor) Limit() uint32 { - l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000 - if d.bits[1]&uint32(SegmentDescriptorG) != 0 { - l <<= 12 - l |= 0xFFF - } - return l -} - -// Flags returns descriptor flags. -func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags { - return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00) -} - -// DPL returns the descriptor privilege level. -func (d *SegmentDescriptor) DPL() int { - return int((d.bits[1] >> 13) & 3) -} - -func (d *SegmentDescriptor) setNull() { - d.bits[0] = 0 - d.bits[1] = 0 -} - -func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) { - flags |= SegmentDescriptorPresent - if limit>>12 != 0 { - limit >>= 12 - flags |= SegmentDescriptorG - } - d.bits[0] = base<<16 | limit&0xFFFF - d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13 -} - -func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) { - d.set(base, limit, dpl, - SegmentDescriptorDB| - SegmentDescriptorExecute| - SegmentDescriptorSystem) -} - -func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) { - d.set(base, limit, dpl, - SegmentDescriptorG| - SegmentDescriptorLong| - SegmentDescriptorExecute| - SegmentDescriptorSystem) -} - -func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) { - d.set(base, limit, dpl, - SegmentDescriptorWrite| - SegmentDescriptorSystem) -} - -// setHi is only used for the TSS segment, which is magically 64-bits. -func (d *SegmentDescriptor) setHi(base uint32) { - d.bits[0] = base - d.bits[1] = 0 -} - -// Gate64 is a 64-bit task, trap, or interrupt gate. -type Gate64 struct { - bits [4]uint32 -} - -// idt64 is a 64-bit interrupt descriptor table. -type idt64 [_NR_INTERRUPTS]Gate64 - -func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { - g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF - g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 - g.bits[2] = uint32(rip >> 32) -} - -func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { - g.setInterrupt(cs, rip, dpl, ist) - g.bits[1] |= 1 << 8 -} - -// TaskState64 is a 64-bit task state structure. -type TaskState64 struct { - _ uint32 - rsp0Lo, rsp0Hi uint32 - rsp1Lo, rsp1Hi uint32 - rsp2Lo, rsp2Hi uint32 - _ [2]uint32 - ist1Lo, ist1Hi uint32 - ist2Lo, ist2Hi uint32 - ist3Lo, ist3Hi uint32 - ist4Lo, ist4Hi uint32 - ist5Lo, ist5Hi uint32 - ist6Lo, ist6Hi uint32 - ist7Lo, ist7Hi uint32 - _ [2]uint32 - _ uint16 - ioPerm uint16 -} diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 7065a0e46..69693f263 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -251,11 +251,11 @@ var errStackType = syserr.New("expected but did not receive a netstack.Stack", l type commonEndpoint interface { // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and // transport.Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and // transport.Endpoint.GetRemoteAddress. - GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) // Readiness implements tcpip.Endpoint.Readiness and // transport.Endpoint.Readiness. @@ -263,19 +263,19 @@ type commonEndpoint interface { // SetSockOpt implements tcpip.Endpoint.SetSockOpt and // transport.Endpoint.SetSockOpt. - SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error + SetSockOpt(tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and // transport.Endpoint.SetSockOptInt. - SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error + SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. - GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error + GetSockOpt(tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. - GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns a socket's lifecycle state. The returned value is // protocol-specific and is primarily used for diagnostics. @@ -283,7 +283,7 @@ type commonEndpoint interface { // LastError implements tcpip.Endpoint.LastError and // transport.Endpoint.LastError. - LastError() *tcpip.Error + LastError() tcpip.Error // SocketOptions implements tcpip.Endpoint.SocketOptions and // transport.Endpoint.SocketOptions. @@ -442,7 +442,7 @@ func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Write func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) - if err == tcpip.ErrWouldBlock { + if _, ok := err.(*tcpip.ErrWouldBlock); ok { return 0, syserror.ErrWouldBlock } if err != nil { @@ -459,17 +459,24 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO var _ tcpip.Payloader = (*limitedPayloader)(nil) type limitedPayloader struct { - io.LimitedReader + inner io.LimitedReader + err error } -func (l limitedPayloader) Len() int { - return int(l.N) +func (l *limitedPayloader) Read(p []byte) (int, error) { + n, err := l.inner.Read(p) + l.err = err + return n, err +} + +func (l *limitedPayloader) Len() int { + return int(l.inner.N) } // ReadFrom implements fs.FileOperations.ReadFrom. func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { f := limitedPayloader{ - LimitedReader: io.LimitedReader{ + inner: io.LimitedReader{ R: r, N: count, }, @@ -479,8 +486,8 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader // so we can't release the lock while copying data. Atomic: true, }) - if err == tcpip.ErrBadBuffer { - err = nil + if _, ok := err.(*tcpip.ErrBadBuffer); ok { + return n, f.err } return n, syserr.TranslateNetstackError(err).ToError() } @@ -526,7 +533,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool if family == linux.AF_UNSPEC { err := s.Endpoint.Disconnect() - if err == tcpip.ErrNotSupported { + if _, ok := err.(*tcpip.ErrNotSupported); ok { return syserr.ErrAddressFamilyNotSupported } return syserr.TranslateNetstackError(err) @@ -548,15 +555,16 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool s.EventRegister(&e, waiter.EventOut) defer s.EventUnregister(&e) - if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting { + switch err := s.Endpoint.Connect(addr); err.(type) { + case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: + case *tcpip.ErrNoPortAvailable: if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { // TCP unlike UDP returns EADDRNOTAVAIL when it can't // find an available local ephemeral port. - if err == tcpip.ErrNoPortAvailable { - return syserr.ErrAddressNotAvailable - } + return syserr.ErrAddressNotAvailable } - + return syserr.TranslateNetstackError(err) + default: return syserr.TranslateNetstackError(err) } @@ -614,16 +622,16 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Issue the bind request to the endpoint. err := s.Endpoint.Bind(addr) - if err == tcpip.ErrNoPortAvailable { + if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { // Bind always returns EADDRINUSE irrespective of if the specified port was // already bound or if an ephemeral port was requested but none were // available. // - // tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because + // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because // UDP connect returns EAGAIN on ephemeral port exhaustion. // // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. - err = tcpip.ErrPortInUse + err = &tcpip.ErrPortInUse{} } return syserr.TranslateNetstackError(err) @@ -646,7 +654,8 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAdd // Try to accept the connection again; if it fails, then wait until we // get a notification. for { - if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock { + ep, wq, err := s.Endpoint.Accept(peerAddr) + if _, ok := err.(*tcpip.ErrWouldBlock); !ok { return ep, wq, syserr.TranslateNetstackError(err) } @@ -665,7 +674,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { - if terr != tcpip.ErrWouldBlock || !blocking { + if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } @@ -1098,6 +1107,29 @@ func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. info := linux.TCPInfo{} + switch v.CcState { + case tcpip.RTORecovery: + info.CaState = linux.TCP_CA_Loss + case tcpip.FastRecovery, tcpip.SACKRecovery: + info.CaState = linux.TCP_CA_Recovery + case tcpip.Disorder: + info.CaState = linux.TCP_CA_Disorder + case tcpip.Open: + info.CaState = linux.TCP_CA_Open + } + info.RTO = uint32(v.RTO / time.Microsecond) + info.RTT = uint32(v.RTT / time.Microsecond) + info.RTTVar = uint32(v.RTTVar / time.Microsecond) + info.SndSsthresh = v.SndSsthresh + info.SndCwnd = v.SndCwnd + + // In netstack reorderSeen is updated only when RACK is enabled. + // We only track whether the reordering is seen, which is + // different than Linux where reorderSeen is not specific to + // RACK and is incremented when a reordering event is seen. + if v.ReorderSeen { + info.ReordSeen = 1 + } // Linux truncates the output binary to outLen. buf := t.CopyScratchBuffer(info.SizeBytes()) @@ -2534,7 +2566,7 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq defer s.readMu.Unlock() res, err := s.Endpoint.Read(w, readOptions) - if err == tcpip.ErrBadBuffer && dst.NumBytes() == 0 { + if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 { err = nil } if err != nil { @@ -2634,9 +2666,9 @@ func (s *socketOpsCommon) dequeueErr() *tcpip.SockError { } // Update socket error to reflect ICMP errors in queue. - if nextErr := so.PeekErr(); nextErr != nil && nextErr.ErrOrigin.IsICMPErr() { + if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() { so.SetLastError(nextErr.Err) - } else if err.ErrOrigin.IsICMPErr() { + } else if err.Cause.Origin().IsICMPErr() { so.SetLastError(nil) } return err @@ -2790,13 +2822,15 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if flags&linux.MSG_DONTWAIT != 0 { return int(total), syserr.TranslateNetstackError(err) } - switch err { + block := true + switch err.(type) { case nil: - if total == src.NumBytes() { - break - } - fallthrough - case tcpip.ErrWouldBlock: + block = total != src.NumBytes() + case *tcpip.ErrWouldBlock: + default: + block = false + } + if block { if ch == nil { // We'll have to block. Register for notification and keep trying to // send all the data. diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 3bbdf552e..24922c400 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -130,7 +130,7 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) - if err == tcpip.ErrWouldBlock { + if _, ok := err.(*tcpip.ErrWouldBlock); ok { return 0, syserror.ErrWouldBlock } if err != nil { @@ -154,7 +154,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block } ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { - if terr != tcpip.ErrWouldBlock || !blocking { + if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index c847ff1c7..2515dda80 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -118,7 +118,7 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* // Create the endpoint. var ep tcpip.Endpoint - var e *tcpip.Error + var e tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go index 0af805246..ba1cc79e9 100644 --- a/pkg/sentry/socket/netstack/provider_vfs2.go +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -62,7 +62,7 @@ func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int // Create the endpoint. var ep tcpip.Endpoint - var e *tcpip.Error + var e tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 97729dacc..cc535d794 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -81,10 +81,10 @@ func sockErrCmsgToLinux(sockErr *tcpip.SockError) linux.SockErrCMsg { ee := linux.SockExtendedErr{ Errno: uint32(syserr.TranslateNetstackError(sockErr.Err).ToLinux().Number()), - Origin: errOriginToLinux(sockErr.ErrOrigin), - Type: sockErr.ErrType, - Code: sockErr.ErrCode, - Info: sockErr.ErrInfo, + Origin: errOriginToLinux(sockErr.Cause.Origin()), + Type: sockErr.Cause.Type(), + Code: sockErr.Cause.Code(), + Info: sockErr.Cause.Info(), } switch sockErr.NetProto { diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index b011082dc..fc5b823b0 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -48,7 +48,7 @@ type ConnectingEndpoint interface { Type() linux.SockType // GetLocalAddress returns the bound path. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Locker protects the following methods. While locked, only the holder of // the lock can change the return value of the protected methods. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 0e3889c6d..70227bbd2 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -169,32 +169,32 @@ type Endpoint interface { Type() linux.SockType // GetLocalAddress returns the address to which the endpoint is bound. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // GetRemoteAddress returns the address to which the endpoint is // connected. - GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) // SetSockOpt sets a socket option. - SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error + SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt sets a socket option for simple cases when a value has // the int type. - SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error + SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt gets a socket option. - GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error + GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt gets a socket option for simple cases when a return // value has the int type. - GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 // LastError clears and returns the last error reported by the endpoint. - LastError() *tcpip.Error + LastError() tcpip.Error // SocketOptions returns the structure which contains all the socket // level options. @@ -580,7 +580,7 @@ type ConnectedEndpoint interface { Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Send sends a single message. This method does not block. // @@ -640,7 +640,7 @@ type connectedEndpoint struct { Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Type implements Endpoint.Type. Type() linux.SockType @@ -655,7 +655,7 @@ func (e *connectedEndpoint) Passcred() bool { } // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. -func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return e.endpoint.GetLocalAddress() } @@ -836,11 +836,11 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess } // SetSockOpt sets a socket option. -func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { +func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return nil } -func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { +func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { switch opt { case tcpip.ReceiveBufferSizeOption: default: @@ -855,34 +855,34 @@ func (e *baseEndpoint) IsUnixSocket() bool { } // GetSendBufferSize implements tcpip.SocketOptionsHandler.GetSendBufferSize. -func (e *baseEndpoint) GetSendBufferSize() (int64, *tcpip.Error) { +func (e *baseEndpoint) GetSendBufferSize() (int64, tcpip.Error) { e.Lock() defer e.Unlock() if !e.Connected() { - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v := e.connected.SendMaxQueueSize() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return v, nil } -func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { +func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.Lock() if !e.Connected() { e.Unlock() - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v = int(e.receiver.RecvQueuedSize()) e.Unlock() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return v, nil @@ -890,12 +890,12 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.Lock() if !e.Connected() { e.Unlock() - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v := e.connected.SendQueuedSize() e.Unlock() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return int(v), nil @@ -903,29 +903,29 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.Lock() if e.receiver == nil { e.Unlock() - return -1, tcpip.ErrNotConnected + return -1, &tcpip.ErrNotConnected{} } v := e.receiver.RecvMaxQueueSize() e.Unlock() if v < 0 { - return -1, tcpip.ErrQueueSizeNotSupported + return -1, &tcpip.ErrQueueSizeNotSupported{} } return int(v), nil default: log.Warningf("Unsupported socket option: %d", opt) - return -1, tcpip.ErrUnknownProtocolOption + return -1, &tcpip.ErrUnknownProtocolOption{} } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { +func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { log.Warningf("Unsupported socket option: %T", opt) - return tcpip.ErrUnknownProtocolOption + return &tcpip.ErrUnknownProtocolOption{} } // LastError implements Endpoint.LastError. -func (*baseEndpoint) LastError() *tcpip.Error { +func (*baseEndpoint) LastError() tcpip.Error { return nil } @@ -965,7 +965,7 @@ func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { } // GetLocalAddress returns the bound path. -func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.Lock() defer e.Unlock() return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil @@ -973,14 +973,14 @@ func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { // GetRemoteAddress returns the local address of the connected endpoint (if // available). -func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { +func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.Lock() c := e.connected e.Unlock() if c != nil { return c.GetLocalAddress() } - return tcpip.FullAddress{}, tcpip.ErrNotConnected + return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Release implements BoundEndpoint.Release. diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index dab6207c0..d1778d029 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -134,8 +134,8 @@ func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op s // Similar to EPIPE. Return what we wrote this time, and let // ENOSPC be returned on the next call. return true, nil - case syserror.ECONNRESET: - // For TCP sendfile connections, we may have a reset. But we + case syserror.ECONNRESET, syserror.ETIMEDOUT: + // For TCP sendfile connections, we may have a reset or timeout. But we // should just return n as the result. return true, nil case syserror.ErrWouldBlock: diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index e39f074f2..1a31898e8 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -123,6 +123,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: + // allowed + default: + return 0, nil, syserror.EBADF + } + } + switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: minfd := args[2].Int() @@ -395,6 +404,10 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + return 0, nil, syserror.EBADF + } + // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { return 0, nil, syserror.ESPIPE diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 20c264fef..c7c3fed57 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -32,6 +32,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + return 0, nil, syserror.EBADF + } + // Handle ioctls that apply to all FDs. switch args[1].Int() { case linux.FIONCLEX: diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 6e9b599e2..1f8a5878c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -36,6 +36,10 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } defer file.DecRef(t) + if file.StatusFlags()&linux.O_PATH != 0 { + return 0, nil, syserror.EBADF + } + return 0, nil, file.SyncFS(t) } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index a3868bf16..df4990854 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -83,6 +83,7 @@ go_library( "mount.go", "mount_namespace_refs.go", "mount_unsafe.go", + "opath.go", "options.go", "pathname.go", "permissions.go", diff --git a/pkg/sentry/vfs/opath.go b/pkg/sentry/vfs/opath.go new file mode 100644 index 000000000..39fbac987 --- /dev/null +++ b/pkg/sentry/vfs/opath.go @@ -0,0 +1,139 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// opathFD implements vfs.FileDescriptionImpl for a file description opened with O_PATH. +// +// +stateify savable +type opathFD struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + NoLockFD +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *opathFD) Release(context.Context) { + // noop +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EBADF +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.EBADF +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return syserror.EBADF +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.EBADF +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return syserror.EBADF +} + +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return nil, syserror.EBADF +} + +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { + return "", syserror.EBADF +} + +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { + return syserror.EBADF +} + +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { + return syserror.EBADF +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *opathFD) Sync(ctx context.Context) error { + return syserror.EBADF +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { + return syserror.EBADF +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + vfsObj := fd.vfsfd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vfsfd.vd, + Start: fd.vfsfd.vd, + }) + stat, err := fd.vfsfd.vd.mount.fs.impl.StatAt(ctx, rp, opts) + vfsObj.putResolvingPath(ctx, rp) + return stat, err +} + +// StatFS returns metadata for the filesystem containing the file represented +// by fd. +func (fd *opathFD) StatFS(ctx context.Context) (linux.Statfs, error) { + vfsObj := fd.vfsfd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vfsfd.vd, + Start: fd.vfsfd.vd, + }) + statfs, err := fd.vfsfd.vd.mount.fs.impl.StatFSAt(ctx, rp) + vfsObj.putResolvingPath(ctx, rp) + return statfs, err +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index bc79e5ecc..c9907843c 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -129,7 +129,7 @@ type OpenOptions struct { // // FilesystemImpls are responsible for implementing the following flags: // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, - // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and + // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_SYNC, O_TMPFILE, and // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file // descriptors are mostly outside the scope of VFS. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 6fd1bb0b2..0aff2dd92 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -425,6 +425,18 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential rp.mustBeDir = true rp.mustBeDirOrig = true } + if opts.Flags&linux.O_PATH != 0 { + vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) + if err != nil { + return nil, err + } + fd := &opathFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}); err != nil { + return nil, err + } + vd.DecRef(ctx) + return &fd.vfsfd, err + } for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { |