diff options
Diffstat (limited to 'pkg')
591 files changed, 6761 insertions, 10657 deletions
diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go index 3059479bd..008bbca08 100644 --- a/pkg/abi/abi_linux.go +++ b/pkg/abi/abi_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package abi diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index eb004a7f6..3576396c1 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -41,6 +41,7 @@ go_library( "linux.go", "membarrier.go", "mm.go", + "msgqueue.go", "netdevice.go", "netfilter.go", "netfilter_ipv6.go", diff --git a/pkg/abi/linux/arch_amd64.go b/pkg/abi/linux/arch_amd64.go index 0be31e755..064c0a6da 100644 --- a/pkg/abi/linux/arch_amd64.go +++ b/pkg/abi/linux/arch_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/clone.go b/pkg/abi/linux/clone.go index c2cbfca5e..322a4ef5a 100644 --- a/pkg/abi/linux/clone.go +++ b/pkg/abi/linux/clone.go @@ -16,13 +16,16 @@ package linux // Clone constants per clone(2). const ( + CSIGNAL = 0xff + CLONE_VM = 0x100 CLONE_FS = 0x200 CLONE_FILES = 0x400 CLONE_SIGHAND = 0x800 - CLONE_PARENT = 0x8000 + CLONE_PIDFD = 0x1000 CLONE_PTRACE = 0x2000 CLONE_VFORK = 0x4000 + CLONE_PARENT = 0x8000 CLONE_THREAD = 0x10000 CLONE_NEWNS = 0x20000 CLONE_SYSVSEM = 0x40000 @@ -32,10 +35,30 @@ const ( CLONE_DETACHED = 0x400000 CLONE_UNTRACED = 0x800000 CLONE_CHILD_SETTID = 0x1000000 + CLONE_NEWCGROUP = 0x2000000 CLONE_NEWUTS = 0x4000000 CLONE_NEWIPC = 0x8000000 CLONE_NEWUSER = 0x10000000 CLONE_NEWPID = 0x20000000 CLONE_NEWNET = 0x40000000 CLONE_IO = 0x80000000 + + // Only passable via clone3(2). + CLONE_CLEAR_SIGHAND = 0x100000000 + CLONE_INTO_CGROUP = 0x200000000 ) + +// CloneArgs is struct clone_args, from include/uapi/linux/sched.h. 
+type CloneArgs struct { + Flags uint64 + Pidfd uint64 + ChildTID uint64 + ParentTID uint64 + ExitSignal uint64 + Stack uint64 + StackSize uint64 + TLS uint64 + SetTID uint64 + SetTIDSize uint64 + Cgroup uint64 +} diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go index 7e74b1143..7d5b9fdfb 100644 --- a/pkg/abi/linux/epoll_amd64.go +++ b/pkg/abi/linux/epoll_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go index a35939cc9..5e5960d32 100644 --- a/pkg/abi/linux/epoll_arm64.go +++ b/pkg/abi/linux/epoll_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go index 6b72364ea..ab404b17e 100644 --- a/pkg/abi/linux/file_amd64.go +++ b/pkg/abi/linux/file_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go index 6492c9038..6234955ab 100644 --- a/pkg/abi/linux/file_arm64.go +++ b/pkg/abi/linux/file_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/msgqueue.go b/pkg/abi/linux/msgqueue.go new file mode 100644 index 000000000..e1e8d0357 --- /dev/null +++ b/pkg/abi/linux/msgqueue.go @@ -0,0 +1,108 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/marshal/primitive" +) + +// Linux-specific control commands. Source: include/uapi/linux/msg.h +const ( + MSG_STAT = 11 + MSG_INFO = 12 + MSG_STAT_ANY = 13 +) + +// msgrcv(2) options. Source: include/uapi/linux/msg.h +const ( + MSG_NOERROR = 010000 // No error if message is too big. + MSG_EXCEPT = 020000 // Receive any message except of specified type. + MSG_COPY = 040000 // Copy (not remove) all queue messages. +) + +// System-wide limits for message queues. Source: include/uapi/linux/msg.h +const ( + MSGMNI = 32000 // Maximum number of message queue identifiers. + MSGMAX = 8192 // Maximum size of message (bytes). + MSGMNB = 16384 // Default max size of a message queue. +) + +// System-wide limits. Unused. Source: include/uapi/linux/msg.h +const ( + MSGPOOL = (MSGMNI * MSGMNB / 1024) + MSGTQL = MSGMNB + MSGMAP = MSGMNB + MSGSSZ = 16 + + // MSGSEG is simplified due to the inexistance of a ternary operator. + MSGSEG = (MSGPOOL * 1024) / MSGSSZ +) + +// MsqidDS is equivelant to struct msqid64_ds. Source: +// include/uapi/asm-generic/shmbuf.h +// +// +marshal +type MsqidDS struct { + MsgPerm IPCPerm // IPC permissions. + MsgStime TimeT // Last msgsnd time. + MsgRtime TimeT // Last msgrcv time. + MsgCtime TimeT // Last change time. + MsgCbytes uint64 // Current number of bytes on the queue. + MsgQnum uint64 // Number of messages in the queue. + MsgQbytes uint64 // Max number of bytes in the queue. + MsgLspid int32 // PID of last msgsnd. + MsgLrpid int32 // PID of last msgrcv. 
+ unused4 uint64 + unused5 uint64 +} + +// MsgBuf is equivelant to struct msgbuf. Source: include/uapi/linux/msg.h +// +// +marshal dynamic +type MsgBuf struct { + Type primitive.Int64 + Text primitive.ByteSlice +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (b *MsgBuf) SizeBytes() int { + return b.Type.SizeBytes() + b.Text.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (b *MsgBuf) MarshalBytes(dst []byte) { + b.Type.MarshalUnsafe(dst) + b.Text.MarshalBytes(dst[b.Type.SizeBytes():]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (b *MsgBuf) UnmarshalBytes(src []byte) { + b.Type.UnmarshalUnsafe(src) + b.Text.UnmarshalBytes(src[b.Type.SizeBytes():]) +} + +// MsgInfo is equivelant to struct msginfo. Source: include/uapi/linux/msg.h +// +// +marshal +type MsgInfo struct { + MsgPool int32 + MsgMap int32 + MsgMax int32 + MsgMnb int32 + MsgMni int32 + MsgSsz int32 + MsgTql int32 + MsgSeg uint16 `marshal:"unaligned"` +} diff --git a/pkg/abi/linux/ptrace_amd64.go b/pkg/abi/linux/ptrace_amd64.go index e722971f1..e970b5b4a 100644 --- a/pkg/abi/linux/ptrace_amd64.go +++ b/pkg/abi/linux/ptrace_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/ptrace_arm64.go b/pkg/abi/linux/ptrace_arm64.go index 3d0906565..91e5af56b 100644 --- a/pkg/abi/linux/ptrace_arm64.go +++ b/pkg/abi/linux/ptrace_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/sem_amd64.go b/pkg/abi/linux/sem_amd64.go index ab980cb4f..cabd2d4b8 100644 --- a/pkg/abi/linux/sem_amd64.go +++ b/pkg/abi/linux/sem_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/sem_arm64.go b/pkg/abi/linux/sem_arm64.go index 521468fb1..a0c467dc4 100644 --- a/pkg/abi/linux/sem_arm64.go +++ b/pkg/abi/linux/sem_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/wait.go b/pkg/abi/linux/wait.go index 4bdc280d1..710729138 100644 --- a/pkg/abi/linux/wait.go +++ b/pkg/abi/linux/wait.go @@ -14,6 +14,10 @@ package linux +import ( + "fmt" +) + // Options for waitpid(2), wait4(2), and/or waitid(2), from // include/uapi/linux/wait.h. const ( @@ -34,3 +38,124 @@ const ( P_PID = 0x1 P_PGID = 0x2 ) + +// WaitStatus represents a thread status, as returned by the wait* family of +// syscalls. +type WaitStatus uint32 + +// WaitStatusExit returns a WaitStatus representing the given exit status. +func WaitStatusExit(status int32) WaitStatus { + return WaitStatus(uint32(status) << 8) +} + +// WaitStatusTerminationSignal returns a WaitStatus representing termination by +// the given signal. +func WaitStatusTerminationSignal(sig Signal) WaitStatus { + return WaitStatus(uint32(sig)) +} + +// WaitStatusStopped returns a WaitStatus representing stoppage by the given +// signal or ptrace trap code. +func WaitStatusStopped(code uint32) WaitStatus { + return WaitStatus(code<<8 | 0x7f) +} + +// WaitStatusContinued returns a WaitStatus representing continuation by +// SIGCONT. 
+func WaitStatusContinued() WaitStatus { + return WaitStatus(0xffff) +} + +// WithCoreDump returns a copy of ws that indicates that a core dump was +// generated. +// +// Preconditions: ws.Signaled(). +func (ws WaitStatus) WithCoreDump() WaitStatus { + return ws | 0x80 +} + +// Exited returns true if ws represents an exit status, consistent with +// WIFEXITED. +func (ws WaitStatus) Exited() bool { + return ws&0x7f == 0 +} + +// Signaled returns true if ws represents a termination by signal, consistent +// with WIFSIGNALED. +func (ws WaitStatus) Signaled() bool { + // ws&0x7f != 0 (exited) and ws&0x7f != 0x7f (stopped or continued) + return ((ws&0x7f)+1)>>1 != 0 +} + +// CoreDumped returns true if ws indicates that a core dump was produced, +// consistent with WCOREDUMP. +// +// Preconditions: ws.Signaled(). +func (ws WaitStatus) CoreDumped() bool { + return ws&0x80 != 0 +} + +// Stopped returns true if ws represents a stoppage, consistent with +// WIFSTOPPED. +func (ws WaitStatus) Stopped() bool { + return ws&0xff == 0x7f +} + +// Continued returns true if ws represents a continuation by SIGCONT, +// consistent with WIFCONTINUED. +func (ws WaitStatus) Continued() bool { + return ws == 0xffff +} + +// ExitStatus returns the lower 8 bits of the exit status represented by ws, +// consistent with WEXITSTATUS. +// +// Preconditions: ws.Exited(). +func (ws WaitStatus) ExitStatus() uint32 { + return uint32((ws & 0xff00) >> 8) +} + +// TerminationSignal returns the termination signal represented by ws, +// consistent with WTERMSIG. +// +// Preconditions: ws.Signaled(). +func (ws WaitStatus) TerminationSignal() Signal { + return Signal(ws & 0x7f) +} + +// StopSignal returns the stop signal represented by ws, consistent with +// WSTOPSIG. +// +// Preconditions: ws.Stopped(). +func (ws WaitStatus) StopSignal() Signal { + return Signal((ws & 0xff00) >> 8) +} + +// PtraceEvent returns the PTRACE_EVENT_* field in ws. +// +// Preconditions: ws.Stopped(). 
+func (ws WaitStatus) PtraceEvent() uint32 { + return uint32(ws >> 16) +} + +// String implements fmt.Stringer.String. +func (ws WaitStatus) String() string { + switch { + case ws.Exited(): + return fmt.Sprintf("exit status %d", ws.ExitStatus()) + case ws.Signaled(): + if ws.CoreDumped() { + return fmt.Sprintf("killed by signal %d (core dumped)", ws.TerminationSignal()) + } + return fmt.Sprintf("killed by signal %d", ws.TerminationSignal()) + case ws.Stopped(): + if ev := ws.PtraceEvent(); ev != 0 { + return fmt.Sprintf("stopped by signal %d (PTRACE_EVENT %d)", ws.StopSignal(), ev) + } + return fmt.Sprintf("stopped by signal %d", ws.StopSignal()) + case ws.Continued(): + return "continued" + default: + return fmt.Sprintf("unknown status %#x", uint32(ws)) + } +} diff --git a/pkg/atomicbitops/aligned_32bit_unsafe.go b/pkg/atomicbitops/aligned_32bit_unsafe.go index df706b453..383f81ff2 100644 --- a/pkg/atomicbitops/aligned_32bit_unsafe.go +++ b/pkg/atomicbitops/aligned_32bit_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm || mips || 386 // +build arm mips 386 package atomicbitops diff --git a/pkg/atomicbitops/aligned_64bit.go b/pkg/atomicbitops/aligned_64bit.go index 1544c7814..2c421d920 100644 --- a/pkg/atomicbitops/aligned_64bit.go +++ b/pkg/atomicbitops/aligned_64bit.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !arm && !mips && !386 // +build !arm,!mips,!386 package atomicbitops diff --git a/pkg/atomicbitops/atomicbitops.go b/pkg/atomicbitops/atomicbitops.go index 1be081719..4c4606a58 100644 --- a/pkg/atomicbitops/atomicbitops.go +++ b/pkg/atomicbitops/atomicbitops.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 || arm64 // +build amd64 arm64 // Package atomicbitops provides extensions to the sync/atomic package. diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go index 3b2898256..474c0c815 100644 --- a/pkg/atomicbitops/atomicbitops_noasm.go +++ b/pkg/atomicbitops/atomicbitops_noasm.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !amd64 && !arm64 // +build !amd64,!arm64 package atomicbitops diff --git a/pkg/bitmap/bitmap.go b/pkg/bitmap/bitmap.go index 803b7b3c7..12d2fc2b8 100644 --- a/pkg/bitmap/bitmap.go +++ b/pkg/bitmap/bitmap.go @@ -32,8 +32,8 @@ type Bitmap struct { bitBlock []uint64 } -// BitmapWithSize create a new empty Bitmap. -func BitmapWithSize(size uint32) Bitmap { +// New create a new empty Bitmap. +func New(size uint32) Bitmap { b := Bitmap{} bSize := (size + 63) / 64 b.bitBlock = make([]uint64, bSize) diff --git a/pkg/bitmap/bitmap_test.go b/pkg/bitmap/bitmap_test.go index 37f068438..76ebd779f 100644 --- a/pkg/bitmap/bitmap_test.go +++ b/pkg/bitmap/bitmap_test.go @@ -42,7 +42,7 @@ func generateFilledSlice(min, max, length int) []uint32 { // generateFilledBitmap generates a Bitmap filled with fillNum of numbers, // and returns the slice and bitmap. 
func generateFilledBitmap(min, max, fillNum int) ([]uint32, Bitmap) { - bitmap := BitmapWithSize(uint32(max)) + bitmap := New(uint32(max)) randSlice := generateFilledSlice(min, max, fillNum) for i := 0; i < fillNum; i++ { bitmap.Add(randSlice[i]) @@ -64,8 +64,8 @@ func TestNewBitmap(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { - if bitmap := BitmapWithSize(uint32(tt.size)); len(bitmap.bitBlock) != tt.expectSize { - t.Errorf("BitmapWithSize created bitmap with %v, bitBlock size: %d, wanted: %d", tt.name, len(bitmap.bitBlock), tt.expectSize) + if bitmap := New(uint32(tt.size)); len(bitmap.bitBlock) != tt.expectSize { + t.Errorf("New created bitmap with %v, bitBlock size: %d, wanted: %d", tt.name, len(bitmap.bitBlock), tt.expectSize) } }) } @@ -87,7 +87,7 @@ func TestAdd(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { - bitmap := BitmapWithSize(uint32(tt.bitmapSize)) + bitmap := New(uint32(tt.bitmapSize)) bitmap.Add(uint32(tt.addNum)) bitmapSlice := bitmap.ToSlice() if bitmapSlice[0] != uint32(tt.addNum) { @@ -98,7 +98,7 @@ func TestAdd(t *testing.T) { } func TestRemove(t *testing.T) { - bitmap := BitmapWithSize(uint32(1024)) + bitmap := New(uint32(1024)) firstSlice := generateFilledSlice(0, 511, 50) secondSlice := generateFilledSlice(512, 1024, 50) for i := 0; i < 50; i++ { @@ -176,7 +176,7 @@ func TestClearRange(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { - bitmap := BitmapWithSize(uint32(tt.bitmapSize)) + bitmap := New(uint32(tt.bitmapSize)) bitmap.FlipRange(uint32(0), uint32(tt.bitmapSize)) bitmap.ClearRange(uint32(tt.clearRangeMin), uint32(tt.clearRangeMax+1)) clearedBitmapSlice := bitmap.ToSlice() @@ -295,7 +295,7 @@ func TestBitmapNumOnes(t *testing.T) { } func TestFirstZero(t *testing.T) { - bitmap := BitmapWithSize(uint32(1000)) + bitmap := New(uint32(1000)) bitmap.FlipRange(200, 400) for i, j := range map[uint32]uint32{0: 0, 201: 
400, 200: 400, 199: 199, 400: 400, 10000: math.MaxInt32} { v := bitmap.FirstZero(i) diff --git a/pkg/bits/uint64_arch.go b/pkg/bits/uint64_arch.go index 9f23eff77..fc5634167 100644 --- a/pkg/bits/uint64_arch.go +++ b/pkg/bits/uint64_arch.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || arm64 // +build amd64 arm64 package bits diff --git a/pkg/bits/uint64_arch_amd64_asm.s b/pkg/bits/uint64_arch_amd64_asm.s index 8ff364181..2931b5d56 100644 --- a/pkg/bits/uint64_arch_amd64_asm.s +++ b/pkg/bits/uint64_arch_amd64_asm.s @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 TEXT ·TrailingZeros64(SB),$0-16 diff --git a/pkg/bits/uint64_arch_arm64_asm.s b/pkg/bits/uint64_arch_arm64_asm.s index 814ba562d..eb8d4d280 100644 --- a/pkg/bits/uint64_arch_arm64_asm.s +++ b/pkg/bits/uint64_arch_arm64_asm.s @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 TEXT ·TrailingZeros64(SB),$0-16 diff --git a/pkg/bits/uint64_arch_generic.go b/pkg/bits/uint64_arch_generic.go index 9dd2098d1..83b23a3fc 100644 --- a/pkg/bits/uint64_arch_generic.go +++ b/pkg/bits/uint64_arch_generic.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !amd64 && !arm64 // +build !amd64,!arm64 package bits diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go index b33a20802..0fabee92b 100644 --- a/pkg/coverage/coverage.go +++ b/pkg/coverage/coverage.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build go1.1 +// +build go1.1 + // Package coverage provides an interface through which Go coverage data can // be collected, converted to kcov format, and exposed to userspace. // diff --git a/pkg/cpuid/cpuid_arm64.go b/pkg/cpuid/cpuid_arm64.go index 98c6ec62f..6e61d562f 100644 --- a/pkg/cpuid/cpuid_arm64.go +++ b/pkg/cpuid/cpuid_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package cpuid diff --git a/pkg/cpuid/cpuid_arm64_test.go b/pkg/cpuid/cpuid_arm64_test.go index a34f67779..16b1c064a 100644 --- a/pkg/cpuid/cpuid_arm64_test.go +++ b/pkg/cpuid/cpuid_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package cpuid diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go index d60fdb550..36dd20552 100644 --- a/pkg/cpuid/cpuid_parse_x86_test.go +++ b/pkg/cpuid/cpuid_parse_x86_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package cpuid diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go index 392711e8f..dc17cade8 100644 --- a/pkg/cpuid/cpuid_x86.go +++ b/pkg/cpuid/cpuid_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package cpuid diff --git a/pkg/cpuid/cpuid_x86_test.go b/pkg/cpuid/cpuid_x86_test.go index bacf345c8..92a2d9f81 100644 --- a/pkg/cpuid/cpuid_x86_test.go +++ b/pkg/cpuid/cpuid_x86_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build 386 || amd64 // +build 386 amd64 package cpuid diff --git a/pkg/crypto/crypto_stdlib.go b/pkg/crypto/crypto_stdlib.go index 514592b08..69e867386 100644 --- a/pkg/crypto/crypto_stdlib.go +++ b/pkg/crypto/crypto_stdlib.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package crypto import ( diff --git a/pkg/errors/linuxerr/linuxerr.go b/pkg/errors/linuxerr/linuxerr.go index 9246f2e89..f9f8412e0 100644 --- a/pkg/errors/linuxerr/linuxerr.go +++ b/pkg/errors/linuxerr/linuxerr.go @@ -166,6 +166,8 @@ var ( EWOULDBLOCK = EAGAIN EDEADLOCK = EDEADLK ENONET = ENOENT + ENOATTR = ENODATA + ENOTSUP = EOPNOTSUPP ) // A nil *errors.Error denotes no error and is placed at the 0 index of diff --git a/pkg/errors/linuxerr/linuxerr_test.go b/pkg/errors/linuxerr/linuxerr_test.go index 62743c338..f09d61b02 100644 --- a/pkg/errors/linuxerr/linuxerr_test.go +++ b/pkg/errors/linuxerr/linuxerr_test.go @@ -44,7 +44,7 @@ func BenchmarkAssignLinuxerr(b *testing.B) { func BenchmarkAssignSyserror(b *testing.B) { for i := b.N; i > 0; i-- { - globalError = syserror.EINVAL + globalError = linuxerr.ENOMSG } } @@ -69,10 +69,10 @@ func BenchmarkCompareLinuxerr(b *testing.B) { } func BenchmarkCompareSyserror(b *testing.B) { - globalError = syserror.EAGAIN + globalError = linuxerr.EAGAIN j := 0 for i := b.N; i > 0; i-- { - if globalError == syserror.EINVAL { + if globalError == linuxerr.EACCES { j++ } } @@ -109,15 +109,15 @@ func BenchmarkSwitchLinuxerr(b *testing.B) { } func BenchmarkSwitchSyserror(b *testing.B) { - globalError = syserror.EPERM + globalError = linuxerr.EPERM j := 0 for i := b.N; i > 0; i-- { switch globalError { - case syserror.EINVAL: + case linuxerr.EACCES: j++ case syserror.EINTR: j += 2 - case syserror.EAGAIN: + case linuxerr.EAGAIN: j += 3 } } @@ -265,7 +265,7 @@ func TestEqualsMethod(t *testing.T) { { name: "linuxerr nil error not", linuxErr: 
[]*gErrors.Error{nil, linuxerr.NOERROR}, - err: []error{unix.Errno(1), linuxerr.EPERM, syserror.EACCES}, + err: []error{unix.Errno(1), linuxerr.EPERM, linuxerr.EACCES}, equal: false, }, { @@ -277,13 +277,13 @@ func TestEqualsMethod(t *testing.T) { { name: "equal errors", linuxErr: []*gErrors.Error{linuxerr.ESRCH}, - err: []error{linuxerr.ESRCH, syserror.ESRCH, unix.Errno(linuxerr.ESRCH.Errno())}, + err: []error{linuxerr.ESRCH, linuxerr.ESRCH, unix.Errno(linuxerr.ESRCH.Errno())}, equal: true, }, { name: "unequal errors", linuxErr: []*gErrors.Error{linuxerr.ENOENT}, - err: []error{linuxerr.ESRCH, syserror.ESRCH, unix.Errno(linuxerr.ESRCH.Errno())}, + err: []error{linuxerr.ESRCH, linuxerr.ESRCH, unix.Errno(linuxerr.ESRCH.Errno())}, equal: false, }, { diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index a264ae2f0..ad15d3672 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -12,13 +12,13 @@ go_library( visibility = ["//:sandbox"], deps = [ ":eventchannel_go_proto", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/sync", "//pkg/unet", "@org_golang_google_protobuf//encoding/prototext:go_default_library", "@org_golang_google_protobuf//proto:go_default_library", "@org_golang_google_protobuf//types/known/anypb:go_default_library", - "@org_golang_x_sys//unix:go_default_library", "@org_golang_x_time//rate:go_default_library", ], ) diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index 98dfeb1f5..2be2d9d37 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -23,9 +23,9 @@ import ( "encoding/binary" "fmt" - "golang.org/x/sys/unix" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/proto" + "gvisor.dev/gvisor/pkg/errors/linuxerr" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" @@ -155,7 +155,7 @@ func (s *socketEmitter) Emit(msg proto.Message) (bool, error) { for done := 0; done < len(p); { n, err := 
s.socket.Write(p[done:]) if err != nil { - return (err == unix.EPIPE), err + return linuxerr.Equals(linuxerr.EPIPE, err), err } done += n } diff --git a/pkg/eventchannel/event_any.go b/pkg/eventchannel/event_any.go index a5549f6cd..13f300061 100644 --- a/pkg/eventchannel/event_any.go +++ b/pkg/eventchannel/event_any.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package eventchannel import ( diff --git a/pkg/fdchannel/fdchannel_unsafe.go b/pkg/fdchannel/fdchannel_unsafe.go index 1f24a448d..f9a201eeb 100644 --- a/pkg/fdchannel/fdchannel_unsafe.go +++ b/pkg/fdchannel/fdchannel_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris // +build aix darwin dragonfly freebsd linux netbsd openbsd solaris // Package fdchannel implements passing file descriptors between processes over diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index 1290d5d10..152557143 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package fdnotifier contains an adapter that translates IO events (e.g., a diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index 493ea8375..db917303f 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux // +build linux package fdnotifier diff --git a/pkg/flipcall/ctrl_futex.go b/pkg/flipcall/ctrl_futex.go index 2e8452a02..5d2ee4018 100644 --- a/pkg/flipcall/ctrl_futex.go +++ b/pkg/flipcall/ctrl_futex.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package flipcall import ( diff --git a/pkg/flipcall/futex_linux.go b/pkg/flipcall/futex_linux.go index c212f05f1..4bb85939b 100644 --- a/pkg/flipcall/futex_linux.go +++ b/pkg/flipcall/futex_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package flipcall diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go index 374aac2b4..bd8ceba19 100644 --- a/pkg/gohacks/gohacks_unsafe.go +++ b/pkg/gohacks/gohacks_unsafe.go @@ -12,10 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.13 -// +build !go1.18 +//go:build go1.13 && !go1.18 +// +build go1.13,!go1.18 -// Check type signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. + +// Check type signatures and Noescape when updating Go version. +// +// TODO(b/165820485): add these checks to checklinkname. // Package gohacks contains utilities for subverting the Go compiler. package gohacks diff --git a/pkg/goid/goid.go b/pkg/goid/goid.go index 193b2c2d4..85fb2f6d4 100644 --- a/pkg/goid/goid.go +++ b/pkg/goid/goid.go @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// +build go1.12 -// +build !go1.18 +//go:build go1.12 && !go1.18 +// +build go1.12,!go1.18 // Check type signatures when updating Go version. diff --git a/pkg/hostarch/hostarch_arm64.go b/pkg/hostarch/hostarch_arm64.go index a31a8aeeb..a65c810a5 100644 --- a/pkg/hostarch/hostarch_arm64.go +++ b/pkg/hostarch/hostarch_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package hostarch diff --git a/pkg/hostarch/hostarch_x86.go b/pkg/hostarch/hostarch_x86.go index af6ef2b7f..00bf668f3 100644 --- a/pkg/hostarch/hostarch_x86.go +++ b/pkg/hostarch/hostarch_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package hostarch diff --git a/pkg/memutil/memfd_linux_unsafe.go b/pkg/memutil/memfd_linux_unsafe.go index 504382213..2179c92f3 100644 --- a/pkg/memutil/memfd_linux_unsafe.go +++ b/pkg/memutil/memfd_linux_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package memutil diff --git a/pkg/memutil/mmap.go b/pkg/memutil/mmap.go index 7c939293f..7a55d1b28 100644 --- a/pkg/memutil/mmap.go +++ b/pkg/memutil/mmap.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build go1.1 +// +build go1.1 + package memutil import ( diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go index ac7868ad9..0b961d3d9 100644 --- a/pkg/merkletree/merkletree.go +++ b/pkg/merkletree/merkletree.go @@ -151,21 +151,21 @@ type VerityDescriptor struct { Mode uint32 UID uint32 GID uint32 - Children map[string]struct{} + Children []string SymlinkTarget string RootHash []byte } -func (d *VerityDescriptor) String() string { +func (d *VerityDescriptor) encode() []byte { b := new(bytes.Buffer) e := gob.NewEncoder(b) - e.Encode(d.Children) - return fmt.Sprintf("Name: %s, Size: %d, Mode: %d, UID: %d, GID: %d, Children: %v, Symlink: %s, RootHash: %v", d.Name, d.FileSize, d.Mode, d.UID, d.GID, b.Bytes(), d.SymlinkTarget, d.RootHash) + e.Encode(d) + return b.Bytes() } // verify generates a hash from d, and compares it with expected. func (d *VerityDescriptor) verify(expected []byte, hashAlgorithms int) error { - h, err := hashData([]byte(d.String()), hashAlgorithms) + h, err := hashData(d.encode(), hashAlgorithms) if err != nil { return err } @@ -210,7 +210,7 @@ type GenerateParams struct { GID uint32 // Children is a map of children names for a directory. It should be // empty for a regular file. - Children map[string]struct{} + Children []string // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink. SymlinkTarget string // HashAlgorithms is the algorithms used to hash data. @@ -242,7 +242,7 @@ func Generate(params *GenerateParams) ([]byte, error) { // If file is a symlink do not generate root hash for file content. 
if params.SymlinkTarget != "" { - return hashData([]byte(descriptor.String()), params.HashAlgorithms) + return hashData(descriptor.encode(), params.HashAlgorithms) } layout, err := InitLayout(params.Size, params.HashAlgorithms, params.DataAndTreeInSameFile) @@ -315,7 +315,7 @@ func Generate(params *GenerateParams) ([]byte, error) { numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock() } descriptor.RootHash = root - return hashData([]byte(descriptor.String()), params.HashAlgorithms) + return hashData(descriptor.encode(), params.HashAlgorithms) } // VerifyParams contains the params used to verify a portion of a file against @@ -339,7 +339,7 @@ type VerifyParams struct { GID uint32 // Children is a map of children names for a directory. It should be // empty for a regular file. - Children map[string]struct{} + Children []string // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink. SymlinkTarget string // HashAlgorithms is the algorithms used to hash data. 
diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go index 5d6f8df1b..1447fd139 100644 --- a/pkg/merkletree/merkletree_test.go +++ b/pkg/merkletree/merkletree_test.go @@ -206,112 +206,112 @@ func TestGenerate(t *testing.T) { data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{9, 115, 238, 230, 38, 140, 195, 70, 207, 144, 202, 118, 23, 113, 32, 129, 226, 239, 177, 69, 161, 26, 14, 113, 16, 37, 30, 96, 19, 148, 132, 27}, + expectedHash: []byte{78, 38, 225, 107, 61, 246, 26, 6, 71, 163, 254, 97, 112, 200, 87, 232, 190, 87, 231, 160, 119, 124, 61, 229, 49, 126, 90, 223, 134, 51, 77, 182}, }, { name: "OnePageZeroesSHA256SameFile", data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{9, 115, 238, 230, 38, 140, 195, 70, 207, 144, 202, 118, 23, 113, 32, 129, 226, 239, 177, 69, 161, 26, 14, 113, 16, 37, 30, 96, 19, 148, 132, 27}, + expectedHash: []byte{78, 38, 225, 107, 61, 246, 26, 6, 71, 163, 254, 97, 112, 200, 87, 232, 190, 87, 231, 160, 119, 124, 61, 229, 49, 126, 90, 223, 134, 51, 77, 182}, }, { name: "OnePageZeroesSHA512SeparateFile", data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{127, 8, 95, 11, 83, 101, 51, 39, 170, 235, 39, 43, 135, 243, 145, 118, 148, 58, 27, 155, 182, 205, 44, 47, 5, 223, 215, 17, 35, 16, 43, 104, 43, 11, 8, 88, 171, 7, 249, 243, 14, 62, 126, 218, 23, 159, 237, 237, 42, 226, 39, 25, 87, 48, 253, 191, 116, 213, 37, 3, 187, 152, 154, 14}, + expectedHash: []byte{221, 45, 182, 132, 61, 212, 227, 145, 150, 131, 98, 221, 195, 5, 89, 21, 188, 36, 250, 101, 85, 78, 197, 253, 193, 23, 74, 219, 28, 108, 77, 47, 65, 79, 123, 144, 50, 245, 109, 72, 71, 80, 24, 77, 158, 95, 242, 185, 109, 163, 105, 183, 67, 106, 55, 194, 223, 46, 
12, 242, 165, 203, 172, 254}, }, { name: "OnePageZeroesSHA512SameFile", data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{127, 8, 95, 11, 83, 101, 51, 39, 170, 235, 39, 43, 135, 243, 145, 118, 148, 58, 27, 155, 182, 205, 44, 47, 5, 223, 215, 17, 35, 16, 43, 104, 43, 11, 8, 88, 171, 7, 249, 243, 14, 62, 126, 218, 23, 159, 237, 237, 42, 226, 39, 25, 87, 48, 253, 191, 116, 213, 37, 3, 187, 152, 154, 14}, + expectedHash: []byte{221, 45, 182, 132, 61, 212, 227, 145, 150, 131, 98, 221, 195, 5, 89, 21, 188, 36, 250, 101, 85, 78, 197, 253, 193, 23, 74, 219, 28, 108, 77, 47, 65, 79, 123, 144, 50, 245, 109, 72, 71, 80, 24, 77, 158, 95, 242, 185, 109, 163, 105, 183, 67, 106, 55, 194, 223, 46, 12, 242, 165, 203, 172, 254}, }, { name: "MultiplePageZeroesSHA256SeparateFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{247, 158, 42, 215, 180, 106, 0, 28, 77, 64, 132, 162, 74, 65, 250, 161, 243, 66, 129, 44, 197, 8, 145, 14, 94, 206, 156, 184, 145, 145, 20, 185}, + expectedHash: []byte{131, 122, 73, 143, 4, 202, 193, 156, 218, 169, 196, 223, 70, 100, 117, 191, 241, 113, 134, 11, 229, 231, 105, 157, 156, 0, 66, 213, 122, 145, 174, 8}, }, { name: "MultiplePageZeroesSHA256SameFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{247, 158, 42, 215, 180, 106, 0, 28, 77, 64, 132, 162, 74, 65, 250, 161, 243, 66, 129, 44, 197, 8, 145, 14, 94, 206, 156, 184, 145, 145, 20, 185}, + expectedHash: []byte{131, 122, 73, 143, 4, 202, 193, 156, 218, 169, 196, 223, 70, 100, 117, 191, 241, 113, 134, 11, 229, 231, 105, 157, 156, 0, 66, 213, 122, 145, 174, 8}, }, { name: "MultiplePageZeroesSHA512SeparateFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: 
linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{100, 121, 14, 30, 104, 200, 142, 182, 190, 78, 23, 68, 157, 174, 23, 75, 174, 250, 250, 25, 66, 45, 235, 103, 129, 49, 78, 127, 173, 154, 121, 35, 37, 115, 60, 217, 26, 205, 253, 253, 236, 145, 107, 109, 232, 19, 72, 92, 4, 191, 181, 205, 191, 57, 234, 177, 144, 235, 143, 30, 15, 197, 109, 81}, + expectedHash: []byte{211, 48, 232, 110, 240, 51, 99, 241, 123, 138, 42, 76, 94, 86, 59, 200, 3, 246, 137, 148, 189, 226, 111, 103, 146, 29, 12, 218, 40, 182, 33, 99, 193, 163, 238, 26, 184, 13, 165, 187, 68, 173, 139, 9, 208, 59, 0, 192, 180, 50, 221, 35, 43, 119, 194, 16, 64, 84, 116, 63, 158, 195, 194, 226}, }, { name: "MultiplePageZeroesSHA512SameFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{100, 121, 14, 30, 104, 200, 142, 182, 190, 78, 23, 68, 157, 174, 23, 75, 174, 250, 250, 25, 66, 45, 235, 103, 129, 49, 78, 127, 173, 154, 121, 35, 37, 115, 60, 217, 26, 205, 253, 253, 236, 145, 107, 109, 232, 19, 72, 92, 4, 191, 181, 205, 191, 57, 234, 177, 144, 235, 143, 30, 15, 197, 109, 81}, + expectedHash: []byte{211, 48, 232, 110, 240, 51, 99, 241, 123, 138, 42, 76, 94, 86, 59, 200, 3, 246, 137, 148, 189, 226, 111, 103, 146, 29, 12, 218, 40, 182, 33, 99, 193, 163, 238, 26, 184, 13, 165, 187, 68, 173, 139, 9, 208, 59, 0, 192, 180, 50, 221, 35, 43, 119, 194, 16, 64, 84, 116, 63, 158, 195, 194, 226}, }, { name: "SingleASHA256SeparateFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{90, 124, 194, 100, 206, 242, 75, 152, 47, 249, 16, 27, 136, 161, 223, 228, 121, 241, 126, 158, 126, 122, 100, 120, 117, 15, 81, 78, 201, 133, 119, 111}, + expectedHash: []byte{26, 47, 238, 138, 235, 244, 140, 231, 129, 240, 155, 252, 219, 44, 46, 72, 57, 249, 139, 88, 132, 238, 86, 108, 181, 115, 96, 72, 99, 210, 134, 47}, }, 
{ name: "SingleASHA256SameFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{90, 124, 194, 100, 206, 242, 75, 152, 47, 249, 16, 27, 136, 161, 223, 228, 121, 241, 126, 158, 126, 122, 100, 120, 117, 15, 81, 78, 201, 133, 119, 111}, + expectedHash: []byte{26, 47, 238, 138, 235, 244, 140, 231, 129, 240, 155, 252, 219, 44, 46, 72, 57, 249, 139, 88, 132, 238, 86, 108, 181, 115, 96, 72, 99, 210, 134, 47}, }, { name: "SingleASHA512SeparateFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{24, 10, 13, 25, 113, 62, 169, 99, 151, 70, 166, 113, 81, 81, 163, 85, 5, 25, 29, 15, 46, 37, 104, 120, 142, 218, 52, 178, 187, 83, 30, 166, 101, 87, 70, 196, 188, 61, 123, 20, 13, 254, 126, 52, 212, 111, 75, 203, 33, 233, 233, 47, 181, 161, 43, 193, 131, 41, 99, 33, 164, 73, 89, 152}, + expectedHash: []byte{44, 30, 224, 12, 102, 119, 163, 171, 119, 175, 212, 121, 231, 188, 125, 171, 79, 28, 144, 234, 75, 122, 44, 75, 15, 101, 173, 92, 233, 109, 234, 60, 173, 148, 125, 85, 94, 234, 95, 91, 16, 196, 88, 175, 23, 129, 226, 110, 24, 238, 5, 49, 186, 128, 72, 188, 193, 180, 207, 193, 203, 119, 40, 191}, }, { name: "SingleASHA512SameFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{24, 10, 13, 25, 113, 62, 169, 99, 151, 70, 166, 113, 81, 81, 163, 85, 5, 25, 29, 15, 46, 37, 104, 120, 142, 218, 52, 178, 187, 83, 30, 166, 101, 87, 70, 196, 188, 61, 123, 20, 13, 254, 126, 52, 212, 111, 75, 203, 33, 233, 233, 47, 181, 161, 43, 193, 131, 41, 99, 33, 164, 73, 89, 152}, + expectedHash: []byte{44, 30, 224, 12, 102, 119, 163, 171, 119, 175, 212, 121, 231, 188, 125, 171, 79, 28, 144, 234, 75, 122, 44, 75, 15, 101, 173, 92, 233, 109, 234, 60, 173, 148, 125, 85, 94, 234, 95, 91, 16, 196, 88, 175, 23, 129, 226, 110, 24, 238, 5, 49, 186, 128, 72, 188, 193, 180, 207, 193, 203, 119, 
40, 191}, }, { name: "OnePageASHA256SeparateFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{132, 54, 112, 142, 156, 19, 50, 140, 138, 240, 192, 154, 100, 120, 242, 69, 64, 217, 62, 166, 127, 88, 23, 197, 100, 66, 255, 215, 214, 229, 54, 1}, + expectedHash: []byte{166, 254, 83, 46, 241, 111, 18, 47, 79, 6, 181, 197, 176, 143, 211, 204, 53, 5, 245, 134, 172, 95, 97, 131, 236, 132, 197, 138, 123, 78, 43, 13}, }, { name: "OnePageASHA256SameFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{132, 54, 112, 142, 156, 19, 50, 140, 138, 240, 192, 154, 100, 120, 242, 69, 64, 217, 62, 166, 127, 88, 23, 197, 100, 66, 255, 215, 214, 229, 54, 1}, + expectedHash: []byte{166, 254, 83, 46, 241, 111, 18, 47, 79, 6, 181, 197, 176, 143, 211, 204, 53, 5, 245, 134, 172, 95, 97, 131, 236, 132, 197, 138, 123, 78, 43, 13}, }, { name: "OnePageASHA512SeparateFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{165, 46, 176, 116, 47, 209, 101, 193, 64, 185, 30, 9, 52, 22, 24, 154, 135, 220, 232, 168, 215, 45, 222, 226, 207, 104, 160, 10, 156, 98, 245, 250, 76, 21, 68, 204, 65, 118, 69, 52, 210, 155, 36, 109, 233, 103, 1, 40, 218, 89, 125, 38, 247, 194, 2, 225, 119, 155, 65, 99, 182, 111, 110, 145}, + expectedHash: []byte{23, 69, 6, 79, 39, 232, 90, 246, 62, 55, 4, 229, 47, 36, 230, 24, 233, 47, 55, 36, 26, 139, 196, 78, 242, 12, 194, 77, 109, 81, 151, 188, 63, 201, 127, 235, 81, 214, 91, 200, 19, 232, 240, 14, 197, 1, 99, 224, 18, 213, 203, 242, 44, 102, 25, 62, 90, 189, 106, 107, 129, 61, 115, 39}, }, { name: "OnePageASHA512SameFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - 
expectedHash: []byte{165, 46, 176, 116, 47, 209, 101, 193, 64, 185, 30, 9, 52, 22, 24, 154, 135, 220, 232, 168, 215, 45, 222, 226, 207, 104, 160, 10, 156, 98, 245, 250, 76, 21, 68, 204, 65, 118, 69, 52, 210, 155, 36, 109, 233, 103, 1, 40, 218, 89, 125, 38, 247, 194, 2, 225, 119, 155, 65, 99, 182, 111, 110, 145}, + expectedHash: []byte{23, 69, 6, 79, 39, 232, 90, 246, 62, 55, 4, 229, 47, 36, 230, 24, 233, 47, 55, 36, 26, 139, 196, 78, 242, 12, 194, 77, 109, 81, 151, 188, 63, 201, 127, 235, 81, 214, 91, 200, 19, 232, 240, 14, 197, 1, 99, 224, 18, 213, 203, 242, 44, 102, 25, 62, 90, 189, 106, 107, 129, 61, 115, 39}, }, } @@ -324,7 +324,7 @@ func TestGenerate(t *testing.T) { Mode: defaultMode, UID: defaultUID, GID: defaultGID, - Children: make(map[string]struct{}), + Children: []string{}, HashAlgorithms: tc.hashAlgorithms, TreeReader: &tree, TreeWriter: &tree, @@ -366,7 +366,7 @@ func prepareVerify(t *testing.T, dataSize int64, hashAlgorithm int, dataAndTreeI Mode: defaultMode, UID: defaultUID, GID: defaultGID, - Children: make(map[string]struct{}), + Children: []string{}, HashAlgorithms: hashAlgorithm, TreeReader: &tree, TreeWriter: &tree, @@ -398,7 +398,7 @@ func prepareVerify(t *testing.T, dataSize int64, hashAlgorithm int, dataAndTreeI Mode: defaultMode, UID: defaultUID, GID: defaultGID, - Children: make(map[string]struct{}), + Children: []string{}, HashAlgorithms: hashAlgorithm, ReadOffset: verifyStart, ReadSize: verifySize, @@ -627,7 +627,7 @@ func TestVerifyModifiedChildren(t *testing.T) { t.Run(tc.name, func(t *testing.T) { var buf bytes.Buffer _, params := prepareVerify(t, hostarch.PageSize /* dataSize */, defaultHashAlgorithm, tc.dataAndTreeInSameFile, false /* isSymlink */, 0 /* verifyStart */, 0 /* verifySize */, &buf) - params.Children["abc"] = struct{}{} + params.Children = append(params.Children, "abc") if _, err := Verify(&params); errors.Is(err, nil) { t.Errorf("Verification succeeded when expected to fail") } diff --git a/pkg/metric/BUILD 
b/pkg/metric/BUILD index 0a6a5d215..c08792751 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -4,13 +4,16 @@ package(licenses = ["notice"]) go_library( name = "metric", - srcs = ["metric.go"], + srcs = [ + "metric.go", + ], visibility = ["//:sandbox"], deps = [ ":metric_go_proto", "//pkg/eventchannel", "//pkg/log", "//pkg/sync", + "@org_golang_google_protobuf//types/known/timestamppb", ], ) @@ -18,6 +21,9 @@ proto_library( name = "metric", srcs = ["metric.proto"], visibility = ["//:sandbox"], + deps = [ + "@com_google_protobuf//:timestamp_proto", + ], ) go_test( diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 4829ae7ce..ac38ec894 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -20,7 +20,9 @@ import ( "fmt" "sort" "sync/atomic" + "time" + "google.golang.org/protobuf/types/known/timestamppb" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" @@ -54,6 +56,27 @@ var ( }) ) +// InitStage is the name of a Sentry initialization stage. +type InitStage string + +// List of all Sentry initialization stages. +var ( + InitRestoreConfig InitStage = "restore_config" + InitExecConfig InitStage = "exec_config" + InitRestore InitStage = "restore" + InitCreateProcess InitStage = "create_process" + InitTaskStart InitStage = "task_start" + + // allStages is the list of allowed stages. + allStages = []InitStage{ + InitRestoreConfig, + InitExecConfig, + InitRestore, + InitCreateProcess, + InitTaskStart, + } +) + // Uint64Metric encapsulates a uint64 that represents some kind of metric to be // monitored. We currently support metrics with at most one field. 
// @@ -98,6 +121,10 @@ func Initialize() error { for _, v := range allMetrics.m { m.Metrics = append(m.Metrics, v.metadata) } + m.Stages = make([]string, 0, len(allStages)) + for _, s := range allStages { + m.Stages = append(m.Stages, string(s)) + } if err := eventchannel.Emit(&m); err != nil { return fmt.Errorf("unable to emit metric initialize event: %w", err) } @@ -287,34 +314,66 @@ func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...string) { } } -// metricSet holds named metrics. +// stageTiming contains timing data for an initialization stage. +type stageTiming struct { + stage InitStage + started time.Time + // ended is the zero time when the stage has not ended yet. + ended time.Time +} + +// inProgress returns whether this stage hasn't ended yet. +func (s stageTiming) inProgress() bool { + return !s.started.IsZero() && s.ended.IsZero() +} + +// metricSet holds metric data. type metricSet struct { + // Map of metrics. m map[string]customUint64Metric + + // mu protects the fields below. + mu sync.RWMutex + + // Information about the stages reached by the Sentry. Only appended to, so + // reading a shallow copy of the slice header concurrently is safe. + finished []stageTiming + + // The current stage in progress. + currentStage stageTiming } // makeMetricSet returns a new metricSet. func makeMetricSet() metricSet { return metricSet{ - m: make(map[string]customUint64Metric), + m: make(map[string]customUint64Metric), + finished: make([]stageTiming, 0, len(allStages)), } } // Values returns a snapshot of all values in m. 
func (m *metricSet) Values() metricValues { - vals := make(metricValues) + m.mu.Lock() + stages := m.finished[:] + m.mu.Unlock() + + vals := metricValues{ + m: make(map[string]interface{}, len(m.m)), + stages: stages, + } for k, v := range m.m { fields := v.metadata.GetFields() switch len(fields) { case 0: - vals[k] = v.value() + vals.m[k] = v.value() case 1: values := fields[0].GetAllowedValues() fieldsMap := make(map[string]uint64) for _, fieldValue := range values { fieldsMap[fieldValue] = v.value(fieldValue) } - vals[k] = fieldsMap + vals.m[k] = fieldsMap default: panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields))) } @@ -322,10 +381,16 @@ func (m *metricSet) Values() metricValues { return vals } -// metricValues contains a copy of the values of all metrics. It is a map -// with key as metric name and value can be either uint64 or map[string]uint64 -// to support metrics with one field. -type metricValues map[string]interface{} +// metricValues contains a copy of the values of all metrics. +type metricValues struct { + // m is a map with key as metric name and value can be either uint64 or + // map[string]uint64 to support metrics with one field. + m map[string]interface{} + + // Information on when initialization stages were reached. Does not include + // the currently-ongoing stage, if any. + stages []stageTiming +} var ( // emitMu protects metricsAtLastEmit and ensures that all emitted @@ -354,8 +419,8 @@ func EmitMetricUpdate() { m := pb.MetricUpdate{} // On the first call metricsAtLastEmit will be empty. Include all // metrics then. - for k, v := range snapshot { - prev, ok := metricsAtLastEmit[k] + for k, v := range snapshot.m { + prev, ok := metricsAtLastEmit.m[k] switch t := v.(type) { case uint64: // Metric exists and value did not change. 
@@ -386,8 +451,23 @@ func EmitMetricUpdate() { } } + for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ { + newStage := snapshot.stages[s] + m.StageTiming = append(m.StageTiming, &pb.StageTiming{ + Stage: string(newStage.stage), + Started: &timestamppb.Timestamp{ + Seconds: newStage.started.Unix(), + Nanos: int32(newStage.started.Nanosecond()), + }, + Ended: &timestamppb.Timestamp{ + Seconds: newStage.ended.Unix(), + Nanos: int32(newStage.ended.Nanosecond()), + }, + }) + } + metricsAtLastEmit = snapshot - if len(m.Metrics) == 0 { + if len(m.Metrics) == 0 && len(m.StageTiming) == 0 { return } @@ -399,9 +479,52 @@ func EmitMetricUpdate() { for _, metric := range m.Metrics { log.Debugf("%s: %+v", metric.Name, metric.Value) } + for _, stage := range m.StageTiming { + duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond + log.Debugf("Stage %s took %v", stage.GetStage(), duration) + } } if err := eventchannel.Emit(&m); err != nil { log.Warningf("Unable to emit metrics: %s", err) } } + +// StartStage should be called when an initialization stage is started. +// It returns a function that must be called to indicate that the stage ended. +// Alternatively, future calls to StartStage will implicitly indicate that the +// previous stage ended. +// Stage information will be emitted in the next call to EmitMetricUpdate after +// a stage has ended. +// +// This function may (and is expected to) be called prior to final +// initialization of this metric library, as it has to capture early stages +// of Sentry initialization. 
+func StartStage(stage InitStage) func() { + now := time.Now() + allMetrics.mu.Lock() + defer allMetrics.mu.Unlock() + if allMetrics.currentStage.inProgress() { + endStage(now) + } + allMetrics.currentStage.stage = stage + allMetrics.currentStage.started = now + return func() { + now := time.Now() + allMetrics.mu.Lock() + defer allMetrics.mu.Unlock() + // The current stage may have been ended by another call to StartStage, so + // double-check prior to clearing the current stage. + if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage { + endStage(now) + } + } +} + +// endStage marks allMetrics.currentStage as ended, adding it to the list of +// finished stages. It assumes allMetrics.mu is locked. +func endStage(when time.Time) { + allMetrics.currentStage.ended = when + allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage) + allMetrics.currentStage = stageTiming{} +} diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto index 53c8b4b50..d466b6904 100644 --- a/pkg/metric/metric.proto +++ b/pkg/metric/metric.proto @@ -16,6 +16,8 @@ syntax = "proto3"; package gvisor; +import "google/protobuf/timestamp.proto"; + // MetricMetadata contains all of the metadata describing a single metric. message MetricMetadata { // name is the unique name of the metric, usually in a "directory" format @@ -63,6 +65,7 @@ message MetricMetadata { // future MetricUpdates. message MetricRegistration { repeated MetricMetadata metrics = 1; + repeated string stages = 2; } // MetricValue the value of a metric at a single point in time. @@ -79,9 +82,20 @@ message MetricValue { repeated string field_values = 4; } +// StageTiming represents a new stage that's been reached by the Sentry. +message StageTiming { + string stage = 1; + google.protobuf.Timestamp started = 2; + google.protobuf.Timestamp ended = 3; +} + // MetricUpdate contains new values for multiple distinct metrics. // // Metrics whose values have not changed are not included. 
message MetricUpdate { repeated MetricValue metrics = 1; + // Timing information of initialization stages reached since last update. + // The first MetricUpdate will include multiple entries, since metric + // initialization happens relatively late in the Sentry startup process. + repeated StageTiming stage_timing = 2; } diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index 1b4a9e73a..0654bdf07 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -16,6 +16,7 @@ package metric import ( "testing" + "time" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/eventchannel" @@ -352,3 +353,147 @@ func TestEmitMetricUpdateWithFields(t *testing.T) { t.Errorf("Field value weird2 not found: %+v", emitter) } } + +func TestMetricUpdateStageTiming(t *testing.T) { + defer reset() + + expectedTimings := map[InitStage]struct{ min, max time.Duration }{} + measureStage := func(stage InitStage, body func()) { + stageStarted := time.Now() + endStage := StartStage(stage) + bodyStarted := time.Now() + body() + bodyEnded := time.Now() + endStage() + stageEnded := time.Now() + + expectedTimings[stage] = struct{ min, max time.Duration }{ + min: bodyEnded.Sub(bodyStarted), + max: stageEnded.Sub(stageStarted), + } + } + checkStage := func(got *pb.StageTiming, want InitStage) { + if InitStage(got.GetStage()) != want { + t.Errorf("%v: got stage %q expected %q", got, got.GetStage(), want) + } + timingBounds, found := expectedTimings[want] + if !found { + t.Fatalf("invalid init stage name %q", want) + } + started := got.Started.AsTime() + ended := got.Ended.AsTime() + duration := ended.Sub(started) + if duration < timingBounds.min { + t.Errorf("stage %v: lasted %v, expected at least %v", want, duration, timingBounds.min) + } else if duration > timingBounds.max { + t.Errorf("stage %v: lasted %v, expected no more than %v", want, duration, timingBounds.max) + } + } + + // Test that it's legit to go through stages before metric registration. 
+ measureStage("before_first_update_1", func() { + time.Sleep(100 * time.Millisecond) + }) + measureStage("before_first_update_2", func() { + time.Sleep(100 * time.Millisecond) + }) + + fooMetric, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription) + if err != nil { + t.Fatalf("Cannot register /foo: %v", err) + } + emitter.Reset() + Initialize() + EmitMetricUpdate() + + // We should have gotten the metric registration and the first MetricUpdate. + if len(emitter) != 2 { + t.Fatalf("emitter has %d messages (%v), expected %d", len(emitter), emitter, 2) + } + + if registration, ok := emitter[0].(*pb.MetricRegistration); !ok { + t.Errorf("first message is not MetricRegistration: %T / %v", emitter[0], emitter[0]) + } else if len(registration.Stages) != len(allStages) { + t.Errorf("MetricRegistration has %d stages (%v), expected %d (%v)", len(registration.Stages), registration.Stages, len(allStages), allStages) + } else { + for i := 0; i < len(allStages); i++ { + if InitStage(registration.Stages[i]) != allStages[i] { + t.Errorf("MetricRegistration.Stages[%d]: got %q want %q", i, registration.Stages[i], allStages[i]) + } + } + } + + if firstUpdate, ok := emitter[1].(*pb.MetricUpdate); !ok { + t.Errorf("second message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(firstUpdate.StageTiming) != 2 { + t.Errorf("MetricUpdate has %d stage timings (%v), expected %d", len(firstUpdate.StageTiming), firstUpdate.StageTiming, 2) + } else { + checkStage(firstUpdate.StageTiming[0], "before_first_update_1") + checkStage(firstUpdate.StageTiming[1], "before_first_update_2") + } + + // Ensure re-emitting doesn't cause another event to be sent. + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 0 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 0) + } + + // Generate monitoring data, we should get an event with no stages. 
+ fooMetric.Increment() + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 1 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 1) + } else if update, ok := emitter[0].(*pb.MetricUpdate); !ok { + t.Errorf("message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(update.StageTiming) != 0 { + t.Errorf("unexpected stage timing information: %v", update.StageTiming) + } + + // Now generate new stages. + measureStage("foo_stage_1", func() { + time.Sleep(100 * time.Millisecond) + }) + measureStage("foo_stage_2", func() { + time.Sleep(100 * time.Millisecond) + }) + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 1 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 1) + } else if update, ok := emitter[0].(*pb.MetricUpdate); !ok { + t.Errorf("message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(update.Metrics) != 0 { + t.Errorf("MetricUpdate has %d metric value changes (%v), expected %d", len(update.Metrics), update.Metrics, 0) + } else if len(update.StageTiming) != 2 { + t.Errorf("MetricUpdate has %d stages (%v), expected %d", len(update.StageTiming), update.StageTiming, 2) + } else { + checkStage(update.StageTiming[0], "foo_stage_1") + checkStage(update.StageTiming[1], "foo_stage_2") + } + + // Now try generating data for both metrics and stages. 
+ fooMetric.Increment() + measureStage("last_stage_1", func() { + time.Sleep(100 * time.Millisecond) + }) + measureStage("last_stage_2", func() { + time.Sleep(100 * time.Millisecond) + }) + fooMetric.Increment() + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 1 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 1) + } else if update, ok := emitter[0].(*pb.MetricUpdate); !ok { + t.Errorf("message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(update.Metrics) != 1 { + t.Errorf("MetricUpdate has %d metric value changes (%v), expected %d", len(update.Metrics), update.Metrics, 1) + } else if len(update.StageTiming) != 2 { + t.Errorf("MetricUpdate has %d stages (%v), expected %d", len(update.StageTiming), update.StageTiming, 2) + } else { + checkStage(update.StageTiming[0], "last_stage_1") + checkStage(update.StageTiming[1], "last_stage_2") + } +} diff --git a/pkg/procid/procid.go b/pkg/procid/procid.go index 78b92422c..e0d42819d 100644 --- a/pkg/procid/procid.go +++ b/pkg/procid/procid.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + // Package procid provides a way to get the current system thread identifier. package procid diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s index c4307c523..74a8de42c 100644 --- a/pkg/procid/procid_amd64.s +++ b/pkg/procid/procid_amd64.s @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 -// +build go1.8 -// +build !go1.18 +//go:build amd64 && go1.8 && !go1.18 && go1.1 +// +build amd64,go1.8,!go1.18,go1.1 + +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. 
#include "textflag.h" diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s index c1c409f3c..48182c4a9 100644 --- a/pkg/procid/procid_arm64.s +++ b/pkg/procid/procid_arm64.s @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build arm64 -// +build go1.8 -// +build !go1.18 +//go:build arm64 && go1.8 && !go1.18 && go1.1 +// +build arm64,go1.8,!go1.18,go1.1 + +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. #include "textflag.h" diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go index a2714784d..be0e85fdb 100644 --- a/pkg/rand/rand.go +++ b/pkg/rand/rand.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !linux // +build !linux // Package rand implements a cryptographically secure pseudorandom number diff --git a/pkg/refsvfs2/refs.go b/pkg/refsvfs2/refs.go index ef8beb659..fe3e4a1ca 100644 --- a/pkg/refsvfs2/refs.go +++ b/pkg/refsvfs2/refs.go @@ -28,6 +28,11 @@ type RefCounter interface { // DecRef decrements the object's reference count. Users of refs_template.Refs // may specify a destructor to be called once the reference count reaches zero. DecRef(ctx context.Context) +} + +// TryRefCounter is like RefCounter but allows the ref increment to be tried. +type TryRefCounter interface { + RefCounter // TryIncRef attempts to increment the reference count, but may fail if all // references have already been dropped, in which case it returns false. If diff --git a/pkg/refsvfs2/refs_template.go b/pkg/refsvfs2/refs_template.go index 1102c8adc..55b0a60a1 100644 --- a/pkg/refsvfs2/refs_template.go +++ b/pkg/refsvfs2/refs_template.go @@ -101,7 +101,7 @@ func (r *Refs) IncRef() { } } -// TryIncRef implements refs.RefCounter.TryIncRef. 
+// TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish diff --git a/pkg/ring0/aarch64.go b/pkg/ring0/aarch64.go index 3bda594f9..96c884844 100644 --- a/pkg/ring0/aarch64.go +++ b/pkg/ring0/aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/defs_amd64.go b/pkg/ring0/defs_amd64.go index 76776c65c..24f6e4cde 100644 --- a/pkg/ring0/defs_amd64.go +++ b/pkg/ring0/defs_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 diff --git a/pkg/ring0/defs_arm64.go b/pkg/ring0/defs_arm64.go index 0125690d2..3e212516f 100644 --- a/pkg/ring0/defs_arm64.go +++ b/pkg/ring0/defs_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go index d87b1fd00..afd646b0b 100644 --- a/pkg/ring0/entry_amd64.go +++ b/pkg/ring0/entry_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -31,6 +32,13 @@ import ( // executed from kernel mode or not and the appropriate stub is called. func sysenter() +// addrOfSysenter returns the start address of sysenter. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func addrOfSysenter() uintptr + // swapgs swaps the current GS value. 
// // This must be called prior to sysret/iret. @@ -39,6 +47,9 @@ func swapgs() // jumpToKernel jumps to the kernel version of the current RIP. func jumpToKernel() +// jumpToUser jumps to the user version of the current RIP. +func jumpToUser() + // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. @@ -65,7 +76,12 @@ func exception() // This is used when processing kernel exceptions and syscalls. func resume() -// Start is the CPU entrypoint. +// start is the CPU entrypoint. +// +// See requirements below. +func start() + +// AddrOfStart returns the address of the CPU entrypoint. // // The following start conditions must be satisfied: // @@ -78,7 +94,11 @@ func resume() // * c.EFER() should be the current EFER value. // // The CPU state will be set to c.Registers(). -func Start() +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfStart() uintptr // Exception stubs. func divideByZero() @@ -104,28 +124,56 @@ func virtualizationException() func securityException() func syscallInt80() +// These return the start addresses of the functions above. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. 
+func addrOfDivideByZero() uintptr +func addrOfDebug() uintptr +func addrOfNMI() uintptr +func addrOfBreakpoint() uintptr +func addrOfOverflow() uintptr +func addrOfBoundRangeExceeded() uintptr +func addrOfInvalidOpcode() uintptr +func addrOfDeviceNotAvailable() uintptr +func addrOfDoubleFault() uintptr +func addrOfCoprocessorSegmentOverrun() uintptr +func addrOfInvalidTSS() uintptr +func addrOfSegmentNotPresent() uintptr +func addrOfStackSegmentFault() uintptr +func addrOfGeneralProtectionFault() uintptr +func addrOfPageFault() uintptr +func addrOfX87FloatingPointException() uintptr +func addrOfAlignmentCheck() uintptr +func addrOfMachineCheck() uintptr +func addrOfSimdFloatingPointException() uintptr +func addrOfVirtualizationException() uintptr +func addrOfSecurityException() uintptr +func addrOfSyscallInt80() uintptr + // Exception handler index. -var handlers = map[Vector]func(){ - DivideByZero: divideByZero, - Debug: debug, - NMI: nmi, - Breakpoint: breakpoint, - Overflow: overflow, - BoundRangeExceeded: boundRangeExceeded, - InvalidOpcode: invalidOpcode, - DeviceNotAvailable: deviceNotAvailable, - DoubleFault: doubleFault, - CoprocessorSegmentOverrun: coprocessorSegmentOverrun, - InvalidTSS: invalidTSS, - SegmentNotPresent: segmentNotPresent, - StackSegmentFault: stackSegmentFault, - GeneralProtectionFault: generalProtectionFault, - PageFault: pageFault, - X87FloatingPointException: x87FloatingPointException, - AlignmentCheck: alignmentCheck, - MachineCheck: machineCheck, - SIMDFloatingPointException: simdFloatingPointException, - VirtualizationException: virtualizationException, - SecurityException: securityException, - SyscallInt80: syscallInt80, +var handlers = map[Vector]uintptr{ + DivideByZero: addrOfDivideByZero(), + Debug: addrOfDebug(), + NMI: addrOfNMI(), + Breakpoint: addrOfBreakpoint(), + Overflow: addrOfOverflow(), + BoundRangeExceeded: addrOfBoundRangeExceeded(), + InvalidOpcode: addrOfInvalidOpcode(), + DeviceNotAvailable: 
addrOfDeviceNotAvailable(), + DoubleFault: addrOfDoubleFault(), + CoprocessorSegmentOverrun: addrOfCoprocessorSegmentOverrun(), + InvalidTSS: addrOfInvalidTSS(), + SegmentNotPresent: addrOfSegmentNotPresent(), + StackSegmentFault: addrOfStackSegmentFault(), + GeneralProtectionFault: addrOfGeneralProtectionFault(), + PageFault: addrOfPageFault(), + X87FloatingPointException: addrOfX87FloatingPointException(), + AlignmentCheck: addrOfAlignmentCheck(), + MachineCheck: addrOfMachineCheck(), + SIMDFloatingPointException: addrOfSimdFloatingPointException(), + VirtualizationException: addrOfVirtualizationException(), + SecurityException: addrOfSecurityException(), + SyscallInt80: addrOfSyscallInt80(), } diff --git a/pkg/ring0/entry_amd64.s b/pkg/ring0/entry_amd64.s index f59747df3..520bd9f57 100644 --- a/pkg/ring0/entry_amd64.s +++ b/pkg/ring0/entry_amd64.s @@ -88,11 +88,33 @@ #define LOAD_KERNEL_STACK(entry) \ MOVQ ENTRY_STACK_TOP(entry), SP; +// ADDR_OF_FUNC defines a function named 'name' that returns the address of +// 'symbol'. +#define ADDR_OF_FUNC(name, symbol) \ +TEXT name,$0-8; \ + MOVQ $symbol, AX; \ + MOVQ AX, ret+0(FP); \ + RET + // See kernel.go. TEXT ·Halt(SB),NOSPLIT,$0 HLT RET +// See kernel_amd64.go. +TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8 + HLT + + // Restore FS_BASE. + MOVQ regs+0(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ AX + + RET + // See entry_amd64.go. TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() @@ -107,8 +129,29 @@ TEXT ·jumpToKernel(SB),NOSPLIT,$0 MOVQ AX, 0(SP) RET +// jumpToUser changes execution to the user address space. +// +// This works by changing the return value to the user version. +TEXT ·jumpToUser(SB),NOSPLIT,$0 + // N.B. we can't access KernelStartAddress from the upper half (data + // pages not available), so just naively clear all the upper bits. + // We are assuming a 47-bit virtual address space. 
+ MOVQ $0x00007fffffffffff, AX + MOVQ 0(SP), BX + ANDQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT,$0-24 + // Set application FS. We can't do this in Go because Go code needs FS. + MOVQ regs+8(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX + CALL ·writeFS(SB) + POPQ AX + CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return @@ -142,6 +185,14 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT,$0-24 + // Set application FS. We can't do this in Go because Go code needs FS. + MOVQ regs+8(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ AX + CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return @@ -184,13 +235,29 @@ TEXT ·resume(SB),NOSPLIT,$0 IRET() // See entry_amd64.go. -TEXT ·Start(SB),NOSPLIT,$0 +TEXT ·start(SB),NOSPLIT,$0 + // N.B. This is the vCPU entrypoint. It is not called from Go code and + // thus pushes and pops values on the stack until calling into Go + // (startGo) because we aren't usually a typical Go assembly frame. + PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. - PUSHQ AX // First argument (CPU). - CALL ·start(SB) // Call Go hook. + + PUSHQ AX // Save CPU. + + // Set up environment required by Go before calling startGo: Go needs + // FS_BASE and floating point initialized. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + + // First argument (CPU) already at bottom of stack. + CALL ·startGo(SB) // Call Go hook. JMP ·resume(SB) // Restore to registers. +ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB)); + // See entry_amd64.go. 
TEXT ·sysenter(SB),NOSPLIT,$0 // _RFLAGS_IOPL0 is always set in the user mode and it is never set in @@ -218,6 +285,18 @@ user: MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. + CALL ·jumpToUser(SB) + + // Restore kernel FS_BASE. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + // Return to the kernel, where the frame is: // // vector (sp+32) @@ -252,6 +331,8 @@ kernel: POPQ AX // Pop vCPU. JMP ·resume(SB) +ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB)); + // exception is a generic exception handler. // // There are two cases handled: @@ -298,6 +379,16 @@ user: MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) + CALL ·jumpToUser(SB) + + // Restore kernel FS_BASE. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + // Copy out and return. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. @@ -336,36 +427,38 @@ kernel: POPQ AX // Pop vCPU. 
JMP ·resume(SB) -#define EXCEPTION_WITH_ERROR(value, symbol) \ +#define EXCEPTION_WITH_ERROR(value, symbol, addr) \ +ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT,$0; \ PUSHQ $value; \ JMP ·exception(SB); -#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ +#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \ +ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT,$0; \ PUSHQ $0x0; \ PUSHQ $value; \ JMP ·exception(SB); -EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) -EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) -EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) -EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) -EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) -EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) -EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) -EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) -EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) -EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) -EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) -EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) -EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) -EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) -EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) -EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) -EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) -EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) -EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) -EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) -EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) -EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) +EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB)) +EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB)) +EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), 
·addrOfNMI(SB)) +EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB)) +EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB)) +EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB)) +EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB)) +EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB)) +EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB)) +EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB)) +EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB)) +EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB)) +EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB)) +EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB)) +EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB)) +EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB)) +EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB)) +EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB)) +EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB)) +EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB)) +EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB)) +EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB)) diff --git a/pkg/ring0/entry_arm64.go b/pkg/ring0/entry_arm64.go index 62a93f3d6..299036478 100644 --- a/pkg/ring0/entry_arm64.go +++ b/pkg/ring0/entry_arm64.go @@ -12,6 +12,7 @@ // See the 
License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index f63af8b76..4a4c0ae26 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -19,12 +20,20 @@ package ring0 import ( "encoding/binary" "reflect" + "sync" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/sentry/arch" ) +// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the +// value in regs. +func HaltAndWriteFSBase(regs *arch.Registers) + // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { + initSentryXCR0() + entrySize := reflect.TypeOf(kernelEntry{}).Size() var ( entries []kernelEntry @@ -168,7 +177,7 @@ func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { // //go:nosplit func (c *CPU) CR0() uint64 { - return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET + return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE } // CR4 returns the CPU's CR4 value. @@ -240,7 +249,6 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Perform the switch. swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy in floating point. if switchOpts.FullRestore { @@ -249,38 +257,58 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { vector = sysret(c, regs, uintptr(userCR3)) } SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. RestoreKernelFPState() // escapes: no. 
Restore kernel MXCSR. return } -var sentryXCR0 = xgetbv(0) +var ( + sentryXCR0 uintptr + sentryXCR0Once sync.Once +) -// start is the CPU entrypoint. +// initSentryXCR0 saves a value of XCR0 in the host mode. It is used to +// initialize XCR0 of guest vCPU-s. +func initSentryXCR0() { + sentryXCR0Once.Do(func() { sentryXCR0 = xgetbv(0) }) +} + +// startGo is the CPU entrypoint. // -// This is called from the Start asm stub (see entry_amd64.go); on return the +// This is called from the start asm stub (see entry_amd64.go); on return the // registers in c.registers will be restored (not segments). // +// Note that any code written in Go should adhere to Go expected environment: +// * Initialized floating point state (required for optimizations using +// floating point instructions). +// * Go TLS in FS_BASE (this is required by splittable functions, calls into +// the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access +// TLS)). +// //go:nosplit -func start(c *CPU) { - // Save per-cpu & FS segment. +func startGo(c *CPU) { + // Save per-cpu. WriteGS(kernelAddr(c.kernelEntry)) - WriteFS(uintptr(c.registers.Fs_base)) + // + // TODO(mpratt): Note that per the note above, this should be done + // before entering Go code. However for simplicity we leave it here for + // now, since the small critical sections with undefined FPU state + // should only contain very limited use of floating point instructions + // (notably, use of XMM15 as a zero register). fninit() // Need to sync XCR0 with the host, because xsave and xrstor can be // called from different contexts. xsetbv(0, sentryXCR0) // Set the syscall target. - wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter())) wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) // NOTE: This depends on having the 64-bit segments immediately // following the 32-bit user segments. This is simply the way the // sysret instruction is designed to work (it assumes they follow). 
wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) - wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter())) } // SetCPUIDFaulting sets CPUID faulting per the boolean value. diff --git a/pkg/ring0/kernel_arm64.go b/pkg/ring0/kernel_arm64.go index 21db910a2..79f85ff50 100644 --- a/pkg/ring0/kernel_arm64.go +++ b/pkg/ring0/kernel_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/kernel_unsafe.go b/pkg/ring0/kernel_unsafe.go index 16955ad91..04c60d0a7 100644 --- a/pkg/ring0/kernel_unsafe.go +++ b/pkg/ring0/kernel_unsafe.go @@ -35,7 +35,6 @@ func kernelAddr(obj interface{}) uintptr { // kernelFunc returns the address of the given function. // //go:nosplit -func kernelFunc(fn func()) uintptr { - fnptr := (**uintptr)(unsafe.Pointer(&fn)) - return KernelStartAddress | **fnptr +func kernelFunc(fn uintptr) uintptr { + return KernelStartAddress | fn } diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go index 3e6bb9663..05c394ff5 100644 --- a/pkg/ring0/lib_amd64.go +++ b/pkg/ring0/lib_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -43,8 +44,8 @@ func xsave(*byte) // xsaveopt uses xsaveopt to save floating point state. func xsaveopt(*byte) -// WriteFS sets the GS address (set by init). -var WriteFS func(addr uintptr) +// writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr). +func writeFS(addr uintptr) // wrfsbase writes to the GS base address. 
func wrfsbase(addr uintptr) @@ -116,10 +117,8 @@ func Init(featureSet *cpuid.FeatureSet) { LoadFloatingPoint = fxrstor } if hasFSGSBASE { - WriteFS = wrfsbase WriteGS = wrgsbase } else { - WriteFS = wrfsmsr WriteGS = wrgsmsr } } diff --git a/pkg/ring0/lib_amd64.s b/pkg/ring0/lib_amd64.s index 70a43e79e..8ed98fc84 100644 --- a/pkg/ring0/lib_amd64.s +++ b/pkg/ring0/lib_amd64.s @@ -80,6 +80,29 @@ TEXT ·xsaveopt(SB),NOSPLIT,$0-8 BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; RET +// writeFS writes to the FS base. +// +// This is written in assembly because it must be safe to call before the Go +// environment is set up. See comment on start(). +// +// Preconditions: must be running in the lower address space, as it accesses +// global data. +TEXT ·writeFS(SB),NOSPLIT,$8-8 + MOVQ addr+0(FP), AX + + CMPB ·hasFSGSBASE(SB), $1 + JNE msr + + PUSHQ AX + CALL ·wrfsbase(SB) + POPQ AX + RET +msr: + PUSHQ AX + CALL ·wrfsmsr(SB) + POPQ AX + RET + // wrfsbase writes to the FS base. // // The code corresponds to: diff --git a/pkg/ring0/lib_arm64.go b/pkg/ring0/lib_arm64.go index 5eabd4296..a72a6926d 100644 --- a/pkg/ring0/lib_arm64.go +++ b/pkg/ring0/lib_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/offsets_amd64.go b/pkg/ring0/offsets_amd64.go index ca4075b09..75f6218b3 100644 --- a/pkg/ring0/offsets_amd64.go +++ b/pkg/ring0/offsets_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 // +build amd64 package ring0 @@ -95,6 +96,6 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS_BASE 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS_BASE 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) } diff --git a/pkg/ring0/offsets_arm64.go b/pkg/ring0/offsets_arm64.go index 03adaa6b0..60b2c4074 100644 --- a/pkg/ring0/offsets_arm64.go +++ b/pkg/ring0/offsets_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/pagetables/pagetables_aarch64.go b/pkg/ring0/pagetables/pagetables_aarch64.go index 86eb00a4f..aa2a5c984 100644 --- a/pkg/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/ring0/pagetables/pagetables_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/pagetables/pagetables_amd64_test.go b/pkg/ring0/pagetables/pagetables_amd64_test.go index a13c616ae..c27b3b10a 100644 --- a/pkg/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/ring0/pagetables/pagetables_amd64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package pagetables diff --git a/pkg/ring0/pagetables/pagetables_arm64_test.go b/pkg/ring0/pagetables/pagetables_arm64_test.go index 2514b9ac5..1c919ec7d 100644 --- a/pkg/ring0/pagetables/pagetables_arm64_test.go +++ b/pkg/ring0/pagetables/pagetables_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/pagetables/pagetables_x86.go b/pkg/ring0/pagetables/pagetables_x86.go index e43698173..dc98d8452 100644 --- a/pkg/ring0/pagetables/pagetables_x86.go +++ b/pkg/ring0/pagetables/pagetables_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package pagetables diff --git a/pkg/ring0/pagetables/pcids_aarch64.go b/pkg/ring0/pagetables/pcids_aarch64.go index fbfd41d83..ad492d039 100644 --- a/pkg/ring0/pagetables/pcids_aarch64.go +++ b/pkg/ring0/pagetables/pcids_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/pagetables/pcids_aarch64.s b/pkg/ring0/pagetables/pcids_aarch64.s index e9d62d768..cfcedba71 100644 --- a/pkg/ring0/pagetables/pcids_aarch64.s +++ b/pkg/ring0/pagetables/pcids_aarch64.s @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 #include "funcdata.h" diff --git a/pkg/ring0/pagetables/pcids_x86.go b/pkg/ring0/pagetables/pcids_x86.go index 91fc5e8dd..2a107ea70 100644 --- a/pkg/ring0/pagetables/pcids_x86.go +++ b/pkg/ring0/pagetables/pcids_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build i386 || amd64 // +build i386 amd64 package pagetables diff --git a/pkg/ring0/pagetables/walker_amd64.go b/pkg/ring0/pagetables/walker_amd64.go index eb4fbcc31..ca5e2f85f 100644 --- a/pkg/ring0/pagetables/walker_amd64.go +++ b/pkg/ring0/pagetables/walker_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package pagetables diff --git a/pkg/ring0/pagetables/walker_arm64.go b/pkg/ring0/pagetables/walker_arm64.go index 5ed881c7a..e32dbda2d 100644 --- a/pkg/ring0/pagetables/walker_arm64.go +++ b/pkg/ring0/pagetables/walker_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/x86.go b/pkg/ring0/x86.go index 34fbc1c35..9a98703da 100644 --- a/pkg/ring0/x86.go +++ b/pkg/ring0/x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build 386 || amd64 // +build 386 amd64 package ring0 @@ -24,6 +25,7 @@ import ( const ( _CR0_PE = 1 << 0 _CR0_ET = 1 << 4 + _CR0_NE = 1 << 5 _CR0_AM = 1 << 18 _CR0_PG = 1 << 31 diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 8ffa1db37..062250d69 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -74,8 +74,8 @@ func Install(rules SyscallRules) error { } // Perform the actual installation. - if errno := SetFilter(instrs); errno != 0 { - return fmt.Errorf("failed to set filter: %v", errno) + if err := SetFilter(instrs); err != nil { + return fmt.Errorf("failed to set filter: %v", err) } log.Infof("Seccomp filters installed.") diff --git a/pkg/seccomp/seccomp_amd64.go b/pkg/seccomp/seccomp_amd64.go index 00bf332c1..9cd003bc5 100644 --- a/pkg/seccomp/seccomp_amd64.go +++ b/pkg/seccomp/seccomp_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package seccomp diff --git a/pkg/seccomp/seccomp_arm64.go b/pkg/seccomp/seccomp_arm64.go index b62133f21..adcf73e72 100644 --- a/pkg/seccomp/seccomp_arm64.go +++ b/pkg/seccomp/seccomp_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package seccomp diff --git a/pkg/seccomp/seccomp_test_victim_amd64.go b/pkg/seccomp/seccomp_test_victim_amd64.go index efb8604ec..5c1ecc301 100644 --- a/pkg/seccomp/seccomp_test_victim_amd64.go +++ b/pkg/seccomp/seccomp_test_victim_amd64.go @@ -15,6 +15,7 @@ // Test binary used to test that seccomp filters are properly constructed and // indeed kill the process on violation. 
+//go:build amd64 // +build amd64 package main diff --git a/pkg/seccomp/seccomp_test_victim_arm64.go b/pkg/seccomp/seccomp_test_victim_arm64.go index 97cb5f5fe..9647e2758 100644 --- a/pkg/seccomp/seccomp_test_victim_arm64.go +++ b/pkg/seccomp/seccomp_test_victim_arm64.go @@ -15,6 +15,7 @@ // Test binary used to test that seccomp filters are properly constructed and // indeed kill the process on violation. +//go:build arm64 // +build arm64 package main diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index 7202591df..6701b5542 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -15,6 +15,8 @@ package seccomp import ( + "fmt" + "runtime" "unsafe" "golang.org/x/sys/unix" @@ -22,12 +24,56 @@ import ( ) // SetFilter installs the given BPF program. +func SetFilter(instrs []linux.BPFInstruction) error { + // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See + // seccomp(2) for details. + // + // PR_SET_NO_NEW_PRIVS is specific to the calling thread, not the whole + // thread group, so between PR_SET_NO_NEW_PRIVS and seccomp() below we must + // remain on the same thread. no_new_privs will be propagated to other + // threads in the thread group by seccomp(SECCOMP_FILTER_FLAG_TSYNC), in + // kernel/seccomp.c:seccomp_sync_threads(). + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if _, _, errno := unix.RawSyscall6(unix.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 { + return errno + } + + sockProg := linux.SockFprog{ + Len: uint16(len(instrs)), + Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), + } + tid, errno := seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) + if errno != 0 { + return errno + } + // "On error, if SECCOMP_FILTER_FLAG_TSYNC was used, the return value is + // the ID of the thread that caused the synchronization failure. 
(This ID + // is a kernel thread ID of the type returned by clone(2) and gettid(2).)" + // - seccomp(2) + if tid != 0 { + return fmt.Errorf("couldn't synchronize filter to TID %d", tid) + } + return nil +} + +// SetFilterInChild is equivalent to SetFilter, but: +// +// - It is safe to call after runtime.syscall_runtime_AfterForkInChild. +// +// - It requires that the calling goroutine cannot be moved to another thread, +// which either requires that runtime.LockOSThread() is in effect or that the +// caller is in fact in a fork()ed child process. // -// This is safe to call from an afterFork context. +// - Since fork()ed child processes cannot perform heap allocation, it returns +// a unix.Errno rather than an error. // +// - The race instrumentation has to be disabled for all functions that are +// called in a forked child. +// +//go:norace //go:nosplit -func SetFilter(instrs []linux.BPFInstruction) unix.Errno { - // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. +func SetFilterInChild(instrs []linux.BPFInstruction) unix.Errno { if _, _, errno := unix.RawSyscall6(unix.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 { return errno } @@ -36,12 +82,22 @@ func SetFilter(instrs []linux.BPFInstruction) unix.Errno { Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } - return seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) + tid, errno := seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) + if errno != 0 { + return errno + } + if tid != 0 { + // Return an errno that seccomp(2) doesn't to uniquely identify this + // case. Since this case occurs if another thread has a conflicting + // filter set, "name not unique on network" is at least suggestive? 
+ return unix.ENOTUNIQ + } + return 0 } func isKillProcessAvailable() (bool, error) { action := uint32(linux.SECCOMP_RET_KILL_PROCESS) - if errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { + if _, errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { // EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet. // EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported. if errno == unix.EINVAL || errno == unix.EOPNOTSUPP { @@ -55,9 +111,7 @@ func isKillProcessAvailable() (bool, error) { // seccomp calls seccomp(2). This is safe to call from an afterFork context. // //go:nosplit -func seccomp(op, flags uint32, ptr unsafe.Pointer) unix.Errno { - if _, _, errno := unix.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)); errno != 0 { - return errno - } - return 0 +func seccomp(op, flags uint32, ptr unsafe.Pointer) (uintptr, unix.Errno) { + n, _, errno := unix.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)) + return n, errno } diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 61dacd2fb..e0dbc436d 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -28,13 +28,13 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/cpuid", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", "//pkg/sentry/arch/fpu", "//pkg/sentry/limits", - "//pkg/syserror", "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 08789f517..9a827e84f 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package arch @@ -22,10 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/syserror" ) // Registers represents the CPU registers for this architecture. @@ -233,11 +234,11 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } @@ -246,11 +247,11 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index d6b4d2357..e7cb24102 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 348f238fd..0d27a1f22 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index b2b94c304..6da13f26e 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package arch diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index e8e52d3a8..96e9a6949 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package arch @@ -23,10 +24,10 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/syserror" ) // Registers represents the CPU registers for this architecture. 
@@ -353,7 +354,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) case _NT_PRFPREG: @@ -361,7 +362,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, case _NT_X86_XSTATE: return s.fpState.PtraceGetXstateRegs(dst, maxlen, s.FeatureSet) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } @@ -370,7 +371,7 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) case _NT_PRFPREG: @@ -378,7 +379,7 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, case _NT_X86_XSTATE: return s.fpState.PtraceSetXstateRegs(src, maxlen, s.FeatureSet) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go index 5d7b99bd9..bb5ff7f7f 100644 --- a/pkg/sentry/arch/arch_x86_impl.go +++ b/pkg/sentry/arch/arch_x86_impl.go @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build (amd64 || 386) && go1.1 // +build amd64 386 +// +build go1.1 package arch diff --git a/pkg/sentry/arch/fpu/BUILD b/pkg/sentry/arch/fpu/BUILD index 4e4f20639..6cdd21b1b 100644 --- a/pkg/sentry/arch/fpu/BUILD +++ b/pkg/sentry/arch/fpu/BUILD @@ -13,9 +13,9 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/cpuid", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sync", - "//pkg/syserror", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/arch/fpu/fpu_amd64.go b/pkg/sentry/arch/fpu/fpu_amd64.go index f0ba26736..e422f67a1 100644 --- a/pkg/sentry/arch/fpu/fpu_amd64.go +++ b/pkg/sentry/arch/fpu/fpu_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || i386 // +build amd64 i386 package fpu @@ -21,9 +22,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // initX86FPState (defined in asm files) sets up initial state. @@ -70,7 +71,7 @@ const ptraceFPRegsSize = 512 // PtraceGetFPRegs implements Context.PtraceGetFPRegs. func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return dst.Write((*s)[:ptraceFPRegsSize]) @@ -79,7 +80,7 @@ func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { // PtraceSetFPRegs implements Context.PtraceSetFPRegs. 
func (s *State) PtraceSetFPRegs(src io.Reader, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } var f [ptraceFPRegsSize]byte diff --git a/pkg/sentry/arch/fpu/fpu_arm64.go b/pkg/sentry/arch/fpu/fpu_arm64.go index 46634661f..49e641722 100644 --- a/pkg/sentry/arch/fpu/fpu_arm64.go +++ b/pkg/sentry/arch/fpu/fpu_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package fpu diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 58e28dbba..dbd4336f9 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 80df90076..ee22ec512 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 3859f41ee..c021ba072 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index 95dfd1e90..7146c9e44 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go index 8a500a515..7613dfcbc 100644 --- a/pkg/sentry/control/logging.go +++ b/pkg/sentry/control/logging.go @@ -50,20 +50,20 @@ type LoggingArgs struct { // enable strace at all. If this flag is false then a completely // pristine copy of the syscall table will be swapped in. This // approach is used to remain consistent with an empty strace - // whitelist meaning trace all system calls. + // allowlist meaning trace all system calls. EnableStrace bool - // Strace is the whitelist of syscalls to trace to log. If this - // and StraceEventWhitelist are empty trace all system calls. - StraceWhitelist []string + // Strace is the allowlist of syscalls to trace to log. If this + // and StraceEventAllowlist are empty trace all system calls. + StraceAllowlist []string // SetEventStrace is a flag used to indicate that event strace // related arguments were passed in. SetEventStrace bool - // StraceEventWhitelist is the whitelist of syscalls to trace + // StraceEventAllowlist is the allowlist of syscalls to trace // to event log. - StraceEventWhitelist []string + StraceEventAllowlist []string } // Logging provides functions related to logging. @@ -107,13 +107,13 @@ func (l *Logging) Change(args *LoggingArgs, code *int) error { func (l *Logging) configureStrace(args *LoggingArgs) error { if args.EnableStrace { - // Install the whitelist specified. - if len(args.StraceWhitelist) > 0 { - if err := strace.Enable(args.StraceWhitelist, strace.SinkTypeLog); err != nil { + // Install the allowlist specified. + if len(args.StraceAllowlist) > 0 { + if err := strace.Enable(args.StraceAllowlist, strace.SinkTypeLog); err != nil { return err } } else { - // For convenience, if strace is enabled but whitelist + // For convenience, if strace is enabled but allowlist // is empty, enable everything to log. 
strace.EnableAll(strace.SinkTypeLog) } @@ -125,8 +125,8 @@ func (l *Logging) configureStrace(args *LoggingArgs) error { } func (l *Logging) configureEventStrace(args *LoggingArgs) error { - if len(args.StraceEventWhitelist) > 0 { - if err := strace.Enable(args.StraceEventWhitelist, strace.SinkTypeEvent); err != nil { + if len(args.StraceEventAllowlist) > 0 { + if err := strace.Enable(args.StraceEventAllowlist, strace.SinkTypeEvent); err != nil { return err } } else { diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 221e98a01..6352ea71a 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -126,7 +126,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // Wait for completion. newTG.WaitExited() - *waitStatus = newTG.ExitStatus().Status() + *waitStatus = uint32(newTG.ExitStatus()) return nil } @@ -223,7 +223,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI _ = fd.Close() } }() - ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fds) if err != nil { return nil, 0, nil, nil, err } diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index 62eaca965..4c83b8e8e 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -17,6 +17,7 @@ package control import ( "errors" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/state" @@ -67,7 +68,7 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error { log.Warningf("Save failed: exiting...") s.Kernel.SetSaveError(err) } - s.Kernel.Kill(kernel.ExitStatus{}) + s.Kernel.Kill(linux.WaitStatusExit(0)) }, } return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog) diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD index 8b38d574d..60c971030 
100644 --- a/pkg/sentry/devices/tundev/BUILD +++ b/pkg/sentry/devices/tundev/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/devtmpfs", @@ -16,7 +17,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/socket/netstack", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/tcpip/link/tun", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index a12eeb8e7..b4e2a6d91 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -18,6 +18,7 @@ package tundev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -77,11 +77,11 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg switch request { case linux.TUNSETIFF: if !t.HasCapability(linux.CAP_NET_ADMIN) { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } stack, ok := t.NetworkContext().(*netstack.Stack) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var req linux.IFReq @@ -104,7 +104,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD index 6b4f8b0ed..563e96e0d 100644 --- a/pkg/sentry/fdimport/BUILD +++ b/pkg/sentry/fdimport/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/sentry/fs/host", "//pkg/sentry/fsimpl/host", 
"//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", ], ) diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index badd5b073..f2b9630eb 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -31,9 +32,9 @@ import ( // sets up TTY for the first 3 FDs in the slice representing stdin, stdout, // stderr. Used FDs are either closed or released. It's safe for the caller to // close any remaining files upon return. -func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, uid auth.KUID, gid auth.KGID, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if kernel.VFS2Enabled { - ttyFile, err := importVFS2(ctx, fdTable, console, fds) + ttyFile, err := importVFS2(ctx, fdTable, console, uid, gid, fds) return nil, ttyFile, err } ttyFile, err := importFS(ctx, fdTable, console, fds) @@ -89,7 +90,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] return ttyFile.FileOperations.(*host.TTYFileOperations), nil } -func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) { +func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, uid auth.KUID, gid auth.KGID, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, fmt.Errorf("cannot find kernel from context") @@ -103,7 +104,13 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi // Import the file as a host TTY 
file. if ttyFile == nil { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */) + appFile, err = hostvfs2.NewFD(ctx, k.HostMount(), hostFD.FD(), &hostvfs2.NewFDOptions{ + Savable: true, + IsTTY: true, + VirtualOwner: true, + UID: uid, + GID: gid, + }) if err != nil { return nil, err } @@ -121,7 +128,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi } } else { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */) + appFile, err = hostvfs2.NewFD(ctx, k.HostMount(), hostFD.FD(), &hostvfs2.NewFDOptions{ + Savable: true, + VirtualOwner: true, + UID: uid, + GID: gid, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 74adbfa55..58fe1e77c 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -117,7 +117,6 @@ go_test( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ae282d14e..a8591052c 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -162,7 +162,7 @@ func doCopyUp(ctx context.Context, d *Dirent) error { // then try to take copyMu for writing here, we'd deadlock. t := d.Inode.overlay.lower.StableAttr.Type if t != RegularFile && t != Directory && t != Symlink { - return syserror.EINVAL + return linuxerr.EINVAL } // Wait to get exclusive access to the upper Inode. 
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 23a3a9a2d..e28a8961b 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -18,6 +18,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/rand", "//pkg/safemem", diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index 77e8d222a..1abf11142 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -17,6 +17,7 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -98,11 +98,11 @@ func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io user switch request { case linux.TUNSETIFF: if !t.HasCapability(linux.CAP_NET_ADMIN) { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } stack, ok := t.NetworkContext().(*netstack.Stack) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var req linux.IFReq @@ -125,7 +125,7 @@ func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io user return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index e45749be6..ad8ff227e 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -488,11 +488,11 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // Slow path: load the InodeOperations into memory. 
Since this is a hot path and the lookup may be // expensive, if possible release the lock and re-acquire it. if walkMayUnlock { - d.mu.Unlock() + d.mu.Unlock() // +checklocksforce: results in an inconsistent block. } c, err := d.Inode.Lookup(ctx, name) if walkMayUnlock { - d.mu.Lock() + d.mu.Lock() // +checklocksforce: see above. } // No dice. if err != nil { @@ -594,21 +594,27 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { // lockDirectory should be called for any operation that changes this `d`s // children (creating or removing them). -func (d *Dirent) lockDirectory() func() { +// +checklocksacquire:d.dirMu +// +checklocksacquire:d.mu +func (d *Dirent) lockDirectory() { renameMu.RLock() d.dirMu.Lock() d.mu.Lock() - return func() { - d.mu.Unlock() - d.dirMu.Unlock() - renameMu.RUnlock() - } +} + +// unlockDirectory is the reverse of lockDirectory. +// +checklocksrelease:d.dirMu +// +checklocksrelease:d.mu +func (d *Dirent) unlockDirectory() { + d.mu.Unlock() + d.dirMu.Unlock() + renameMu.RUnlock() // +checklocksforce: see lockDirectory. } // Create creates a new regular file in this directory. func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -670,8 +676,8 @@ func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // genericCreate executes create if name does not exist. Removes a negative Dirent at name if // create succeeds. func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? 
if d.exists(ctx, root, name) { @@ -858,7 +864,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // Once we have written entries for "." and "..", future errors from // IterateDir will be hidden. if !IsDir(d.Inode.StableAttr) { - return 0, syserror.ENOTDIR + return 0, linuxerr.ENOTDIR } // This is a special case for lseek(fd, 0, SEEK_END). @@ -964,7 +970,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err // // See Linux equivalent in fs/namespace.c:do_add_mount. if IsSymlink(inode.StableAttr) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // Dirent that'll replace d. @@ -1021,8 +1027,8 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Try to walk to the node. child, err := d.walk(ctx, root, name, false /* may unlock */) @@ -1082,8 +1088,8 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Check for dots. if name == "." { @@ -1259,17 +1265,15 @@ func (d *Dirent) dropExtendedReference() { d.Inode.MountSource.fscache.Remove(d) } -// lockForRename takes locks on oldParent and newParent as required by Rename -// and returns a function that will unlock the locks taken. The returned -// function must be called even if a non-nil error is returned. -func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { +// lockForRename takes locks on oldParent and newParent as required by Rename. +// On return, unlockForRename must always be called, even with an error. 
+// +checklocksacquire:oldParent.mu +// +checklocksacquire:newParent.mu +func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { renameMu.Lock() if oldParent == newParent { oldParent.mu.Lock() - return func() { - oldParent.mu.Unlock() - renameMu.Unlock() - }, nil + return nil // +checklocksforce: only one lock exists. } // Renaming between directories is a bit subtle: @@ -1297,11 +1301,7 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // itself. err = unix.EINVAL } - return func() { - newParent.mu.Unlock() - oldParent.mu.Unlock() - renameMu.Unlock() - }, err + return err } child = p } @@ -1310,17 +1310,27 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // have no relationship; in either case we can do this: newParent.mu.Lock() oldParent.mu.Lock() - return func() { + return nil +} + +// unlockForRename is the opposite of lockForRename. +// +checklocksrelease:oldParent.mu +// +checklocksrelease:newParent.mu +func unlockForRename(oldParent, newParent *Dirent) { + if oldParent == newParent { oldParent.mu.Unlock() - newParent.mu.Unlock() - renameMu.Unlock() - }, nil + renameMu.Unlock() // +checklocksforce: only one lock exists. + return + } + newParent.mu.Unlock() + oldParent.mu.Unlock() + renameMu.Unlock() // +checklocksforce: not tracked. 
} func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { - return syserror.EPERM + return linuxerr.EPERM } if !uattr.Perms.Sticky { return nil @@ -1333,7 +1343,7 @@ func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { vuattr, err := victim.Inode.UnstableAttr(ctx) if err != nil { - return syserror.EPERM + return linuxerr.EPERM } if vuattr.Owner.UID == creds.EffectiveKUID { return nil @@ -1341,7 +1351,7 @@ func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) { return nil } - return syserror.EPERM + return linuxerr.EPERM } // MayDelete determines whether `name`, a child of `d`, can be deleted or @@ -1353,8 +1363,8 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error return err } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() victim, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { @@ -1375,7 +1385,7 @@ func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error { } if victim.IsRoot() { - return syserror.EBUSY + return linuxerr.EBUSY } return nil @@ -1392,8 +1402,8 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // Acquire global renameMu lock, and mu locks on oldParent/newParent. 
- unlock, err := lockForRename(oldParent, oldName, newParent, newName) - defer unlock() + err := lockForRename(oldParent, oldName, newParent, newName) + defer unlockForRename(oldParent, newParent) if err != nil { return err } diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 7fc53ed22..5c889c861 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -39,6 +39,8 @@ go_test( library = ":fdpipe", deps = [ "//pkg/context", + "//pkg/errors", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdnotifier", "//pkg/hostarch", diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 7b3ff191f..89d8be741 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -25,8 +25,8 @@ import ( "github.com/google/uuid" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -515,8 +515,8 @@ func assertReaderHungup(t *testing.T, desc string, reader io.Reader) bool { } func assertWriterHungup(t *testing.T, desc string, writer io.Writer) bool { - if _, err := writer.Write([]byte("hello")); unwrapError(err) != unix.EPIPE { - t.Errorf("%s: write to self after hangup got error %v, want %v", desc, err, unix.EPIPE) + if _, err := writer.Write([]byte("hello")); !linuxerr.Equals(linuxerr.EPIPE, unwrapError(err)) { + t.Errorf("%s: write to self after hangup got error %v, want %v", desc, err, linuxerr.EPIPE) return false } return true diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index ab0e9dac7..4c8905a7e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -21,14 +21,15 @@ import ( "testing" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" 
"gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) func singlePipeFD() (int, error) { @@ -209,17 +210,17 @@ func TestPipeRequest(t *testing.T) { { desc: "ReadDir on pipe returns ENOTDIR", context: &ReadDir{}, - err: unix.ENOTDIR, + err: linuxerr.ENOTDIR, }, { desc: "Fsync on pipe returns EINVAL", context: &Fsync{}, - err: unix.EINVAL, + err: linuxerr.EINVAL, }, { desc: "Seek on pipe returns ESPIPE", context: &Seek{}, - err: unix.ESPIPE, + err: linuxerr.ESPIPE, }, { desc: "Readv on pipe from empty buffer returns nil", @@ -248,7 +249,7 @@ func TestPipeRequest(t *testing.T) { desc: "Writev on pipe from non-empty buffer and closed partner returns EPIPE", context: &Writev{Src: usermem.BytesIOSequence([]byte("hello"))}, flags: fs.FileFlags{Write: true}, - err: unix.EPIPE, + err: linuxerr.EPIPE, }, { desc: "Writev on pipe from non-empty buffer and open partner succeeds", @@ -307,7 +308,11 @@ func TestPipeRequest(t *testing.T) { t.Errorf("%s: unknown request type %T", test.desc, test.context) } - if unwrapError(err) != test.err { + if linuxErr, ok := test.err.(*errors.Error); ok { + if !linuxerr.Equals(linuxErr, unwrapError(err)) { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + } + } else if test.err != unwrapError(err) { t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) } } diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 696613f3a..06c07c807 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -18,6 +18,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -357,7 +358,7 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, 
file *File, opt } if !o.isMappableLocked() { - return syserror.ENODEV + return linuxerr.ENODEV } // FIXME(jamieliu): This is a copy/paste of fsutil.GenericConfigureMMap, @@ -407,7 +408,7 @@ func (f *overlayFileOperations) Ioctl(ctx context.Context, overlayFile *File, io // copy up on any ioctl would be too drastic. In the future, it can have a // list of ioctls that are safe to send to lower and a list that triggers a // copy up. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } return f.upper.FileOperations.Ioctl(ctx, f.upper, io, args) } @@ -417,7 +418,7 @@ func (f *overlayFileOperations) FifoSize(ctx context.Context, overlayFile *File) err = f.onTop(ctx, overlayFile, func(file *File, ops FileOperations) error { sz, ok := ops.(FifoSizer) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } rv, err = sz.FifoSize(ctx, file) return err @@ -432,11 +433,11 @@ func (f *overlayFileOperations) SetFifoSize(size int64) (rv int64, err error) { if f.upper == nil { // Named pipes cannot be copied up and changes to the lower are prohibited. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } sz, ok := f.upper.FileOperations.(FifoSizer) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return sz.SetFifoSize(size) } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 44587bb37..a346c316b 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -80,23 +80,33 @@ func AsyncBarrier() { // Async executes a function asynchronously. // // Async must not be called recursively. +// +checklocksignore func Async(f func()) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f() - }() + go asyncWork(f) // S/R-SAFE: AsyncBarrier must be called. +} + +// +checklocksignore +func asyncWork(f func()) { + // Ensure RUnlock in case of panic. 
+ defer workMu.RUnlock() + f() } // AsyncWithContext is just like Async, except that it calls the asynchronous // function with the given context as argument. This function exists to avoid // needing to allocate an extra function on the heap in a hot path. +// +checklocksignore func AsyncWithContext(ctx context.Context, f func(context.Context)) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f(ctx) - }() + go asyncWorkWithContext(ctx, f) +} + +// +checklocksignore +func asyncWorkWithContext(ctx context.Context, f func(context.Context)) { + // Ensure RUnlock in case of panic. + defer workMu.RUnlock() + f(ctx) } // AsyncErrorBarrier waits for all outstanding asynchronous work to complete, or diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6469cc3a9..6bf2d51cb 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -76,6 +76,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/safemem", @@ -106,13 +107,13 @@ go_test( library = ":fsutil", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/safemem", "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index dc9efa5df..00b3bb29b 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -18,6 +18,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -45,7 +46,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, // Does the Inode represents a non-seekable type? 
if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) { - return current, syserror.ESPIPE + return current, linuxerr.ESPIPE } // Does the Inode represent a character device? @@ -63,12 +64,12 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, switch inode.StableAttr.Type { case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: if offset < 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return offset, nil case fs.Directory, fs.SpecialDirectory: if offset != 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } // SEEK_SET to 0 moves the directory "cursor" to the beginning. if dirCursor != nil { @@ -76,22 +77,22 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, } return 0, nil default: - return current, syserror.EINVAL + return current, linuxerr.EINVAL } case fs.SeekCurrent: switch inode.StableAttr.Type { case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: if current+offset < 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return current + offset, nil case fs.Directory, fs.SpecialDirectory: if offset != 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return current, nil default: - return current, syserror.EINVAL + return current, linuxerr.EINVAL } case fs.SeekEnd: switch inode.StableAttr.Type { @@ -103,14 +104,14 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, } sz := uattr.Size if sz+offset < 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return sz + offset, nil // FIXME(b/34778850): This is not universally correct. // Remove SpecialDirectory. case fs.SpecialDirectory: if offset != 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } // SEEK_END to 0 moves the directory "cursor" to the end. 
// @@ -121,12 +122,12 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, // futile (EOF will always be the result). return fs.FileMaxOffset, nil default: - return current, syserror.EINVAL + return current, linuxerr.EINVAL } } // Not a valid seek request. - return current, syserror.EINVAL + return current, linuxerr.EINVAL } // FileGenericSeek implements fs.FileOperations.Seek for files that use a @@ -152,7 +153,7 @@ type FileNoSeek struct{} // Seek implements fs.FileOperations.Seek. func (FileNoSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // FilePipeSeek implements fs.FileOperations.Seek and can be used for files @@ -161,7 +162,7 @@ type FilePipeSeek struct{} // Seek implements fs.FileOperations.Seek. func (FilePipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // FileNotDirReaddir implements fs.FileOperations.Readdir for non-directories. @@ -169,7 +170,7 @@ type FileNotDirReaddir struct{} // Readdir implements fs.FileOperations.FileNotDirReaddir. func (FileNotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { - return 0, syserror.ENOTDIR + return 0, linuxerr.ENOTDIR } // FileNoFsync implements fs.FileOperations.Fsync for files that don't support @@ -178,7 +179,7 @@ type FileNoFsync struct{} // Fsync implements fs.FileOperations.Fsync. func (FileNoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { - return syserror.EINVAL + return linuxerr.EINVAL } // FileNoopFsync implements fs.FileOperations.Fsync for files that don't need @@ -204,7 +205,7 @@ type FileNoMMap struct{} // ConfigureMMap implements fs.FileOperations.ConfigureMMap. 
func (FileNoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { - return syserror.ENODEV + return linuxerr.ENODEV } // GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most @@ -222,7 +223,7 @@ type FileNoIoctl struct{} // Ioctl implements fs.FileOperations.Ioctl. func (FileNoIoctl) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // FileNoSplice implements fs.FileOperations.ReadFrom and @@ -345,7 +346,7 @@ func NewFileStaticContentReader(b []byte) FileStaticContentReader { // Read implements fs.FileOperations.Read. func (scr *FileStaticContentReader) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset >= int64(len(scr.content)) { return 0, nil @@ -367,7 +368,7 @@ type FileNoRead struct{} // Read implements fs.FileOperations.Read. func (FileNoRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // FileNoWrite implements fs.FileOperations.Write to return EINVAL. @@ -375,7 +376,7 @@ type FileNoWrite struct{} // Write implements fs.FileOperations.Write. func (FileNoWrite) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // FileNoopRead implement fs.FileOperations.Read as a noop. 
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 85e7e35db..7c2de04c1 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -17,6 +17,7 @@ package fsutil import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -218,7 +219,7 @@ func (i *InodeSimpleExtendedAttributes) GetXattr(_ context.Context, _ *fs.Inode, value, ok := i.xattrs[name] i.mu.RUnlock() if !ok { - return "", syserror.ENOATTR + return "", linuxerr.ENOATTR } return value, nil } @@ -229,17 +230,17 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode, defer i.mu.Unlock() if i.xattrs == nil { if flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } i.xattrs = make(map[string]string) } _, ok := i.xattrs[name] if ok && flags&linux.XATTR_CREATE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } if !ok && flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } i.xattrs[name] = value @@ -265,7 +266,7 @@ func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Ino delete(i.xattrs, name) return nil } - return syserror.ENOATTR + return linuxerr.ENOATTR } // staticFile is a file with static contents. It is returned by @@ -331,52 +332,52 @@ type InodeNotDirectory struct{} // Lookup implements fs.InodeOperations.Lookup. func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // Create implements fs.InodeOperations.Create. func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // CreateLink implements fs.InodeOperations.CreateLink. 
func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // CreateHardLink implements fs.InodeOperations.CreateHardLink. func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // CreateDirectory implements fs.InodeOperations.CreateDirectory. func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Bind implements fs.InodeOperations.Bind. func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // CreateFifo implements fs.InodeOperations.CreateFifo. func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Remove implements fs.InodeOperations.Remove. func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Rename implements fs.FileOperations.Rename. func (InodeNotDirectory) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error { - return syserror.EINVAL + return linuxerr.EINVAL } // InodeNotSocket can be used by Inodes that are not sockets. @@ -392,7 +393,7 @@ type InodeNotTruncatable struct{} // Truncate implements fs.InodeOperations.Truncate. 
func (InodeNotTruncatable) Truncate(context.Context, *fs.Inode, int64) error { - return syserror.EINVAL + return linuxerr.EINVAL } // InodeIsDirTruncate implements fs.InodeOperations.Truncate for directories. @@ -416,7 +417,7 @@ type InodeNotRenameable struct{} // Rename implements fs.InodeOperations.Rename. func (InodeNotRenameable) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error { - return syserror.EINVAL + return linuxerr.EINVAL } // InodeNotOpenable can be used by Inodes that cannot be opened. @@ -448,12 +449,12 @@ type InodeNotSymlink struct{} // Readlink implements fs.InodeOperations.Readlink. func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) { - return "", syserror.ENOLINK + return "", linuxerr.ENOLINK } // Getlink implements fs.InodeOperations.Getlink. func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } // InodeNoExtendedAttributes can be used by Inodes that do not support @@ -462,22 +463,22 @@ type InodeNoExtendedAttributes struct{} // GetXattr implements fs.InodeOperations.GetXattr. func (InodeNoExtendedAttributes) GetXattr(context.Context, *fs.Inode, string, uint64) (string, error) { - return "", syserror.EOPNOTSUPP + return "", linuxerr.EOPNOTSUPP } // SetXattr implements fs.InodeOperations.SetXattr. func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, string, uint32) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // ListXattr implements fs.InodeOperations.ListXattr. func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) { - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } // RemoveXattr implements fs.InodeOperations.RemoveXattr. 
func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // InodeNoopRelease implements fs.InodeOperations.Release as a noop. @@ -512,7 +513,7 @@ type InodeNotAllocatable struct{} // Allocate implements fs.InodeOperations.Allocate. func (InodeNotAllocatable) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // InodeNoopAllocate implements fs.InodeOperations.Allocate as a noop. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index e107c3096..25e76d9f2 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -20,13 +20,13 @@ import ( "testing" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -244,7 +244,7 @@ func (*sliceBackingFile) FD() int { } func (f *sliceBackingFile) Allocate(ctx context.Context, offset int64, length int64) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } type noopMappingSpace struct{} diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index da3178527..9ff64a8b6 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -476,7 +477,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi switch d.Inode.StableAttr.Type { case fs.Socket: if 
i.session().overrides != nil { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } return i.getFileSocket(ctx, d, flags) case fs.Pipe: @@ -676,7 +677,7 @@ func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string // Getlink implementfs fs.InodeOperations.Getlink. func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { if !fs.IsSymlink(i.fileState.sattr) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } return nil, fs.ErrResolveViaReadlink } @@ -714,7 +715,7 @@ func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) er if i.fileState.hostMappable != nil { return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts) } - return syserror.ENODEV + return linuxerr.ENODEV } func init() { diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index e2af1d2ae..19f91f010 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -112,13 +112,6 @@ func (i *inodeFileState) loadLoading(_ struct{}) { // +checklocks:i.loading func (i *inodeFileState) afterLoad() { load := func() (err error) { - // See comment on i.loading(). - defer func() { - if err == nil { - i.loading.Unlock() - } - }() - // Manually restore the p9.File. name, ok := i.s.inodeMappings[i.sattr.InodeID] if !ok { @@ -167,6 +160,9 @@ func (i *inodeFileState) afterLoad() { i.savedUAttr = nil } + // See comment on i.loading(). This only unlocks on the + // non-error path. + i.loading.Unlock() // +checklocksforce: per comment. return nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 1a6f353d0..88d83060c 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -44,10 +44,11 @@ func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode { // policy. 
func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } - cp := i.session().cachePolicy + s := i.session() + cp := s.cachePolicy if cp.cacheReaddir() { // Check to see if we have readdirCache that indicates the // child does not exist. Avoid holding readdirMu longer than @@ -78,7 +79,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, err } - if i.session().overrides != nil { + if s.overrides != nil { // Check if file belongs to a internal named pipe. Note that it doesn't need // to check for sockets because it's done in newInodeOperations below. deviceKey := device.MultiDeviceKey{ @@ -86,13 +87,13 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string SecondaryDevice: i.session().connID, Inode: qids[0].Path, } - unlock := i.session().overrides.lock() - if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil { - unlock() + s.overrides.lock() + if pipeInode := s.overrides.getPipe(deviceKey); pipeInode != nil { + s.overrides.unlock() pipeInode.IncRef() return fs.NewDirent(ctx, pipeInode, name), nil } - unlock() + s.overrides.unlock() } // Construct the Inode operations. @@ -107,7 +108,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // Ownership is currently ignored. func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } // Create replaces the directory fid with the newly created/opened @@ -196,7 +197,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // CreateLink uses Create to create a symlink between oldname and newname. 
func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { if len(newname) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } owner := fs.FileOwnerFromContext(ctx) @@ -210,29 +211,32 @@ func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname // CreateHardLink implements InodeOperations.CreateHardLink. func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, target *fs.Inode, newName string) error { if len(newName) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } targetOpts, ok := target.InodeOperations.(*inodeOperations) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(inode) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { // Increase link count. targetOpts.cachingInodeOps.IncLinks(ctx) } + i.touchModificationAndStatusChangeTime(ctx, inode) return nil } // CreateDirectory uses Create to create a directory named s under inodeOperations. 
-func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { - if len(s) > maxFilenameLen { - return syserror.ENAMETOOLONG +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { + return linuxerr.ENAMETOOLONG } // If the parent directory has setgid enabled, change the new directory's @@ -247,16 +251,18 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s perm.SetGID = true } - if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + if _, err := i.fileState.file.mkdir(ctx, name, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Increase link count. // // N.B. This will update the modification time. i.cachingInodeOps.IncLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -266,16 +272,17 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s // Bind implements InodeOperations.Bind. func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } - if i.session().overrides == nil { - return nil, syserror.EOPNOTSUPP + s := i.session() + if s.overrides == nil { + return nil, linuxerr.EOPNOTSUPP } // Stabilize the override map while creation is in progress. 
- unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket) if err != nil { @@ -284,22 +291,23 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) + s.overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) return childDir, nil } // CreateFifo implements fs.InodeOperations.CreateFifo. func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } owner := fs.FileOwnerFromContext(ctx) mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe // N.B. FIFOs use major/minor numbers 0. + s := i.session() if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - if i.session().overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { + if s.overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { return err } // If gofer doesn't support mknod, check if we can create an internal fifo. @@ -311,13 +319,14 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st } func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error { - if i.session().overrides == nil { - return syserror.EPERM + s := i.session() + if s.overrides == nil { + return linuxerr.EPERM } // Stabilize the override map while creation is in progress. 
- unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe) if err != nil { @@ -336,7 +345,7 @@ func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode) + s.overrides.addPipe(fileOps.fileState.key, childDir, inode) return nil } @@ -383,11 +392,12 @@ func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, // Remove implements InodeOperations.Remove. func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } + s := i.session() var key *device.MultiDeviceKey - if i.session().overrides != nil { + if s.overrides != nil { // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { @@ -402,8 +412,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string } // Stabilize the override map while deletion is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() } } } @@ -412,7 +422,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(ctx, *key) + s.overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) @@ -422,18 +432,20 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Remove implements InodeOperations.RemoveDirectory. 
func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } // 0x200 = AT_REMOVEDIR. if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Decrease link count and updates atime. i.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -443,12 +455,12 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na // Rename renames this node. func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { if len(newName) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } // Don't allow renames across different mounts. if newParent.MountSource != oldParent.MountSource { - return syserror.EXDEV + return linuxerr.EXDEV } // Unwrap the new parent to a *inodeOperations. @@ -463,12 +475,13 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Is the renamed entity a directory? Fix link counts. + s := i.session() if fs.IsDir(i.fileState.sattr) { // Update cached state. - if i.session().cachePolicy.cacheUAttrs(oldParent) { + if s.cachePolicy.cacheUAttrs(oldParent) { oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheUAttrs(newParent) { + if s.cachePolicy.cacheUAttrs(newParent) { // Only IncLinks if there is a new addition to // newParent. If this is replacement, then the total // count remains the same. 
@@ -477,7 +490,7 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } } } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Mark old directory dirty. oldParentInodeOperations.markDirectoryDirty() if oldParent != newParent { @@ -487,17 +500,18 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Rename always updates ctime. - if i.session().cachePolicy.cacheUAttrs(inode) { + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchStatusChangeTime(ctx) } return nil } func (i *inodeOperations) touchModificationAndStatusChangeTime(ctx context.Context, inode *fs.Inode) { - if i.session().cachePolicy.cacheUAttrs(inode) { + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchModificationAndStatusChangeTime(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7cf3522ff..b7debeecb 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -98,9 +98,14 @@ func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *overrideMaps) lock() func() { +// +checklocksacquire:e.mu +func (e *overrideMaps) lock() { e.mu.Lock() - return func() { e.mu.Unlock() } +} + +// +checklocksrelease:e.mu +func (e *overrideMaps) unlock() { + e.mu.Unlock() } // getBoundEndpoint returns the bound endpoint mapped to the given key. @@ -366,8 +371,8 @@ func newOverrideMaps() *overrideMaps { // fillKeyMap populates key and dirent maps upon restore from saved pathmap. 
func (s *session) fillKeyMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for ep, dirPath := range s.overrides.pathMap { _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) @@ -394,8 +399,8 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. func (s *session) fillPathMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 8a1c69ac2..1fd8a0910 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -32,10 +32,11 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return nil } - if i.session().overrides != nil { - unlock := i.session().overrides.lock() - defer unlock() - ep := i.session().overrides.getBoundEndpoint(i.fileState.key) + s := i.session() + if s.overrides != nil { + s.overrides.lock() + defer s.overrides.unlock() + ep := s.overrides.getBoundEndpoint(i.fileState.key) if ep != nil { return ep } diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 07bd078b7..77c08a7ce 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -19,6 +19,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" @@ -268,7 +269,7 @@ func (f *fileOperations) Flush(context.Context, *fs.File) error { // ConfigureMMap implements fs.FileOperations.ConfigureMMap. 
func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { if !canMap(file.Dirent.Inode) { - return syserror.ENODEV + return linuxerr.ENODEV } return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts) } diff --git a/pkg/sentry/fs/host/host.go b/pkg/sentry/fs/host/host.go index 081ba1dd8..9f6dbd7e9 100644 --- a/pkg/sentry/fs/host/host.go +++ b/pkg/sentry/fs/host/host.go @@ -17,8 +17,8 @@ package host import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) // filesystem is a host filesystem. @@ -40,7 +40,7 @@ func (*filesystem) Name() string { // Mount returns an error. Mounting hostfs is not allowed. func (*filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, dataObj interface{}) (*fs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // AllowUserMount prohibits users from using mount(2) with this file system. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index e299b532c..5f6af2067 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -17,6 +17,7 @@ package host import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" @@ -113,7 +114,7 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa return nil } if mask.UID || mask.GID { - return syserror.EPERM + return linuxerr.EPERM } if mask.Perms { if err := unix.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil { @@ -224,48 +225,48 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // Create implements fs.InodeOperations.Create. 
func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // CreateDirectory implements fs.InodeOperations.CreateDirectory. func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syserror.EPERM + return linuxerr.EPERM } // CreateLink implements fs.InodeOperations.CreateLink. func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { - return syserror.EPERM + return linuxerr.EPERM } // CreateHardLink implements fs.InodeOperations.CreateHardLink. func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { - return syserror.EPERM + return linuxerr.EPERM } // CreateFifo implements fs.InodeOperations.CreateFifo. func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.EPERM + return linuxerr.EPERM } // Remove implements fs.InodeOperations.Remove. func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // Rename implements fs.InodeOperations.Rename. func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { - return syserror.EPERM + return linuxerr.EPERM } // Bind implements fs.InodeOperations.Bind. 
func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } // BoundEndpoint implements fs.InodeOperations.BoundEndpoint. @@ -276,7 +277,7 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. // GetFile implements fs.InodeOperations.GetFile. func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { if fs.IsSocket(d.Inode.StableAttr) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } return newFile(ctx, d, flags, i), nil @@ -313,7 +314,7 @@ func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermM // SetOwner implements fs.InodeOperations.SetOwner. func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { - return syserror.EPERM + return linuxerr.EPERM } // SetPermissions implements fs.InodeOperations.SetPermissions. @@ -392,7 +393,7 @@ func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string // Getlink implements fs.InodeOperations.Getlink. 
func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { if !fs.IsSymlink(i.fileState.sattr) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } return nil, fs.ErrResolveViaReadlink } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 225244868..54c421775 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -32,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/waiter" @@ -212,7 +211,7 @@ func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMess if n < totalLen && err == nil { // The host only returns a short write if it would otherwise // block (and only for stream sockets). - err = syserror.EAGAIN + err = linuxerr.EAGAIN } if n > 0 && !linuxerr.Equals(linuxerr.EAGAIN, err) { // The caller may need to block to send more data, but diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index fd48aff11..d98e3c6d1 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -16,8 +16,8 @@ package host import ( "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostfd" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -66,9 +66,9 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec if length > maxlen { if truncate { stopLen = maxlen - err = syserror.EAGAIN + err = linuxerr.EAGAIN } else { - return 0, nil, nil, syserror.EMSGSIZE + return 0, nil, nil, linuxerr.EMSGSIZE } } diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 77613bfd5..6f38b25c3 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -127,7 +127,7 @@ func (t *TTYFileOperations) Release(ctx context.Context) { func (t 
*TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // Ignore arg[0]. This is the real FD: @@ -168,7 +168,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO pidns := kernel.PIDNamespaceFromContext(ctx) if pidns == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } t.mu.Lock() @@ -193,7 +193,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from // tty_check_change() to -ENOTTY. if linuxerr.Equals(linuxerr.EIO, err) { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } return 0, err } @@ -201,7 +201,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // Check that calling task's process group is in the TTY // session. if task.ThreadGroup().Session() != t.session { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } var pgIDP primitive.Int32 @@ -212,19 +212,19 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // pgID must be non-negative. if pgID < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Process group with pgID must exist in this PID namespace. pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } // Check that new process group is in the TTY session. 
if pg.Session() != t.session { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } t.fgProcessGroup = pg @@ -284,7 +284,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO unimpl.EmitUnimplementedEvent(ctx) fallthrough default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fs/host/util_amd64_unsafe.go b/pkg/sentry/fs/host/util_amd64_unsafe.go index 21782f1da..e90629f4e 100644 --- a/pkg/sentry/fs/host/util_amd64_unsafe.go +++ b/pkg/sentry/fs/host/util_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package host diff --git a/pkg/sentry/fs/host/util_arm64_unsafe.go b/pkg/sentry/fs/host/util_arm64_unsafe.go index ed8f5242a..9fbb93726 100644 --- a/pkg/sentry/fs/host/util_arm64_unsafe.go +++ b/pkg/sentry/fs/host/util_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package host diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 41a3c2047..ec204e5cf 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -17,6 +17,7 @@ package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" @@ -298,7 +299,7 @@ func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error { func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { // First check the outer-most mounted filesystem. 
if p.Write && i.MountSource.Flags.ReadOnly { - return syserror.EROFS + return linuxerr.EROFS } if i.overlay != nil { @@ -312,7 +313,7 @@ func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { // we should not attempt to modify the writable layer if it // is mounted read-only. if p.Write && overlayUpperMountSource(i.MountSource).Flags.ReadOnly { - return syserror.EROFS + return linuxerr.EROFS } } @@ -324,7 +325,7 @@ func (i *Inode) check(ctx context.Context, p PermMask) error { return overlayCheck(ctx, i.overlay, p) } if !i.InodeOperations.Check(ctx, i, p) { - return syserror.EACCES + return linuxerr.EACCES } return nil } diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 2bbfb72ef..98e9fb2b1 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -283,7 +283,7 @@ type InodeOperations interface { // // Any error returned from Getlink other than ErrResolveViaReadlink // indicates the caller's inability to traverse this Inode as a link - // (e.g. syserror.ENOLINK indicates that the Inode is not a link, + // (e.g. linuxerr.ENOLINK indicates that the Inode is not a link, // syscall.EPERM indicates that traversing the link is not allowed, etc). Getlink(context.Context, *Inode) (*Dirent, error) diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index bd1125dcc..c47b9ce58 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -344,7 +344,7 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child * return err } if ser.Written() != 0 { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } if child.Inode.overlay.upper != nil { @@ -375,7 +375,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // Maybe some day we can allow the more complicated case of // non-overlay X overlay renames, but that's not necessary right now. 
if renamed.Inode.overlay == nil || newParent.Inode.overlay == nil || oldParent.Inode.overlay == nil { - return syserror.EXDEV + return linuxerr.EXDEV } if replacement { @@ -421,7 +421,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // need to bother checking for them. if len(children) > 0 { replaced.DecRef(ctx) - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } @@ -553,7 +553,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin // Don't forward the value of the extended attribute if it would // unexpectedly change the behavior of a wrapping overlay layer. if isXattrOverlay(name) { - return "", syserror.ENODATA + return "", linuxerr.ENODATA } o.copyMu.RLock() @@ -569,7 +569,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { // Don't allow changes to overlay xattrs through a setxattr syscall. if isXattrOverlay(name) { - return syserror.EPERM + return linuxerr.EPERM } if err := copyUp(ctx, d); err != nil { @@ -601,7 +601,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error { // Don't allow changes to overlay xattrs through a removexattr syscall. if isXattrOverlay(name) { - return syserror.EPERM + return linuxerr.EPERM } if err := copyUp(ctx, d); err != nil { @@ -688,7 +688,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { dirent.DecRef(ctx) // Claim that the path is not accessible. 
- err = syserror.EACCES + err = linuxerr.EACCES log.Warningf("Getlink not supported in overlay for %q", name) } return nil, err diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index cc5ffa6f1..a3800d700 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" - "gvisor.dev/gvisor/pkg/syserror" ) func TestLookup(t *testing.T) { @@ -390,7 +389,7 @@ func (d *dir) GetXattr(_ context.Context, _ *fs.Inode, name string, _ uint64) (s return "y", nil } } - return "", syserror.ENOATTR + return "", linuxerr.ENOATTR } // GetFile implements InodeOperations.GetFile. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 1b83643db..ee28b0f99 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -116,23 +117,23 @@ func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { // Seek implements FileOperations.Seek. func (*Inotify) Seek(context.Context, *File, SeekWhence, int64) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Readdir implements FileOperatons.Readdir. func (*Inotify) Readdir(context.Context, *File, DentrySerializer) (int64, error) { - return 0, syserror.ENOTDIR + return 0, linuxerr.ENOTDIR } // Write implements FileOperations.Write. func (*Inotify) Write(context.Context, *File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Read implements FileOperations.Read. 
func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ int64) (int64, error) { if dst.NumBytes() < inotifyEventBaseSize { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } i.evMu.Lock() @@ -156,7 +157,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i // write some events out. return writeLen, nil } - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Linux always dequeues an available event as long as there's enough @@ -183,7 +184,7 @@ func (*Inotify) WriteTo(context.Context, *File, io.Writer, int64, bool) (int64, // Fsync implements FileOperations.Fsync. func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { - return syserror.EINVAL + return linuxerr.EINVAL } // ReadFrom implements FileOperations.ReadFrom. @@ -198,7 +199,7 @@ func (*Inotify) Flush(context.Context, *File) error { // ConfigureMMap implements FileOperations.ConfigureMMap. func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error { - return syserror.ENODEV + return linuxerr.ENODEV } // UnstableAttr implements FileOperations.UnstableAttr. @@ -222,7 +223,7 @@ func (i *Inotify) Ioctl(ctx context.Context, _ *File, io usermem.IO, args arch.S return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } @@ -329,7 +330,7 @@ func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { watch, ok := i.watches[wd] if !ok { i.mu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } // Remove the watch from this instance. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 1d6ea5736..fba7b961b 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -16,7 +16,7 @@ package fs import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // MockInodeOperations implements InodeOperations for testing Inodes. 
@@ -109,7 +109,7 @@ func (n *MockInodeOperations) SetPermissions(context.Context, *Inode, FilePermis // SetOwner implements fs.InodeOperations.SetOwner. func (*MockInodeOperations) SetOwner(context.Context, *Inode, FileOwner) error { - return syserror.EINVAL + return linuxerr.EINVAL } // SetTimestamps implements fs.InodeOperations.SetTimestamps. @@ -172,5 +172,5 @@ func (n *MockInodeOperations) RemoveDirectory(context.Context, *Inode, string) e // Getlink implements fs.InodeOperations.Getlink. func (n *MockInodeOperations) Getlink(context.Context, *Inode) (*Dirent, error) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 243098a09..10146af4e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -20,10 +20,10 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // DefaultTraversalLimit provides a sensible default traversal limit that may @@ -281,7 +281,7 @@ func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error // Linux allows mounting over the root (?). It comes with a strange set // of semantics. We'll just not do this for now. if node.parent == nil { - return syserror.EBUSY + return linuxerr.EBUSY } // For both mount and unmount, we take this lock so we can swap out the @@ -357,7 +357,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly orig, ok := mns.mounts[node] if !ok { // node is not a mount point. 
- return syserror.EINVAL + return linuxerr.EINVAL } if orig.previous == nil { @@ -380,7 +380,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly if refs := m.DirentRefs(); refs < 2 { panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs)) } else if refs != 2 { - return syserror.EBUSY + return linuxerr.EBUSY } } @@ -497,7 +497,7 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path if current != root { if !IsDir(current.Inode.StableAttr) { current.DecRef(ctx) // Drop reference from above. - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { current.DecRef(ctx) // Drop reference from above. @@ -566,8 +566,8 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema // Resolve the path. target, err := node.Inode.Getlink(ctx) - switch err { - case nil: + switch { + case err == nil: // Make sure we didn't exhaust the traversal budget. if *remainingTraversals == 0 { target.DecRef(ctx) @@ -577,11 +577,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema node.DecRef(ctx) // Drop the original reference. return target, nil - case unix.ENOLINK: + case linuxerr.Equals(linuxerr.ENOLINK, err): // Not a symlink. return node, nil - case ErrResolveViaReadlink: + case err == ErrResolveViaReadlink: defer node.DecRef(ctx) // See above. // First, check if we should traverse. 
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index f96f5a3e5..7e72e47b5 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -19,11 +19,11 @@ import ( "strings" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // The virtual filesystem implements an overlay configuration. For a high-level @@ -218,7 +218,7 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist // We don't support copying up from character devices, // named pipes, or anything weird (like proc files). log.Warningf("%s not supported in lower filesytem", lower.StableAttr.Type) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } return &overlayEntry{ diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 24426b225..379429ab2 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -104,7 +104,7 @@ var _ fs.FileOperations = (*execArgFile)(nil) // Read reads the exec arg from the process's address space.. 
func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } m, err := getTaskMM(f.t) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 2f2a9f920..546b57287 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -21,6 +21,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" @@ -130,7 +131,7 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // threadSelf is more magical than "self" link. @@ -154,7 +155,7 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err } // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // Lookup loads an Inode at name into a Dirent. 
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 713b81e08..90bd32345 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -9,13 +9,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", "//pkg/sentry/kernel/time", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index b01688b1d..77270814e 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -20,13 +20,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -204,7 +204,7 @@ var _ fs.FileOperations = (*seqFileOperations)(nil) // Write implements fs.FileOperations.Write. func (*seqFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EACCES + return 0, linuxerr.EACCES } // Read implements fs.FileOperations.Read. 
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 4893af56b..71f37d582 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -592,7 +592,7 @@ func (pf *portRangeFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSe // Port numbers must be uint16s. if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if err := pf.inode.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ae5ed25f9..edd62b857 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -46,7 +47,7 @@ import ( // no longer in use. 
func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { if t.ExitState() == kernel.TaskExitDead { - return nil, syserror.ESRCH + return nil, linuxerr.ESRCH } var m *mm.MemoryManager t.WithMuLocked(func(t *kernel.Task) { @@ -61,9 +62,9 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { func checkTaskState(t *kernel.Task) error { switch t.ExitState() { case kernel.TaskExitZombie: - return syserror.EACCES + return linuxerr.EACCES case kernel.TaskExitDead: - return syserror.ESRCH + return linuxerr.ESRCH } return nil } @@ -272,7 +273,7 @@ func (e *exe) executable() (file fsbridge.File, err error) { e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - err = syserror.EACCES + err = linuxerr.EACCES return } @@ -281,7 +282,7 @@ func (e *exe) executable() (file fsbridge.File, err error) { // (with locks held). file = mm.Executable() if file == nil { - err = syserror.ESRCH + err = linuxerr.ESRCH } }) return @@ -290,7 +291,7 @@ func (e *exe) executable() (file fsbridge.File, err error) { // Readlink implements fs.InodeOperations. func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if !kernel.ContextCanTrace(ctx, e.t, false) { - return "", syserror.EACCES + return "", linuxerr.EACCES } // Pull out the executable for /proc/TID/exe. @@ -323,7 +324,7 @@ func newCwd(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode // Readlink implements fs.InodeOperations. func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if !kernel.ContextCanTrace(ctx, e.t, false) { - return "", syserror.EACCES + return "", linuxerr.EACCES } if err := checkTaskState(e.t); err != nil { return "", err @@ -331,14 +332,14 @@ func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { cwd := e.t.FSContext().WorkingDirectory() if cwd == nil { // It could have raced with process deletion. 
- return "", syserror.ESRCH + return "", linuxerr.ESRCH } defer cwd.DecRef(ctx) root := fs.RootFromContext(ctx) if root == nil { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } defer root.DecRef(ctx) @@ -380,7 +381,7 @@ func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (strin // Getlink implements fs.InodeOperations.Getlink. func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { if !kernel.ContextCanTrace(ctx, n.t, false) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } if err := checkTaskState(n.t); err != nil { return nil, err @@ -448,7 +449,7 @@ func (m *memData) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH if !kernel.ContextCanTrace(ctx, m.t, true) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } if err := checkTaskState(m.t); err != nil { return nil, err @@ -473,7 +474,7 @@ func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen n, readErr := mm.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) if n > 0 { if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return int64(n), nil } @@ -867,7 +868,7 @@ var _ fs.FileOperations = (*commFile)(nil) // Read implements fs.FileOperations.Read. func (f *commFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } buf := []byte(f.t.Name() + "\n") @@ -922,7 +923,7 @@ type auxvecFile struct { // Read implements fs.FileOperations.Read. 
func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } m, err := getTaskMM(f.t) @@ -1003,7 +1004,7 @@ func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // Read implements fs.FileOperations.Read. func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if f.t.ExitState() == kernel.TaskExitDead { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } var buf bytes.Buffer fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj()) @@ -1030,7 +1031,7 @@ func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOS } if f.t.ExitState() == kernel.TaskExitDead { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } if err := f.t.SetOOMScoreAdj(v); err != nil { return 0, err diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 30d5ad4cf..fcdc1e7bd 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -21,12 +21,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -108,7 +108,7 @@ const maxIDMapLines = 5 // Read implements fs.FileOperations.Read. func (imfo *idMapFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var entries []auth.IDMapEntry if imfo.iops.gids { @@ -134,7 +134,7 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u // the file ..." 
- user_namespaces(7) srclen := src.NumBytes() if srclen >= hostarch.PageSize || offset != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } b := make([]byte, srclen) if _, err := src.CopyIn(ctx, b); err != nil { @@ -154,7 +154,7 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u } lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) if len(lines) > maxIDMapLines { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries := make([]auth.IDMapEntry, len(lines)) @@ -162,7 +162,7 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u var e auth.IDMapEntry _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) if err != nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries[i] = e } diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index c0f6fb802..ac896f963 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -74,7 +74,7 @@ type uptimeFile struct { // Read implements fs.FileOperations.Read. 
func (f *uptimeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } now := ktime.NowFromContext(ctx) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 4a3d9636b..b46567cf8 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -14,6 +14,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 19990f9db..33023af77 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -178,7 +179,7 @@ func (d *Dir) Children() ([]string, map[string]fs.DentAttr) { func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) { inode, ok := d.children[name] if !ok { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } delete(d.children, name) @@ -208,7 +209,7 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er // Remove removes the named non-directory. func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -226,7 +227,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { // RemoveDirectory removes the named directory. 
func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) error { if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -240,7 +241,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err if ok, err := hasChildren(ctx, childInode); err != nil { return err } else if ok { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } // Child was empty. Proceed with removal. @@ -259,7 +260,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err // with a reference. func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) { if len(p) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -292,7 +293,7 @@ func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { // makeInodeOperations. It is the common logic for creating a new child. func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, makeInodeOperations func() (*fs.Inode, error)) (*fs.Inode, error) { if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -311,7 +312,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make // Create creates a new Inode with the given name and returns its File. func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { if d.CreateOps == nil || d.CreateOps.NewFile == nil { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { @@ -333,7 +334,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // CreateLink returns a new link. 
func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { if d.CreateOps == nil || d.CreateOps.NewSymlink == nil { - return syserror.EACCES + return linuxerr.EACCES } _, err := d.createInodeOperationsCommon(ctx, newname, func() (*fs.Inode, error) { return d.NewSymlink(ctx, dir, oldname) @@ -344,7 +345,7 @@ func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname st // CreateHardLink creates a new hard link. func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -362,7 +363,7 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod // CreateDirectory returns a new subdirectory. func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewDir == nil { - return syserror.EACCES + return linuxerr.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) @@ -373,7 +374,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p // Bind implements fs.InodeOperations.Bind. func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewBoundEndpoint(ctx, dir, ep, perms) @@ -392,7 +393,7 @@ func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport // CreateFifo implements fs.InodeOperations.CreateFifo. 
func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewFifo == nil { - return syserror.EACCES + return linuxerr.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewFifo(ctx, dir, perms) @@ -496,14 +497,14 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string, replacement bool) error { op, ok := oldParent.(*Dir) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } np, ok := newParent.(*Dir) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } if len(newName) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } np.mu.Lock() @@ -521,7 +522,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if ok, err := hasChildren(ctx, replaced); err != nil { return err } else if ok { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index d0c565879..dc9d27bb3 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -17,10 +17,10 @@ package ramfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -64,7 +64,7 @@ func (s *Socket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { // GetFile implements fs.FileOperations.GetFile. 
func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } // +stateify savable diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index ca9f645f6..fff4befb2 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -29,7 +29,7 @@ import ( func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, error) { // Verify basic file flag permissions. if !dst.Flags().Write || !src.Flags().Read { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Check whether or not the objects being sliced are stream-oriented diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index c7977a217..0148b33cf 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -8,6 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index c8ebe256c..093a14c1f 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -20,6 +20,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" @@ -121,7 +122,7 @@ func (t *TimerOperations) EventUnregister(e *waiter.Entry) { func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if val := atomic.SwapUint64(&t.val, 0); val != 0 { var buf [sizeofUint64]byte @@ -138,7 +139,7 @@ func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.I // Write implements fs.FileOperations.Write. 
func (t *TimerOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Notify implements ktime.TimerListener.Notify. diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 90398376a..511fffb43 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/safemem", "//pkg/sentry/device", @@ -30,7 +31,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 7faa822f0..1974523bf 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -31,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -150,7 +150,7 @@ func (*fileInodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldPare // GetFile implements fs.InodeOperations.GetFile. 
func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { if fs.IsSocket(d.Inode.StableAttr) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } if flags.Write { @@ -217,7 +217,7 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in fallthrough case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed f.dataMu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } if oldSize != size { @@ -278,7 +278,7 @@ func (f *fileInodeOperations) Allocate(ctx context.Context, _ *fs.Inode, offset, // Check if current seals allow growth. if f.seals&linux.F_SEAL_GROW != 0 { - return syserror.EPERM + return linuxerr.EPERM } f.attr.Size = newSize @@ -455,13 +455,13 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) if end == math.MaxInt64 { // Overflow. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check if seals prevent either file growth or all writes. switch { case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed - return 0, syserror.EPERM + return 0, linuxerr.EPERM case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would // normally grow the file to partially succeed up to the current EOF, @@ -482,7 +482,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } if end <= rw.offset { // Truncation would result in no data being written. - return 0, syserror.EPERM + return 0, linuxerr.EPERM } } @@ -550,7 +550,7 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS // Reject writable mapping if F_SEAL_WRITE is set. 
if f.seals&linux.F_SEAL_WRITE != 0 && writable { - return syserror.EPERM + return linuxerr.EPERM } f.mappings.AddMapping(ms, ar, offset, writable) @@ -655,7 +655,7 @@ func GetSeals(inode *fs.Inode) (uint32, error) { return f.seals, nil } // Not a memfd inode. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // AddSeals adds new file seals to a memfd inode. @@ -668,13 +668,13 @@ func AddSeals(inode *fs.Inode, val uint32) error { if f.seals&linux.F_SEAL_SEAL != 0 { // Seal applied which prevents addition of any new seals. - return syserror.EPERM + return linuxerr.EPERM } // F_SEAL_WRITE can only be added if there are no active writable maps. if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { if f.writableMappingPages > 0 { - return syserror.EBUSY + return linuxerr.EBUSY } } @@ -683,5 +683,5 @@ func AddSeals(inode *fs.Inode, val uint32) error { return nil } // Not a memfd inode. - return syserror.EINVAL + return linuxerr.EINVAL } diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 6aa8ff331..9a835b556 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) var fsInfo = fs.Info{ @@ -49,7 +49,7 @@ var fsInfo = fs.Info{ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { // Don't allow renames across different mounts. 
if newParent.MountSource != oldParent.MountSource { - return syserror.EXDEV + return linuxerr.EXDEV } op := oldParent.InodeOperations.(*Dir) diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 86ada820e..5933cb67b 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -17,6 +17,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal/primitive", "//pkg/refs", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 13c9dbe7d..3242dcb6a 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -170,54 +171,54 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str // // Creation is never allowed. func (d *dirInodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } // CreateDirectory implements fs.InodeOperations.CreateDirectory. // // Creation is never allowed. func (d *dirInodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syserror.EACCES + return linuxerr.EACCES } // CreateLink implements fs.InodeOperations.CreateLink. // // Creation is never allowed. func (d *dirInodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { - return syserror.EACCES + return linuxerr.EACCES } // CreateHardLink implements fs.InodeOperations.CreateHardLink. // // Creation is never allowed. 
func (d *dirInodeOperations) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { - return syserror.EACCES + return linuxerr.EACCES } // CreateFifo implements fs.InodeOperations.CreateFifo. // // Creation is never allowed. func (d *dirInodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syserror.EACCES + return linuxerr.EACCES } // Remove implements fs.InodeOperations.Remove. // // Removal is never allowed. func (d *dirInodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. // // Removal is never allowed. func (d *dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // Bind implements fs.InodeOperations.Bind. func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // GetFile implements fs.InodeOperations.GetFile. diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 13f4901db..0e5916380 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -16,9 +16,9 @@ package tty import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) // ptsDevice is the pseudo-filesystem device. @@ -64,7 +64,7 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou // No options are supported. 
if data != "" { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } return newDir(ctx, fs.NewMountSource(ctx, &superOperations{}, f, flags)), nil diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 1cf869b62..88d6703a8 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -17,13 +17,13 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -157,7 +157,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, file *fs.File, io use t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. 
- return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -201,7 +201,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, file *fs.File, io use return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fs/tty/replica.go b/pkg/sentry/fs/tty/replica.go index 0e3eea3bd..ca5bc7535 100644 --- a/pkg/sentry/fs/tty/replica.go +++ b/pkg/sentry/fs/tty/replica.go @@ -17,12 +17,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -142,7 +142,7 @@ func (sf *replicaFileOperations) Ioctl(ctx context.Context, file *fs.File, io us t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. 
- return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -179,7 +179,7 @@ func (sf *replicaFileOperations) Ioctl(ctx context.Context, file *fs.File, io us return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD index 6c798f0bd..4631db2bb 100644 --- a/pkg/sentry/fsbridge/BUILD +++ b/pkg/sentry/fsbridge/BUILD @@ -13,12 +13,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go index 9785fd62a..527bde181 100644 --- a/pkg/sentry/fsbridge/fs.go +++ b/pkg/sentry/fsbridge/fs.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -127,7 +127,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio defer d.DecRef(ctx) if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP } fsPerm := openOptionsToPermMask(&opts) @@ -138,13 +138,13 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio // If they claim it's a directory, then make sure. 
if strings.HasSuffix(path, "/") { if d.Inode.StableAttr.Type != fs.Directory { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } } if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile { ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags)) diff --git a/pkg/sentry/fsimpl/cgroupfs/BUILD b/pkg/sentry/fsimpl/cgroupfs/BUILD index 37efb641a..4c9c5b344 100644 --- a/pkg/sentry/fsimpl/cgroupfs/BUILD +++ b/pkg/sentry/fsimpl/cgroupfs/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/coverage", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go index fe9871bdd..4290ffe0d 100644 --- a/pkg/sentry/fsimpl/cgroupfs/base.go +++ b/pkg/sentry/fsimpl/cgroupfs/base.go @@ -23,10 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -253,7 +253,7 @@ func parseInt64FromString(ctx context.Context, src usermem.IOSequence, offset in // Note: This also handles zero-len writes if offset is beyond the end // of src, or src is empty. 
ctx.Warningf("cgroupfs.parseInt64FromString: failed to parse %q: %v", string(buf), err) - return 0, int64(n), syserror.EINVAL + return 0, int64(n), linuxerr.EINVAL } return val, int64(n), nil diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index 05d7eb4ce..24e28a51f 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -62,12 +62,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -167,7 +167,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } @@ -195,7 +195,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if _, ok := mopts["all"]; ok { if len(wantControllers) > 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "all") @@ -209,7 +209,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if len(mopts) != 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } k := kernel.KernelFromContext(ctx) @@ -294,7 +294,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with 
controllers %v: %v", wantControllers, err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) - return nil, nil, syserror.EBUSY + return nil, nil, linuxerr.EBUSY } // Move all existing tasks to the root of the new hierarchy. @@ -364,7 +364,7 @@ func (*dir) Keep() bool { // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // Open implements kernfs.Inode.Open. diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 6af3c3781..f981ff296 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -29,6 +29,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", @@ -59,5 +60,6 @@ go_test( "//pkg/abi/linux", "//pkg/sentry/contexttest", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index e75954105..7a488e9fd 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -56,7 +57,7 @@ func (*FilesystemType) Name() string { func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // No data allowed. 
if opts.Data != "" { - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fstype.initOnce.Do(func() { diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go index 448390cfe..1ef07d702 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_test.go +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) func TestSimpleMasterToReplica(t *testing.T) { @@ -54,3 +55,36 @@ func TestSimpleMasterToReplica(t *testing.T) { t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) } } + +type callback func(*waiter.Entry, waiter.EventMask) + +func (cb callback) Callback(entry *waiter.Entry, mask waiter.EventMask) { + cb(entry, mask) +} + +func TestEchoDeadlock(t *testing.T) { + ctx := contexttest.Context(t) + termios := linux.DefaultReplicaTermios + termios.LocalFlags |= linux.ECHO + ld := newLineDiscipline(termios) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + entry := &waiter.Entry{Callback: callback(func(*waiter.Entry, waiter.EventMask) { + ld.inputQueueRead(ctx, dst) + })} + ld.masterWaiter.EventRegister(entry, waiter.ReadableEvents) + defer ld.masterWaiter.EventUnregister(entry) + inBytes := []byte("hello, tty\n") + n, err := ld.inputQueueWrite(ctx, usermem.BytesIOSequence(inBytes)) + if err != nil { + t.Fatalf("inputQueueWrite: %v", err) + } + if int(n) != len(inBytes) { + t.Fatalf("read wrong length: got %d, want %d", n, len(inBytes)) + } + outStr := string(outBytes[:n]) + inStr := string(inBytes) + if outStr != inStr { + t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) + } +} diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index e94a5bac3..9cb21e83b 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ 
b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -70,6 +70,10 @@ const ( // +------------------------| output queue |<--------------------------+ // (outputQueueRead) +--------------+ (outputQueueWrite) // +// There is special handling for the ECHO option, where bytes written to the +// input queue are also output back to the terminal by being written to +// l.outQueue by the input queue transformer. +// // Lock order: // termiosMu // inQueue.mu @@ -126,7 +130,6 @@ func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArgument // setTermios sets a linux.Termios for the tty. func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() - defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios @@ -141,7 +144,10 @@ func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArgument l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() + l.termiosMu.Unlock() l.replicaWaiter.Notify(waiter.ReadableEvents) + } else { + l.termiosMu.Unlock() } return 0, err @@ -179,28 +185,37 @@ func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, pushed, err := l.inQueue.read(ctx, dst, l) + n, pushed, notifyEcho, err := l.inQueue.read(ctx, dst, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { - l.masterWaiter.Notify(waiter.WritableEvents) + if notifyEcho { + l.masterWaiter.Notify(waiter.ReadableEvents | waiter.WritableEvents) + } else { + l.masterWaiter.Notify(waiter.WritableEvents) + } if pushed { l.replicaWaiter.Notify(waiter.ReadableEvents) } return n, nil + } else if notifyEcho { + l.masterWaiter.Notify(waiter.ReadableEvents) } return 0, syserror.ErrWouldBlock } func (l 
*lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, err := l.inQueue.write(ctx, src, l) + n, notifyEcho, err := l.inQueue.write(ctx, src, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } + if notifyEcho { + l.masterWaiter.Notify(waiter.ReadableEvents) + } if n > 0 { l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil @@ -214,8 +229,9 @@ func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, pushed, err := l.outQueue.read(ctx, dst, l) + // Ignore notifyEcho, as it cannot happen when reading from the output queue. + n, pushed, _, err := l.outQueue.read(ctx, dst, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } @@ -231,8 +247,9 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, err := l.outQueue.write(ctx, src, l) + // Ignore notifyEcho, as it cannot happen when writing to the output queue. + n, _, err := l.outQueue.write(ctx, src, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } @@ -246,7 +263,8 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq // transformer is a helper interface to make it easier to stateify queue. type transformer interface { // transform functions require queue's mutex to be held. - transform(*lineDiscipline, *queue, []byte) int + // The boolean indicates whether there was any echoed bytes. + transform(*lineDiscipline, *queue, []byte) (int, bool) } // outputQueueTransformer implements transformer. 
It performs line discipline @@ -261,7 +279,7 @@ type outputQueueTransformer struct{} // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. -func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { +func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // transformOutput is effectively always in noncanonical mode, as the // master termios never has ICANON set. @@ -270,7 +288,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte if len(q.readBuf) > 0 { q.readable = true } - return len(buf) + return len(buf), false } var ret int @@ -321,7 +339,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte if len(q.readBuf) > 0 { q.readable = true } - return ret + return ret, false } // inputQueueTransformer implements transformer. It performs line discipline @@ -334,15 +352,17 @@ type inputQueueTransformer struct{} // transformed according to flags set in the termios struct. See // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. +// It returns an extra boolean indicating whether any characters need to be +// echoed, in which case we need to notify readers. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. -func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { +func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // If there's a line waiting to be read in canonical mode, don't write // anything else to the read buffer. 
if l.termios.LEnabled(linux.ICANON) && q.readable { - return 0 + return 0, false } maxBytes := nonCanonMaxBytes @@ -351,6 +371,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) } var ret int + var notifyEcho bool for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { size := l.peek(buf) cBytes := append([]byte{}, buf[:size]...) @@ -397,7 +418,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { l.outQueue.writeBytes(cBytes, l) - l.masterWaiter.Notify(waiter.ReadableEvents) + notifyEcho = true } // If we finish a line, make it available for reading. @@ -412,7 +433,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) q.readable = true } - return ret + return ret, notifyEcho } // shouldDiscard returns whether c should be discarded. In canonical mode, if diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 93c031c89..9a1a245dc 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -17,6 +17,7 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -80,7 +80,7 @@ func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs // SetStat implements kernfs.Inode.SetStat func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return 
syserror.EINVAL + return linuxerr.EINVAL } return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } @@ -132,7 +132,7 @@ func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -177,7 +177,7 @@ func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index 47b0f1599..ff1d89955 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -98,17 +98,19 @@ func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArg } -// read reads from q to userspace. It returns the number of bytes read as well -// as whether the read caused more readable data to become available (whether +// read reads from q to userspace. It returns: +// - The number of bytes read +// - Whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). +// - Whether any data was echoed back (need to notify readers). // // Preconditions: l.termiosMu must be held for reading. 
-func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, bool, error) { q.mu.Lock() defer q.mu.Unlock() if !q.readable { - return 0, false, syserror.ErrWouldBlock + return 0, false, false, syserror.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { @@ -131,19 +133,20 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl return n, nil })) if err != nil { - return 0, false, err + return 0, false, false, err } // Move data from the queue's wait buffer to its read buffer. - nPushed := q.pushWaitBufLocked(l) + nPushed, notifyEcho := q.pushWaitBufLocked(l) - return int64(n), nPushed > 0, nil + return int64(n), nPushed > 0, notifyEcho, nil } // write writes to q from userspace. +// The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. -func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { +func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() @@ -173,44 +176,49 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip return n, nil })) if err != nil { - return 0, err + return 0, false, err } // Push data from the wait to the read buffer. - q.pushWaitBufLocked(l) + _, notifyEcho := q.pushWaitBufLocked(l) - return n, nil + return n, notifyEcho, nil } // writeBytes writes to q from b. +// The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. -func (q *queue) writeBytes(b []byte, l *lineDiscipline) { +func (q *queue) writeBytes(b []byte, l *lineDiscipline) bool { q.mu.Lock() defer q.mu.Unlock() // Write to the wait buffer. 
q.waitBufAppend(b) - q.pushWaitBufLocked(l) + _, notifyEcho := q.pushWaitBufLocked(l) + return notifyEcho } // pushWaitBufLocked fills the queue's read buffer with data from the wait // buffer. +// The returned boolean indicates whether any data was echoed back. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be locked. -func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { +func (q *queue) pushWaitBufLocked(l *lineDiscipline) (int, bool) { if q.waitBufLen == 0 { - return 0 + return 0, false } // Move data from the wait to the read buffer. var total int var i int + var notifyEcho bool for i = 0; i < len(q.waitBuf); i++ { - n := q.transform(l, q, q.waitBuf[i]) + n, echo := q.transform(l, q, q.waitBuf[i]) total += n + notifyEcho = notifyEcho || echo if n != len(q.waitBuf[i]) { // The read buffer filled up without consuming the // entire buffer. @@ -223,7 +231,7 @@ func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { q.waitBuf = q.waitBuf[i:] q.waitBufLen -= uint64(total) - return total + return total, notifyEcho } // Precondition: q.mu must be locked. 
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go index 96d2054cb..e251897b4 100644 --- a/pkg/sentry/fsimpl/devpts/replica.go +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -17,13 +17,13 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -92,7 +92,7 @@ func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vf // SetStat implements kernfs.Inode.SetStat func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } @@ -141,7 +141,7 @@ func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, arg t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. 
- return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -179,7 +179,7 @@ func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, arg return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD deleted file mode 100644 index 1060b5301..000000000 --- a/pkg/sentry/fsimpl/ext/BUILD +++ /dev/null @@ -1,103 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "dirent_list", - out = "dirent_list.go", - package = "ext", - prefix = "dirent", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*dirent", - "Linker": "*dirent", - }, -) - -go_template_instance( - name = "fstree", - out = "fstree.go", - package = "ext", - prefix = "generic", - template = "//pkg/sentry/vfs/genericfstree:generic_fstree", - types = { - "Dentry": "dentry", - }, -) - -go_library( - name = "ext", - srcs = [ - "block_map_file.go", - "dentry.go", - "directory.go", - "dirent_list.go", - "ext.go", - "extent_file.go", - "file_description.go", - "filesystem.go", - "fstree.go", - "inode.go", - "regular_file.go", - "symlink.go", - "utils.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/fd", - "//pkg/fspath", - "//pkg/log", - "//pkg/marshal", - "//pkg/marshal/primitive", - "//pkg/safemem", - "//pkg/sentry/arch", - "//pkg/sentry/fs", - "//pkg/sentry/fs/lock", - "//pkg/sentry/fsimpl/ext/disklayout", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/memmap", - "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/syscalls/linux", - "//pkg/sentry/vfs", - "//pkg/sync", - "//pkg/syserror", - "//pkg/usermem", - "//pkg/waiter", - ], -) - -go_test( - name = 
"ext_test", - size = "small", - srcs = [ - "block_map_test.go", - "ext_test.go", - "extent_test.go", - ], - data = [ - "//pkg/sentry/fsimpl/ext:assets/bigfile.txt", - "//pkg/sentry/fsimpl/ext:assets/file.txt", - "//pkg/sentry/fsimpl/ext:assets/tiny.ext2", - "//pkg/sentry/fsimpl/ext:assets/tiny.ext3", - "//pkg/sentry/fsimpl/ext:assets/tiny.ext4", - ], - library = ":ext", - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/errors/linuxerr", - "//pkg/fspath", - "//pkg/marshal/primitive", - "//pkg/sentry/contexttest", - "//pkg/sentry/fsimpl/ext/disklayout", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/vfs", - "//pkg/test/testutil", - "//pkg/usermem", - "@com_github_google_go_cmp//cmp:go_default_library", - "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", - ], -) diff --git a/pkg/sentry/fsimpl/ext/README.md b/pkg/sentry/fsimpl/ext/README.md deleted file mode 100644 index af00cfda8..000000000 --- a/pkg/sentry/fsimpl/ext/README.md +++ /dev/null @@ -1,117 +0,0 @@ -## EXT(2/3/4) File System - -This is a filesystem driver which supports ext2, ext3 and ext4 filesystems. -Linux has specialized drivers for each variant but none which supports all. This -library takes advantage of ext's backward compatibility and understands the -internal organization of on-disk structures to support all variants. - -This driver implementation diverges from the Linux implementations in being more -forgiving about versioning. For instance, if a filesystem contains both extent -based inodes and classical block map based inodes, this driver will not complain -and interpret them both correctly. While in Linux this would be an issue. This -blurs the line between the three ext fs variants. - -Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has -been superseded by ext4 by large performance gains. Thus it is recommended to -upgrade older filesystem images to ext4 using e2fsprogs for better performance. 
- -### Read Only - -This driver currently only allows read only operations. A lot of the design -decisions are based on this feature. There are plans to implement write (the -process for which is documented in the future work section). - -### Performance - -One of the biggest wins about this driver is that it directly talks to the -underlying block device (or whatever persistent storage is being used), instead -of making expensive RPCs to a gofer. - -Another advantage is that ext fs supports fast concurrent reads. Currently the -device is represented using a `io.ReaderAt` which allows for concurrent reads. -All reads are directly passed to the device driver which intelligently serves -the read requests in the optimal order. There is no congestion due to locking -while reading in the filesystem level. - -Reads are optimized further in the way file data is transferred over to user -memory. Ext fs directly copies over file data from disk into user memory with no -additional allocations on the way. We can only get faster by preloading file -data into memory (see future work section). - -The internal structures used to represent files, inodes and file descriptors use -a lot of inheritance. With the level of indirection that an interface adds with -an internal pointer, it can quickly fragment a structure across memory. As this -runs along side a full blown kernel (which is memory intensive), having a -fragmented struct might hurt performance. Hence these internal structures, -though interfaced, are tightly packed in memory using the same inheritance -pattern that pkg/sentry/vfs uses. The pkg/sentry/fsimpl/ext/disklayout package -makes an execption to this pattern for reasons documented in the package. - -### Security - -This driver also intends to help sandbox the container better by reducing the -surface of the host kernel that the application touches. It prevents the -application from exploiting vulnerabilities in the host filesystem driver. 
All -`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` which are directly -passed to the device driver in the kernel. Hence this reduces the surface for -attack. - -The application can not affect any host filesystems other than the one passed -via block device by the user. - -### Future Work - -#### Write - -To support write operations we would need to modify the block device underneath. -Currently, the driver does not modify the device at all, not even for updating -the access times for reads. Modifying the filesystem incorrectly can corrupt it -and render it unreadable for other correct ext(x) drivers. Hence caution must be -maintained while modifying metadata structures. - -Ext4 specifically is built for performance and has added a lot of complexity as -to how metadata structures are modified. For instance, files that are organized -via an extent tree which must be balanced and file data blocks must be placed in -the same extent as much as possible to increase locality. Such properties must -be maintained while modifying the tree. - -Ext filesystems boast a lot about locality, which plays a big role in them being -performant. The block allocation algorithm in Linux does a good job in keeping -related data together. This behavior must be maintained as much as possible, -else we might end up degrading the filesystem performance over time. - -Ext4 also supports a wide variety of features which are specialized for varying -use cases. Implementing all of them can get difficult very quickly. - -Ext(x) checksums all its metadata structures to check for corruption, so -modification of any metadata struct must correspond with re-checksumming the -struct. Linux filesystem drivers also order on-disk updates intelligently to not -corrupt the filesystem and also remain performant. The in-memory metadata -structures must be kept in sync with what is on disk. - -There is also replication of some important structures across the filesystem. 
-All replicas must be updated when their original copy is updated. There is also -provisioning for snapshotting which must be kept in mind, although it should not -affect this implementation unless we allow users to create filesystem snapshots. - -Ext4 also introduced journaling (jbd2). The journal must be updated -appropriately. - -#### Performance - -To improve performance we should implement a buffer cache, and optionally, read -ahead for small files. While doing so we must also keep in mind the memory usage -and have a reasonable cap on how much file data we want to hold in memory. - -#### Features - -Our current implementation will work with most ext4 filesystems for readonly -purposed. However, the following features are not supported yet: - -- Journal -- Snapshotting -- Extended Attributes -- Hash Tree Directories -- Meta Block Groups -- Multiple Mount Protection -- Bigalloc diff --git a/pkg/sentry/fsimpl/ext/assets/README.md b/pkg/sentry/fsimpl/ext/assets/README.md deleted file mode 100644 index 6f1e81b3a..000000000 --- a/pkg/sentry/fsimpl/ext/assets/README.md +++ /dev/null @@ -1,36 +0,0 @@ -### Tiny Ext(2/3/4) Images - -The images are of size 64Kb which supports 64 1k blocks and 16 inodes. This is -the smallest size mkfs.ext(2/3/4) works with. - -These images were generated using the following commands. - -```bash -fallocate -l 64K tiny.ext$VERSION -mkfs.ext$VERSION -j tiny.ext$VERSION -``` - -where `VERSION` is `2`, `3` or `4`. - -You can mount it using: - -```bash -sudo mount -o loop tiny.ext$VERSION $MOUNTPOINT -``` - -`file.txt`, `bigfile.txt` and `symlink.txt` were added to this image by just -mounting it and copying (while preserving links) those files to the mountpoint -directory using: - -```bash -sudo cp -P {file.txt,symlink.txt,bigfile.txt} $MOUNTPOINT -``` - -The files in this directory mirror the contents and organisation of the files -stored in the image. 
- -You can umount the filesystem using: - -```bash -sudo umount $MOUNTPOINT -``` diff --git a/pkg/sentry/fsimpl/ext/assets/bigfile.txt b/pkg/sentry/fsimpl/ext/assets/bigfile.txt deleted file mode 100644 index 3857cf516..000000000 --- a/pkg/sentry/fsimpl/ext/assets/bigfile.txt +++ /dev/null @@ -1,41 +0,0 @@ -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus faucibus eleifend orci, ut ornare nibh faucibus eu. Cras at condimentum massa. Nullam luctus, elit non porttitor congue, sapien diam feugiat sapien, sed eleifend nulla mauris non arcu. Sed lacinia mauris magna, eu mollis libero varius sit amet. Donec mollis, quam convallis commodo posuere, dolor nisi placerat nisi, in faucibus augue mi eu lorem. In pharetra consectetur faucibus. Ut euismod ex efficitur egestas tincidunt. Maecenas condimentum ut ante in rutrum. Vivamus sed arcu tempor, faucibus turpis et, lacinia diam. - -Sed in lacus vel nisl interdum bibendum in sed justo. Nunc tellus risus, molestie vitae arcu sed, molestie tempus ligula. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc risus neque, volutpat et ante non, ullamcorper condimentum ante. Aliquam sed metus in urna condimentum convallis. Vivamus ut libero mauris. Proin mollis posuere consequat. Vestibulum placerat mollis est et pulvinar. - -Donec rutrum odio ac diam pharetra, id fermentum magna cursus. Pellentesque in dapibus elit, et condimentum orci. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Suspendisse euismod dapibus est, id vestibulum mauris. Nulla facilisi. Nulla cursus gravida nisi. Phasellus vestibulum rutrum lectus, a dignissim mauris hendrerit vitae. In at elementum mauris. Integer vel efficitur velit. Nullam fringilla sapien mi, quis luctus neque efficitur ac. Aenean nec quam dapibus nunc commodo pharetra. Proin sapien mi, fermentum aliquet vulputate non, aliquet porttitor diam. 
Quisque lacinia, urna et finibus fermentum, nunc lacus vehicula ex, sed congue metus lectus ac quam. Aliquam erat volutpat. Suspendisse sodales, dolor ut tincidunt finibus, augue erat varius tellus, a interdum erat sem at nunc. Vestibulum cursus iaculis sapien, vitae feugiat dui auctor quis. - -Pellentesque nec maximus nulla, eu blandit diam. Maecenas quis arcu ornare, congue ante at, vehicula ipsum. Praesent feugiat mauris rutrum sem fermentum, nec luctus ipsum placerat. Pellentesque placerat ipsum at dignissim fringilla. Vivamus et posuere sem, eget hendrerit felis. Aenean vulputate, augue vel mollis feugiat, justo ipsum mollis dolor, eu mollis elit neque ut ipsum. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce bibendum sem quam, vulputate laoreet mi dapibus imperdiet. Sed a purus non nibh pretium aliquet. Integer eget luctus augue, vitae tincidunt magna. Ut eros enim, egestas eu nulla et, lobortis egestas arcu. Cras id ipsum ac justo lacinia rutrum. Vivamus lectus leo, ultricies sed justo at, pellentesque feugiat magna. Ut sollicitudin neque elit, vel ornare mauris commodo id. - -Duis dapibus orci et sapien finibus finibus. Mauris eleifend, lacus at vestibulum maximus, quam ligula pharetra erat, sit amet dapibus neque elit vitae neque. In bibendum sollicitudin erat, eget ultricies tortor malesuada at. Sed sit amet orci turpis. Donec feugiat ligula nibh, molestie tincidunt lectus elementum id. Donec volutpat maximus nibh, in vulputate felis posuere eu. Cras tincidunt ullamcorper lacus. Phasellus porta lorem auctor, congue magna a, commodo elit. - -Etiam auctor mi quis elit sodales, eu pulvinar arcu condimentum. Aenean imperdiet risus et dapibus tincidunt. Nullam tincidunt dictum dui, sed commodo urna rutrum id. Ut mollis libero vel elit laoreet bibendum. Quisque arcu arcu, tincidunt at ultricies id, vulputate nec metus. In tristique posuere quam sit amet volutpat. Vivamus scelerisque et nunc at dapibus. 
Fusce finibus libero ut ligula pretium rhoncus. Mauris non elit in arcu finibus imperdiet. Pellentesque nec massa odio. Proin rutrum mauris non sagittis efficitur. Aliquam auctor quam at dignissim faucibus. Ut eget ligula in magna posuere ultricies vitae sit amet turpis. Duis maximus odio nulla. Donec gravida sem tristique tempus scelerisque. - -Interdum et malesuada fames ac ante ipsum primis in faucibus. Fusce pharetra magna vulputate aliquet tempus. Duis id hendrerit arcu. Quisque ut ex elit. Integer velit orci, venenatis ut sapien ac, placerat porttitor dui. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc hendrerit cursus diam, hendrerit finibus ipsum scelerisque ut. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. - -Nulla non euismod neque. Phasellus vel sapien eu metus pulvinar rhoncus. Suspendisse eu mollis tellus, quis vestibulum tortor. Maecenas interdum dolor sed nulla fermentum maximus. Donec imperdiet ullamcorper condimentum. Nam quis nibh ante. Praesent quis tellus ut tortor pulvinar blandit sit amet ut sapien. Vestibulum est orci, pellentesque vitae tristique sit amet, tristique non felis. - -Vivamus sodales pellentesque varius. Sed vel tempus ligula. Nulla tristique nisl vel dui facilisis, ac sodales augue hendrerit. Proin augue nisi, vestibulum quis augue nec, sagittis tincidunt velit. Vestibulum euismod, nulla nec sodales faucibus, urna sapien vulputate magna, id varius metus sapien ut neque. Duis in mollis urna, in scelerisque enim. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc condimentum dictum turpis, et egestas neque dapibus eget. Quisque fringilla, dui eu venenatis eleifend, erat nibh lacinia urna, at lacinia lacus sapien eu dui. Duis eu erat ut mi lacinia convallis a sed ex. - -Fusce elit metus, tincidunt nec eleifend a, hendrerit nec ligula. Duis placerat finibus sollicitudin. 
In euismod porta tellus, in luctus justo bibendum bibendum. Maecenas at magna eleifend lectus tincidunt suscipit ut a ligula. Nulla tempor accumsan felis, fermentum dapibus est eleifend vitae. Mauris urna sem, fringilla at ultricies non, ultrices in arcu. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam vehicula nunc at laoreet imperdiet. Nunc tristique ut risus id aliquet. Integer eleifend massa orci. - -Vestibulum sed ante sollicitudin nisi fringilla bibendum nec vel quam. Sed pretium augue eu ligula congue pulvinar. Donec vitae magna tincidunt, pharetra lacus id, convallis nulla. Cras viverra nisl nisl, varius convallis leo vulputate nec. Morbi at consequat dui, sed aliquet metus. Sed suscipit fermentum mollis. Maecenas nec mi sodales, tincidunt purus in, tristique mauris. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec interdum mi in velit efficitur, quis ultrices ex imperdiet. Sed vestibulum, magna ut tristique pretium, mi ipsum placerat tellus, non tempor enim augue et ex. Pellentesque eget felis quis ante sodales viverra ac sed lacus. Donec suscipit tempus massa, eget laoreet massa molestie at. - -Aenean fringilla dui non aliquet consectetur. Fusce cursus quam nec orci hendrerit faucibus. Donec consequat suscipit enim, non volutpat lectus auctor interdum. Proin lorem purus, maximus vel orci vitae, suscipit egestas turpis. Donec risus urna, congue a sem eu, aliquet placerat odio. Morbi gravida tristique turpis, quis efficitur enim. Nunc interdum gravida ipsum vel facilisis. Nunc congue finibus sollicitudin. Quisque euismod aliquet lectus et tincidunt. Curabitur ultrices sem ut mi fringilla fermentum. Morbi pretium, nisi sit amet dapibus congue, dolor enim consectetur risus, a interdum ligula odio sed odio. Quisque facilisis, mi at suscipit gravida, nunc sapien cursus justo, ut luctus odio nulla quis leo. 
Integer condimentum lobortis mauris, non egestas tellus lobortis sit amet. - -In sollicitudin velit ac ante vehicula, vitae varius tortor mollis. In hac habitasse platea dictumst. Quisque et orci lorem. Integer malesuada fringilla luctus. Pellentesque malesuada, mi non lobortis porttitor, ante ligula vulputate ante, nec dictum risus eros sit amet sapien. Nulla aliquam lorem libero, ac varius nulla tristique eget. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut pellentesque mauris orci, vel consequat mi varius a. Ut sit amet elit vulputate, lacinia metus non, fermentum nisl. Pellentesque eu nisi sed quam egestas blandit. Duis sit amet lobortis dolor. Donec consectetur sem interdum, tristique elit sit amet, sodales lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Fusce id aliquam augue. Sed pretium congue risus vitae lacinia. Vestibulum non vulputate risus, ut malesuada justo. - -Sed odio elit, consectetur ac mauris quis, consequat commodo libero. Fusce sodales velit vulputate pulvinar fermentum. Donec iaculis nec nisl eget faucibus. Mauris at dictum velit. Donec fermentum lectus eu viverra volutpat. Aliquam consequat facilisis lorem, cursus consequat dui bibendum ullamcorper. Pellentesque nulla magna, imperdiet at magna et, cursus egestas enim. Nullam semper molestie lectus sit amet semper. Duis eget tincidunt est. Integer id neque risus. Integer ultricies hendrerit vestibulum. Donec blandit blandit sagittis. Nunc consectetur vitae nisi consectetur volutpat. - -Nulla id lorem fermentum, efficitur magna a, hendrerit dui. Vivamus sagittis orci gravida, bibendum quam eget, molestie est. Phasellus nec enim tincidunt, volutpat sapien non, laoreet diam. Nulla posuere enim nec porttitor lobortis. Donec auctor odio ut orci eleifend, ut eleifend purus convallis. Interdum et malesuada fames ac ante ipsum primis in faucibus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Ut hendrerit, purus eget viverra tincidunt, sem magna imperdiet libero, et aliquam turpis neque vitae elit. Maecenas semper varius iaculis. Cras non lorem quis quam bibendum eleifend in et libero. Curabitur at purus mauris. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus porta diam sed elit eleifend gravida. - -Nulla facilisi. Ut ultricies diam vel diam consectetur, vel porta augue molestie. Fusce interdum sapien et metus facilisis pellentesque. Nulla convallis sem at nunc vehicula facilisis. Nam ac rutrum purus. Nunc bibendum, dolor sit amet tempus ullamcorper, lorem leo tempor sem, id fringilla nunc augue scelerisque augue. Nullam sit amet rutrum nisl. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Donec sed mauris gravida eros vehicula sagittis at eget orci. Cras elementum, eros at accumsan bibendum, libero neque blandit purus, vitae vestibulum libero massa ac nibh. Integer at placerat nulla. Mauris eu eleifend orci. Aliquam consequat ligula vitae erat porta lobortis. Duis fermentum elit ac aliquet ornare. - -Mauris eget cursus tellus, eget sodales purus. Aliquam malesuada, augue id vulputate finibus, nisi ex bibendum nisl, sit amet laoreet quam urna a dolor. Nullam ultricies, sapien eu laoreet consequat, erat eros dignissim diam, ultrices sodales lectus mauris et leo. Morbi lacinia eu ante at tempus. Sed iaculis finibus magna malesuada efficitur. Donec faucibus erat sit amet elementum feugiat. Praesent a placerat nisi. Etiam lacinia gravida diam, et sollicitudin sapien tincidunt ut. - -Maecenas felis quam, tincidunt vitae venenatis scelerisque, viverra vitae odio. Phasellus enim neque, ultricies suscipit malesuada sit amet, vehicula sit amet purus. Nulla placerat sit amet dui vel tincidunt. Nam quis neque vel magna commodo egestas. Vestibulum sagittis rutrum lorem ut congue. Maecenas vel ultrices tellus. 
Donec efficitur, urna ac consequat iaculis, lorem felis pharetra eros, eget faucibus orci lectus sit amet arcu. - -Ut a tempus nisi. Nulla facilisi. Praesent vulputate maximus mi et dapibus. Sed sit amet libero ac augue hendrerit efficitur in a sapien. Mauris placerat velit sit amet tellus sollicitudin faucibus. Donec egestas a magna ac suscipit. Duis enim sapien, mollis sed egestas et, vestibulum vel leo. - -Proin quis dapibus dui. Donec eu tincidunt nunc. Vivamus eget purus consectetur, maximus ante vitae, tincidunt elit. Aenean mattis dolor a gravida aliquam. Praesent quis tellus id sem maximus vulputate nec sed nulla. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur metus nulla, volutpat volutpat est eu, hendrerit congue erat. Aliquam sollicitudin augue ante. Sed sollicitudin, magna eu consequat elementum, mi augue ullamcorper felis, molestie imperdiet erat metus iaculis est. Proin ac tortor nisi. Pellentesque quis nisi risus. Integer enim sapien, tincidunt quis tortor id, accumsan venenatis mi. Nulla facilisi. - -Cras pretium sit amet quam congue maximus. Morbi lacus libero, imperdiet commodo massa sed, scelerisque placerat libero. Cras nisl nisi, consectetur sed bibendum eu, venenatis at enim. Proin sodales justo at quam aliquam, a consectetur mi ornare. Donec porta ac est sit amet efficitur. Suspendisse vestibulum tortor id neque imperdiet, id lacinia risus vehicula. Phasellus ac eleifend purus. Mauris vel gravida ante. Aliquam vitae lobortis risus. Sed vehicula consectetur tincidunt. Nam et justo vitae purus molestie consequat. Pellentesque ipsum ex, convallis quis blandit non, gravida et urna. Donec diam ligula amet. diff --git a/pkg/sentry/fsimpl/ext/assets/file.txt b/pkg/sentry/fsimpl/ext/assets/file.txt deleted file mode 100644 index 980a0d5f1..000000000 --- a/pkg/sentry/fsimpl/ext/assets/file.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! 
diff --git a/pkg/sentry/fsimpl/ext/assets/symlink.txt b/pkg/sentry/fsimpl/ext/assets/symlink.txt deleted file mode 120000 index 4c330738c..000000000 --- a/pkg/sentry/fsimpl/ext/assets/symlink.txt +++ /dev/null @@ -1 +0,0 @@ -file.txt
\ No newline at end of file diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext2 b/pkg/sentry/fsimpl/ext/assets/tiny.ext2 Binary files differdeleted file mode 100644 index 381ade9bf..000000000 --- a/pkg/sentry/fsimpl/ext/assets/tiny.ext2 +++ /dev/null diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext3 b/pkg/sentry/fsimpl/ext/assets/tiny.ext3 Binary files differdeleted file mode 100644 index 0e97a324c..000000000 --- a/pkg/sentry/fsimpl/ext/assets/tiny.ext3 +++ /dev/null diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext4 b/pkg/sentry/fsimpl/ext/assets/tiny.ext4 Binary files differdeleted file mode 100644 index a6859736d..000000000 --- a/pkg/sentry/fsimpl/ext/assets/tiny.ext4 +++ /dev/null diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD deleted file mode 100644 index 6c5a559fd..000000000 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tools:defs.bzl", "go_test") - -package(licenses = ["notice"]) - -go_test( - name = "benchmark_test", - size = "small", - srcs = ["benchmark_test.go"], - deps = [ - "//pkg/context", - "//pkg/fspath", - "//pkg/sentry/contexttest", - "//pkg/sentry/fsimpl/ext", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/vfs", - ], -) diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go deleted file mode 100644 index 2ee7cc7ac..000000000 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -// These benchmarks emulate memfs benchmarks. Ext4 images must be created -// before this benchmark is run using the `make_deep_ext4.sh` script at -// /tmp/image-{depth}.ext4 for all the depths tested below. -// -// The benchmark itself cannot run the script because the script requires -// sudo privileges to create the file system images. -package benchmark_test - -import ( - "fmt" - "os" - "runtime" - "strings" - "testing" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -var depths = []int{1, 2, 3, 8, 64, 100} - -const filename = "file.txt" - -// setUp opens imagePath as an ext Filesystem and returns all necessary -// elements required to run tests. If error is nil, it also returns a tear -// down function which must be called after the test is run for clean up. -func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { - f, err := os.Open(imagePath) - if err != nil { - return nil, nil, nil, nil, err - } - - ctx := contexttest.Context(b) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. 
- vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(ctx); err != nil { - return nil, nil, nil, nil, err - } - vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - InternalData: int(f.Fd()), - }, - }) - if err != nil { - f.Close() - return nil, nil, nil, nil, err - } - - root := mntns.Root() - root.IncRef() - - tearDown := func() { - root.DecRef(ctx) - - if err := f.Close(); err != nil { - b.Fatalf("tearDown failed: %v", err) - } - } - return ctx, vfsObj, &root, tearDown, nil -} - -// mount mounts extfs at the path operation passed. Returns a tear down -// function which must be called after the test is run for clean up. -func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vfs.PathOperation) func() { - b.Helper() - - f, err := os.Open(imagePath) - if err != nil { - b.Fatalf("could not open image at %s: %v", imagePath, err) - } - - ctx := contexttest.Context(b) - creds := auth.CredentialsFromContext(ctx) - - if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - InternalData: int(f.Fd()), - }, - }); err != nil { - b.Fatalf("failed to mount tmpfs submount: %v", err) - } - return func() { - if err := f.Close(); err != nil { - b.Fatalf("tearDown failed: %v", err) - } - } -} - -// BenchmarkVFS2Ext4fsStat emulates BenchmarkVFS2MemfsStat. 
-func BenchmarkVFS2Ext4fsStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", depth)) - if err != nil { - b.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - creds := auth.CredentialsFromContext(ctx) - var filePathBuilder strings.Builder - filePathBuilder.WriteByte('/') - for i := 1; i <= depth; i++ { - filePathBuilder.WriteString(fmt.Sprintf("%d", i)) - filePathBuilder.WriteByte('/') - } - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ - Root: *root, - Start: *root, - Path: fspath.Parse(filePath), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - // Sanity check. - if stat.Size > 0 { - b.Fatalf("got wrong file size (%d)", stat.Size) - } - } - }) - } -} - -// BenchmarkVFS2ExtfsMountStat emulates BenchmarkVFS2MemfsMountStat. -func BenchmarkVFS2ExtfsMountStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - // Create root extfs with depth 1 so we can mount extfs again at /1/. - ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", 1)) - if err != nil { - b.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - creds := auth.CredentialsFromContext(ctx) - mountPointName := "/1/" - pop := vfs.PathOperation{ - Root: *root, - Start: *root, - Path: fspath.Parse(mountPointName), - } - - // Save the mount point for later use. - mountPoint, err := vfsfs.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - b.Fatalf("failed to walk to mount point: %v", err) - } - defer mountPoint.DecRef(ctx) - - // Create extfs submount. 
- mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) - defer mountTearDown() - - var filePathBuilder strings.Builder - filePathBuilder.WriteString(mountPointName) - for i := 1; i <= depth; i++ { - filePathBuilder.WriteString(fmt.Sprintf("%d", i)) - filePathBuilder.WriteByte('/') - } - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ - Root: *root, - Start: *root, - Path: fspath.Parse(filePath), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - // Sanity check. touch(1) always creates files of size 0 (empty). - if stat.Size > 0 { - b.Fatalf("got wrong file size (%d)", stat.Size) - } - } - }) - } -} diff --git a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh deleted file mode 100755 index d0910da1f..000000000 --- a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script creates an ext4 image with $1 depth of directories and a file in -# the inner most directory. The created file is at path /1/2/.../depth/file.txt. -# The ext4 image is written to $2. The image is temporarily mounted at -# /tmp/mountpoint. 
This script must be run with sudo privileges. - -# Usage: -# sudo bash make_deep_ext4.sh {depth} {output path} - -# Check positional arguments. -if [ "$#" -ne 2 ]; then - echo "Usage: sudo bash make_deep_ext4.sh {depth} {output path}" - exit 1 -fi - -# Make sure depth is a non-negative number. -if ! [[ "$1" =~ ^[0-9]+$ ]]; then - echo "Depth must be a non-negative number." - exit 1 -fi - -# Create a 1 MB filesystem image at the requested output path. -rm -f $2 -fallocate -l 1M $2 -if [ $? -ne 0 ]; then - echo "fallocate failed" - exit $? -fi - -# Convert that blank into an ext4 image. -mkfs.ext4 -j $2 -if [ $? -ne 0 ]; then - echo "mkfs.ext4 failed" - exit $? -fi - -# Mount the image. -MOUNTPOINT=/tmp/mountpoint -mkdir -p $MOUNTPOINT -mount -o loop $2 $MOUNTPOINT -if [ $? -ne 0 ]; then - echo "mount failed" - exit $? -fi - -# Create nested directories and the file. -if [ "$1" -eq 0 ]; then - FILEPATH=$MOUNTPOINT/file.txt -else - FILEPATH=$MOUNTPOINT/$(seq -s '/' 1 $1)/file.txt -fi -mkdir -p $(dirname $FILEPATH) || exit -touch $FILEPATH - -# Clean up. -umount $MOUNTPOINT -rm -rf $MOUNTPOINT diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go deleted file mode 100644 index 1165234f9..000000000 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package ext - -import ( - "io" - "math" - - "gvisor.dev/gvisor/pkg/marshal/primitive" - "gvisor.dev/gvisor/pkg/syserror" -) - -const ( - // numDirectBlks is the number of direct blocks in ext block map inodes. - numDirectBlks = 12 -) - -// blockMapFile is a type of regular file which uses direct/indirect block -// addressing to store file data. This was deprecated in ext4. -type blockMapFile struct { - regFile regularFile - - // directBlks are the direct blocks numbers. The physical blocks pointed by - // these holds file data. Contains file blocks 0 to 11. - directBlks [numDirectBlks]primitive.Uint32 - - // indirectBlk is the physical block which contains (blkSize/4) direct block - // numbers (as uint32 integers). - indirectBlk primitive.Uint32 - - // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect - // block numbers (as uint32 integers). - doubleIndirectBlk primitive.Uint32 - - // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly - // indirect block numbers (as uint32 integers). - tripleIndirectBlk primitive.Uint32 - - // coverage at (i)th index indicates the amount of file data a node at - // height (i) covers. Height 0 is the direct block. - coverage [4]uint64 -} - -// Compiles only if blockMapFile implements io.ReaderAt. -var _ io.ReaderAt = (*blockMapFile)(nil) - -// newBlockMapFile is the blockMapFile constructor. It initializes the file to -// physical blocks map with (at most) the first 12 (direct) blocks. 
-func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { - file := &blockMapFile{} - file.regFile.impl = file - file.regFile.inode.init(args, &file.regFile) - - for i := uint(0); i < 4; i++ { - file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i) - } - - blkMap := file.regFile.inode.diskInode.Data() - for i := 0; i < numDirectBlks; i++ { - file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4]) - } - file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4]) - file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4]) - file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4]) - return file, nil -} - -// ReadAt implements io.ReaderAt.ReadAt. -func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { - if len(dst) == 0 { - return 0, nil - } - - if off < 0 { - return 0, syserror.EINVAL - } - - offset := uint64(off) - size := f.regFile.inode.diskInode.Size() - if offset >= size { - return 0, io.EOF - } - - // dirBlksEnd is the file offset until which direct blocks cover file data. - // Direct blocks cover 0 <= file offset < dirBlksEnd. - dirBlksEnd := numDirectBlks * f.coverage[0] - - // indirBlkEnd is the file offset until which the indirect block covers file - // data. The indirect block covers dirBlksEnd <= file offset < indirBlkEnd. - indirBlkEnd := dirBlksEnd + f.coverage[1] - - // doubIndirBlkEnd is the file offset until which the double indirect block - // covers file data. The double indirect block covers the range - // indirBlkEnd <= file offset < doubIndirBlkEnd. - doubIndirBlkEnd := indirBlkEnd + f.coverage[2] - - read := 0 - toRead := len(dst) - if uint64(toRead)+offset > size { - toRead = int(size - offset) - } - for read < toRead { - var err error - var curR int - - // Figure out which block to delegate the read to. - switch { - case offset < dirBlksEnd: - // Direct block. 
- curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:]) - case offset < indirBlkEnd: - // Indirect block. - curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:]) - case offset < doubIndirBlkEnd: - // Doubly indirect block. - curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:]) - default: - // Triply indirect block. - curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:]) - } - - read += curR - offset += uint64(curR) - if err != nil { - return read, err - } - } - - if read < len(dst) { - return read, io.EOF - } - return read, nil -} - -// read is the recursive step of the ReadAt function. It relies on knowing the -// current node's location on disk (curPhyBlk) and its height in the block map -// tree. A height of 0 shows that the current node is actually holding file -// data. relFileOff tells the offset from which we need to start to reading -// under the current node. It is completely relative to the current node. -func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, dst []byte) (int, error) { - curPhyBlkOff := int64(curPhyBlk) * int64(f.regFile.inode.blkSize) - if height == 0 { - toRead := int(f.regFile.inode.blkSize - relFileOff) - if len(dst) < toRead { - toRead = len(dst) - } - - n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) - if n < toRead { - return n, syserror.EIO - } - return n, nil - } - - childCov := f.coverage[height-1] - startIdx := relFileOff / childCov - endIdx := f.regFile.inode.blkSize / 4 // This is exclusive. - wantEndIdx := (relFileOff + uint64(len(dst))) / childCov - wantEndIdx++ // Make this exclusive. 
- if wantEndIdx < endIdx { - endIdx = wantEndIdx - } - - read := 0 - curChildOff := relFileOff % childCov - for i := startIdx; i < endIdx; i++ { - var childPhyBlk primitive.Uint32 - err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) - if err != nil { - return read, err - } - - n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:]) - read += n - if err != nil { - return read, err - } - - curChildOff = 0 - } - - return read, nil -} - -// getCoverage returns the number of bytes a node at the given height covers. -// Height 0 is the file data block itself. Height 1 is the indirect block. -// -// Formula: blkSize * ((blkSize / 4)^height) -func getCoverage(blkSize uint64, height uint) uint64 { - return blkSize * uint64(math.Pow(float64(blkSize/4), float64(height))) -} diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go deleted file mode 100644 index ed98b482e..000000000 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "bytes" - "math/rand" - "testing" - - "github.com/google/go-cmp/cmp" - "gvisor.dev/gvisor/pkg/marshal/primitive" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" -) - -// These consts are for mocking the block map tree. 
-const ( - mockBMBlkSize = uint32(16) - mockBMDiskSize = 2500 -) - -// TestBlockMapReader stress tests block map reader functionality. It performs -// random length reads from all possible positions in the block map structure. -func TestBlockMapReader(t *testing.T) { - mockBMFile, want := blockMapSetUp(t) - n := len(want) - - for from := 0; from < n; from++ { - got := make([]byte, n-from) - - if read, err := mockBMFile.ReadAt(got, int64(from)); err != nil { - t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err) - } - - if diff := cmp.Diff(got, want[from:]); diff != "" { - t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff) - } - } -} - -// blkNumGen is a number generator which gives block numbers for building the -// block map file on disk. It gives unique numbers in a random order which -// facilitates in creating an extremely fragmented filesystem. -type blkNumGen struct { - nums []uint32 -} - -// newBlkNumGen is the blkNumGen constructor. -func newBlkNumGen() *blkNumGen { - blkNums := &blkNumGen{} - lim := mockBMDiskSize / mockBMBlkSize - blkNums.nums = make([]uint32, lim) - for i := uint32(0); i < lim; i++ { - blkNums.nums[i] = i - } - - rand.Shuffle(int(lim), func(i, j int) { - blkNums.nums[i], blkNums.nums[j] = blkNums.nums[j], blkNums.nums[i] - }) - return blkNums -} - -// next returns the next random block number. -func (n *blkNumGen) next() uint32 { - ret := n.nums[0] - n.nums = n.nums[1:] - return ret -} - -// blockMapSetUp creates a mock disk and a block map file. It initializes the -// block map file with 12 direct block, 1 indirect block, 1 double indirect -// block and 1 triple indirect block (basically fill it till the rim). It -// initializes the disk to reflect the inode. Also returns the file data that -// the inode covers and that is written to disk. 
-func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { - mockDisk := make([]byte, mockBMDiskSize) - var fileData []byte - blkNums := newBlkNumGen() - off := 0 - data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes()) - - // Write the direct blocks. - for i := 0; i < numDirectBlks; i++ { - curBlkNum := primitive.Uint32(blkNums.next()) - curBlkNum.MarshalBytes(data[off:]) - off += curBlkNum.SizeBytes() - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...) - } - - // Write to indirect block. - indirectBlk := primitive.Uint32(blkNums.next()) - indirectBlk.MarshalBytes(data[off:]) - off += indirectBlk.SizeBytes() - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...) - - // Write to double indirect block. - doublyIndirectBlk := primitive.Uint32(blkNums.next()) - doublyIndirectBlk.MarshalBytes(data[off:]) - off += doublyIndirectBlk.SizeBytes() - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...) - - // Write to triple indirect block. - triplyIndirectBlk := primitive.Uint32(blkNums.next()) - triplyIndirectBlk.MarshalBytes(data[off:]) - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...) - - args := inodeArgs{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: getMockBMFileFize(), - }, - }, - blkSize: uint64(mockBMBlkSize), - } - copy(args.diskInode.Data(), data) - - mockFile, err := newBlockMapFile(args) - if err != nil { - t.Fatalf("newBlockMapFile failed: %v", err) - } - return mockFile, fileData -} - -// writeFileDataToBlock writes random bytes to the block on disk. 
-func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkNumGen) []byte { - if height == 0 { - start := blkNum * mockBMBlkSize - end := start + mockBMBlkSize - rand.Read(disk[start:end]) - return disk[start:end] - } - - var fileData []byte - for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { - curBlkNum := primitive.Uint32(blkNums.next()) - curBlkNum.MarshalBytes(disk[off : off+4]) - fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...) - } - return fileData -} - -// getMockBMFileFize gets the size of the mock block map file which is used for -// testing. -func getMockBMFileFize() uint32 { - return uint32(numDirectBlks*getCoverage(uint64(mockBMBlkSize), 0) + getCoverage(uint64(mockBMBlkSize), 1) + getCoverage(uint64(mockBMBlkSize), 2) + getCoverage(uint64(mockBMBlkSize), 3)) -} diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go deleted file mode 100644 index 9bfed883a..000000000 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// dentry implements vfs.DentryImpl. -// -// +stateify savable -type dentry struct { - vfsd vfs.Dentry - - // Protected by filesystem.mu. 
- parent *dentry - name string - - // inode is the inode represented by this dentry. Multiple Dentries may - // share a single non-directory Inode (with hard links). inode is - // immutable. - inode *inode -} - -// Compiles only if dentry implements vfs.DentryImpl. -var _ vfs.DentryImpl = (*dentry)(nil) - -// newDentry is the dentry constructor. -func newDentry(in *inode) *dentry { - d := &dentry{ - inode: in, - } - d.vfsd.Init(d) - return d -} - -// IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef() { - d.inode.incRef() -} - -// TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef() bool { - return d.inode.tryIncRef() -} - -// DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef(ctx context.Context) { - // FIXME(b/134676337): filesystem.mu may not be locked as required by - // inode.decRef(). - d.inode.decRef() -} - -// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -// -// TODO(b/134676337): Implement inotify. -func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} - -// Watches implements vfs.DentryImpl.Watches. -// -// TODO(b/134676337): Implement inotify. -func (d *dentry) Watches() *vfs.Watches { - return nil -} - -// OnZeroWatches implements vfs.Dentry.OnZeroWatches. -// -// TODO(b/134676337): Implement inotify. -func (d *dentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go deleted file mode 100644 index 512b70ede..000000000 --- a/pkg/sentry/fsimpl/ext/directory.go +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" -) - -// directory represents a directory inode. It holds the childList in memory. -// -// +stateify savable -type directory struct { - inode inode - - // childCache maps filenames to dentries for children for which dentries - // have been instantiated. childCache is protected by filesystem.mu. - childCache map[string]*dentry - - // mu serializes the changes to childList. - // Lock Order (outermost locks must be taken first): - // directory.mu - // filesystem.mu - mu sync.Mutex `state:"nosave"` - - // childList is a list containing (1) child dirents and (2) fake dirents - // (with diskDirent == nil) that represent the iteration position of - // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by mu. - childList direntList - - // childMap maps the child's filename to the dirent structure stored in - // childList. This adds some data replication but helps in faster path - // traversal. For consistency, key == childMap[key].diskDirent.FileName(). - // Immutable. - childMap map[string]*dirent -} - -// newDirectory is the directory constructor. 
-func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { - file := &directory{ - childCache: make(map[string]*dentry), - childMap: make(map[string]*dirent), - } - file.inode.init(args, file) - - // Initialize childList by reading dirents from the underlying file. - if args.diskInode.Flags().Index { - // TODO(b/134676337): Support hash tree directories. Currently only the '.' - // and '..' entries are read in. - - // Users cannot navigate this hash tree directory yet. - log.Warningf("hash tree directory being used which is unsupported") - return file, nil - } - - // The dirents are organized in a linear array in the file data. - // Extract the file data and decode the dirents. - regFile, err := newRegularFile(args) - if err != nil { - return nil, err - } - - // buf is used as scratch space for reading in dirents from disk and - // unmarshalling them into dirent structs. - buf := make([]byte, disklayout.DirentSize) - size := args.diskInode.Size() - for off, inc := uint64(0), uint64(0); off < size; off += inc { - toRead := size - off - if toRead > disklayout.DirentSize { - toRead = disklayout.DirentSize - } - if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead { - return nil, err - } - - var curDirent dirent - if newDirent { - curDirent.diskDirent = &disklayout.DirentNew{} - } else { - curDirent.diskDirent = &disklayout.DirentOld{} - } - curDirent.diskDirent.UnmarshalBytes(buf) - - if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { - // Inode number and name length fields being set to 0 is used to indicate - // an unused dirent. - file.childList.PushBack(&curDirent) - file.childMap[curDirent.diskDirent.FileName()] = &curDirent - } - - // The next dirent is placed exactly after this dirent record on disk. 
- inc = uint64(curDirent.diskDirent.RecordSize()) - } - - return file, nil -} - -func (i *inode) isDir() bool { - _, ok := i.impl.(*directory) - return ok -} - -// dirent is the directory.childList node. -// -// +stateify savable -type dirent struct { - diskDirent disklayout.Dirent - - // direntEntry links dirents into their parent directory.childList. - direntEntry -} - -// directoryFD represents a directory file description. It implements -// vfs.FileDescriptionImpl. -// -// +stateify savable -type directoryFD struct { - fileDescription - vfs.DirectoryFileDescriptionDefaultImpl - - // Protected by directory.mu. - iter *dirent - off int64 -} - -// Compiles only if directoryFD implements vfs.FileDescriptionImpl. -var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release(ctx context.Context) { - if fd.iter == nil { - return - } - - dir := fd.inode().impl.(*directory) - dir.mu.Lock() - dir.childList.Remove(fd.iter) - dir.mu.Unlock() - fd.iter = nil -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - extfs := fd.filesystem() - dir := fd.inode().impl.(*directory) - - dir.mu.Lock() - defer dir.mu.Unlock() - - // Ensure that fd.iter exists and is not linked into dir.childList. - var child *dirent - if fd.iter == nil { - // Start iteration at the beginning of dir. - child = dir.childList.Front() - fd.iter = &dirent{} - } else { - // Continue iteration from where we left off. - child = fd.iter.Next() - dir.childList.Remove(fd.iter) - } - for ; child != nil; child = child.Next() { - // Skip other directoryFD iterators. - if child.diskDirent != nil { - childType, ok := child.diskDirent.FileType() - if !ok { - // We will need to read the inode off disk. Do not increment - // ref count here because this inode is not being added to the - // dentry tree. 
- extfs.mu.Lock() - childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode()) - extfs.mu.Unlock() - if err != nil { - // Usage of the file description after the error is - // undefined. This implementation would continue reading - // from the next dirent. - fd.off++ - dir.childList.InsertAfter(child, fd.iter) - return err - } - childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) - } - - if err := cb.Handle(vfs.Dirent{ - Name: child.diskDirent.FileName(), - Type: fs.ToDirentType(childType), - Ino: uint64(child.diskDirent.Inode()), - NextOff: fd.off + 1, - }); err != nil { - dir.childList.InsertBefore(child, fd.iter) - return err - } - fd.off++ - } - } - dir.childList.PushBack(fd.iter) - return nil -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - if whence != linux.SEEK_SET && whence != linux.SEEK_CUR { - return 0, syserror.EINVAL - } - - dir := fd.inode().impl.(*directory) - - dir.mu.Lock() - defer dir.mu.Unlock() - - // Find resulting offset. - if whence == linux.SEEK_CUR { - offset += fd.off - } - - if offset < 0 { - // lseek(2) specifies that EINVAL should be returned if the resulting offset - // is negative. - return 0, syserror.EINVAL - } - - n := int64(len(dir.childMap)) - realWantOff := offset - if realWantOff > n { - realWantOff = n - } - realCurOff := fd.off - if realCurOff > n { - realCurOff = n - } - - // Ensure that fd.iter exists and is linked into dir.childList so we can - // intelligently seek from the optimal position. - if fd.iter == nil { - fd.iter = &dirent{} - dir.childList.PushFront(fd.iter) - } - - // Guess that iterating from the current position is optimal. - child := fd.iter - diff := realWantOff - realCurOff // Shows direction and magnitude of travel. - - // See if starting from the beginning or end is better. 
- abDiff := diff - if diff < 0 { - abDiff = -diff - } - if abDiff > realWantOff { - // Starting from the beginning is best. - child = dir.childList.Front() - diff = realWantOff - } else if abDiff > (n - realWantOff) { - // Starting from the end is best. - child = dir.childList.Back() - // (n - 1) because the last non-nil dirent represents the (n-1)th offset. - diff = realWantOff - (n - 1) - } - - for child != nil { - // Skip other directoryFD iterators. - if child.diskDirent != nil { - if diff == 0 { - if child != fd.iter { - dir.childList.Remove(fd.iter) - dir.childList.InsertBefore(child, fd.iter) - } - - fd.off = offset - return offset, nil - } - - if diff < 0 { - diff++ - child = child.Prev() - } else { - diff-- - child = child.Next() - } - continue - } - - if diff < 0 { - child = child.Prev() - } else { - child = child.Next() - } - } - - // Reaching here indicates that the offset is beyond the end of the childList. - dir.childList.Remove(fd.iter) - dir.childList.PushBack(fd.iter) - fd.off = offset - return offset, nil -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD deleted file mode 100644 index d98a05dd8..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ /dev/null @@ -1,48 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "disklayout", - srcs = [ - "block_group.go", - "block_group_32.go", - "block_group_64.go", - "dirent.go", - "dirent_new.go", - "dirent_old.go", - "disklayout.go", - "extent.go", - "inode.go", - "inode_new.go", - "inode_old.go", - "superblock.go", - "superblock_32.go", - "superblock_64.go", - "superblock_old.go", - "test_utils.go", - ], - marshal = True, - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/marshal", - "//pkg/sentry/fs", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - ], -) - -go_test( - name = "disklayout_test", - size = "small", - srcs = [ - 
"block_group_test.go", - "dirent_test.go", - "extent_test.go", - "inode_test.go", - "superblock_test.go", - ], - library = ":disklayout", - deps = ["//pkg/sentry/kernel/time"], -) diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go deleted file mode 100644 index 0d56ae9da..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" -) - -// BlockGroup represents a Linux ext block group descriptor. An ext file system -// is split into a series of block groups. This provides an access layer to -// information needed to access and use a block group. -// -// Location: -// - The block group descriptor table is always placed in the blocks -// immediately after the block containing the superblock. -// - The 1st block group descriptor in the original table is in the -// (sb.FirstDataBlock() + 1)th block. -// - See SuperBlock docs to see where the block group descriptor table is -// replicated. -// - sb.BgDescSize() must be used as the block group descriptor entry size -// while reading the table from disk. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. 
-type BlockGroup interface { - marshal.Marshallable - - // InodeTable returns the absolute block number of the block containing the - // inode table. This points to an array of Inode structs. Inode tables are - // statically allocated at mkfs time. The superblock records the number of - // inodes per group (length of this table) and the size of each inode struct. - InodeTable() uint64 - - // BlockBitmap returns the absolute block number of the block containing the - // block bitmap. This bitmap tracks the usage of data blocks within this block - // group and has its own checksum. - BlockBitmap() uint64 - - // InodeBitmap returns the absolute block number of the block containing the - // inode bitmap. This bitmap tracks the usage of this group's inode table - // entries and has its own checksum. - InodeBitmap() uint64 - - // ExclusionBitmap returns the absolute block number of the snapshot exclusion - // bitmap. - ExclusionBitmap() uint64 - - // FreeBlocksCount returns the number of free blocks in the group. - FreeBlocksCount() uint32 - - // FreeInodesCount returns the number of free inodes in the group. - FreeInodesCount() uint32 - - // DirectoryCount returns the number of inodes that represent directories - // under this block group. - DirectoryCount() uint32 - - // UnusedInodeCount returns the number of unused inodes beyond the last used - // inode in this group's inode table. As a result, we needn’t scan past the - // (InodesPerGroup - UnusedInodeCount())th entry in the inode table. - UnusedInodeCount() uint32 - - // BlockBitmapChecksum returns the block bitmap checksum. This is calculated - // using crc32c(FS UUID + group number + entire bitmap). - BlockBitmapChecksum() uint32 - - // InodeBitmapChecksum returns the inode bitmap checksum. This is calculated - // using crc32c(FS UUID + group number + entire bitmap). - InodeBitmapChecksum() uint32 - - // Checksum returns this block group's checksum. 
- // - // If SbMetadataCsum feature is set: - // - checksum is crc32c(FS UUID + group number + group descriptor - // structure) & 0xFFFF. - // - // If SbGdtCsum feature is set: - // - checksum is crc16(FS UUID + group number + group descriptor - // structure). - // - // SbMetadataCsum and SbGdtCsum should not be both set. - // If they are, Linux warns and asks to run fsck. - Checksum() uint16 - - // Flags returns BGFlags which represents the block group flags. - Flags() BGFlags -} - -// These are the different block group flags. -const ( - // BgInodeUninit indicates that inode table and bitmap are not initialized. - BgInodeUninit uint16 = 0x1 - - // BgBlockUninit indicates that block bitmap is not initialized. - BgBlockUninit uint16 = 0x2 - - // BgInodeZeroed indicates that inode table is zeroed. - BgInodeZeroed uint16 = 0x4 -) - -// BGFlags represents all the different combinations of block group flags. -type BGFlags struct { - InodeUninit bool - BlockUninit bool - InodeZeroed bool -} - -// ToInt converts a BGFlags struct back to its 16-bit representation. -func (f BGFlags) ToInt() uint16 { - var res uint16 - - if f.InodeUninit { - res |= BgInodeUninit - } - if f.BlockUninit { - res |= BgBlockUninit - } - if f.InodeZeroed { - res |= BgInodeZeroed - } - - return res -} - -// BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. -func BGFlagsFromInt(flags uint16) BGFlags { - return BGFlags{ - InodeUninit: flags&BgInodeUninit > 0, - BlockUninit: flags&BgBlockUninit > 0, - InodeZeroed: flags&BgInodeZeroed > 0, - } -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go deleted file mode 100644 index a35fa22a0..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup32Bit emulates the first half of struct ext4_group_desc in -// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and -// 32-bit ext4 filesystems. It implements BlockGroup interface. -// -// +marshal -type BlockGroup32Bit struct { - BlockBitmapLo uint32 - InodeBitmapLo uint32 - InodeTableLo uint32 - FreeBlocksCountLo uint16 - FreeInodesCountLo uint16 - UsedDirsCountLo uint16 - FlagsRaw uint16 - ExcludeBitmapLo uint32 - BlockBitmapChecksumLo uint16 - InodeBitmapChecksumLo uint16 - ItableUnusedLo uint16 - ChecksumRaw uint16 -} - -// Compiles only if BlockGroup32Bit implements BlockGroup. -var _ BlockGroup = (*BlockGroup32Bit)(nil) - -// InodeTable implements BlockGroup.InodeTable. -func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } - -// BlockBitmap implements BlockGroup.BlockBitmap. -func (bg *BlockGroup32Bit) BlockBitmap() uint64 { return uint64(bg.BlockBitmapLo) } - -// InodeBitmap implements BlockGroup.InodeBitmap. -func (bg *BlockGroup32Bit) InodeBitmap() uint64 { return uint64(bg.InodeBitmapLo) } - -// ExclusionBitmap implements BlockGroup.ExclusionBitmap. -func (bg *BlockGroup32Bit) ExclusionBitmap() uint64 { return uint64(bg.ExcludeBitmapLo) } - -// FreeBlocksCount implements BlockGroup.FreeBlocksCount. -func (bg *BlockGroup32Bit) FreeBlocksCount() uint32 { return uint32(bg.FreeBlocksCountLo) } - -// FreeInodesCount implements BlockGroup.FreeInodesCount. 
-func (bg *BlockGroup32Bit) FreeInodesCount() uint32 { return uint32(bg.FreeInodesCountLo) } - -// DirectoryCount implements BlockGroup.DirectoryCount. -func (bg *BlockGroup32Bit) DirectoryCount() uint32 { return uint32(bg.UsedDirsCountLo) } - -// UnusedInodeCount implements BlockGroup.UnusedInodeCount. -func (bg *BlockGroup32Bit) UnusedInodeCount() uint32 { return uint32(bg.ItableUnusedLo) } - -// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. -func (bg *BlockGroup32Bit) BlockBitmapChecksum() uint32 { return uint32(bg.BlockBitmapChecksumLo) } - -// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. -func (bg *BlockGroup32Bit) InodeBitmapChecksum() uint32 { return uint32(bg.InodeBitmapChecksumLo) } - -// Checksum implements BlockGroup.Checksum. -func (bg *BlockGroup32Bit) Checksum() uint16 { return bg.ChecksumRaw } - -// Flags implements BlockGroup.Flags. -func (bg *BlockGroup32Bit) Flags() BGFlags { return BGFlagsFromInt(bg.FlagsRaw) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go deleted file mode 100644 index d54d1d345..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup64Bit emulates struct ext4_group_desc in fs/ext4/ext4.h. 
-// It is the block group descriptor struct for 64-bit ext4 filesystems. -// It implements BlockGroup interface. It is an extension of the 32-bit -// version of BlockGroup. -// -// +marshal -type BlockGroup64Bit struct { - // We embed the 32-bit struct here because 64-bit version is just an extension - // of the 32-bit version. - BlockGroup32Bit - - // 64-bit specific fields. - BlockBitmapHi uint32 - InodeBitmapHi uint32 - InodeTableHi uint32 - FreeBlocksCountHi uint16 - FreeInodesCountHi uint16 - UsedDirsCountHi uint16 - ItableUnusedHi uint16 - ExcludeBitmapHi uint32 - BlockBitmapChecksumHi uint16 - InodeBitmapChecksumHi uint16 - _ uint32 // Padding to 64 bytes. -} - -// Compiles only if BlockGroup64Bit implements BlockGroup. -var _ BlockGroup = (*BlockGroup64Bit)(nil) - -// Methods to override. Checksum() and Flags() are not overridden. - -// InodeTable implements BlockGroup.InodeTable. -func (bg *BlockGroup64Bit) InodeTable() uint64 { - return (uint64(bg.InodeTableHi) << 32) | uint64(bg.InodeTableLo) -} - -// BlockBitmap implements BlockGroup.BlockBitmap. -func (bg *BlockGroup64Bit) BlockBitmap() uint64 { - return (uint64(bg.BlockBitmapHi) << 32) | uint64(bg.BlockBitmapLo) -} - -// InodeBitmap implements BlockGroup.InodeBitmap. -func (bg *BlockGroup64Bit) InodeBitmap() uint64 { - return (uint64(bg.InodeBitmapHi) << 32) | uint64(bg.InodeBitmapLo) -} - -// ExclusionBitmap implements BlockGroup.ExclusionBitmap. -func (bg *BlockGroup64Bit) ExclusionBitmap() uint64 { - return (uint64(bg.ExcludeBitmapHi) << 32) | uint64(bg.ExcludeBitmapLo) -} - -// FreeBlocksCount implements BlockGroup.FreeBlocksCount. -func (bg *BlockGroup64Bit) FreeBlocksCount() uint32 { - return (uint32(bg.FreeBlocksCountHi) << 16) | uint32(bg.FreeBlocksCountLo) -} - -// FreeInodesCount implements BlockGroup.FreeInodesCount. 
-func (bg *BlockGroup64Bit) FreeInodesCount() uint32 { - return (uint32(bg.FreeInodesCountHi) << 16) | uint32(bg.FreeInodesCountLo) -} - -// DirectoryCount implements BlockGroup.DirectoryCount. -func (bg *BlockGroup64Bit) DirectoryCount() uint32 { - return (uint32(bg.UsedDirsCountHi) << 16) | uint32(bg.UsedDirsCountLo) -} - -// UnusedInodeCount implements BlockGroup.UnusedInodeCount. -func (bg *BlockGroup64Bit) UnusedInodeCount() uint32 { - return (uint32(bg.ItableUnusedHi) << 16) | uint32(bg.ItableUnusedLo) -} - -// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. -func (bg *BlockGroup64Bit) BlockBitmapChecksum() uint32 { - return (uint32(bg.BlockBitmapChecksumHi) << 16) | uint32(bg.BlockBitmapChecksumLo) -} - -// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. -func (bg *BlockGroup64Bit) InodeBitmapChecksum() uint32 { - return (uint32(bg.InodeBitmapChecksumHi) << 16) | uint32(bg.InodeBitmapChecksumLo) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go deleted file mode 100644 index e4ce484e4..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestBlockGroupSize tests that the block group descriptor structs are of the -// correct size. 
-func TestBlockGroupSize(t *testing.T) { - var bgSmall BlockGroup32Bit - assertSize(t, &bgSmall, 32) - var bgBig BlockGroup64Bit - assertSize(t, &bgBig, 64) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go deleted file mode 100644 index 568c8cb4c..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -const ( - // MaxFileName is the maximum length of an ext fs file's name. - MaxFileName = 255 - - // DirentSize is the size of ext dirent structures. - DirentSize = 263 -) - -var ( - // inodeTypeByFileType maps ext4 file types to vfs inode types. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. - inodeTypeByFileType = map[uint8]fs.InodeType{ - 0: fs.Anonymous, - 1: fs.RegularFile, - 2: fs.Directory, - 3: fs.CharacterDevice, - 4: fs.BlockDevice, - 5: fs.Pipe, - 6: fs.Socket, - 7: fs.Symlink, - } -) - -// The Dirent interface should be implemented by structs representing ext -// directory entries. These are for the linear classical directories which -// just store a list of dirent structs. A directory is a series of data blocks -// where is each data block contains a linear array of dirents. 
The last entry -// of the block has a record size that takes it to the end of the block. The -// end of the directory is when you read dirInode.Size() bytes from the blocks. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. -type Dirent interface { - marshal.Marshallable - - // Inode returns the absolute inode number of the underlying inode. - // Inode number 0 signifies an unused dirent. - Inode() uint32 - - // RecordSize returns the record length of this dirent on disk. The next - // dirent in the dirent list should be read after these many bytes from - // the current dirent. Must be a multiple of 4. - RecordSize() uint16 - - // FileName returns the name of the file. Can be at most 255 is length. - FileName() string - - // FileType returns the inode type of the underlying inode. This is a - // performance hack so that we do not have to read the underlying inode struct - // to know the type of inode. This will only work when the SbDirentFileType - // feature is set. If not, the second returned value will be false indicating - // that user code has to use the inode mode to extract the file type. - FileType() (fs.InodeType, bool) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go deleted file mode 100644 index 51f9c2946..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// DirentNew represents the ext4 directory entry struct. This emulates Linux's -// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we -// only need 8 bits to store the NameLength. As a result, NameLength has been -// shortened and the other 8 bits are used to encode the file type. Use the -// FileTypeRaw field only if the SbDirentFileType feature is set. -// -// Note: This struct can be of variable size on disk. The one described below -// is of maximum size and the FileName beyond NameLength bytes might contain -// garbage. -// -// +marshal -type DirentNew struct { - InodeNumber uint32 - RecordLength uint16 - NameLength uint8 - FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte `marshal:"unaligned"` -} - -// Compiles only if DirentNew implements Dirent. -var _ Dirent = (*DirentNew)(nil) - -// Inode implements Dirent.Inode. -func (d *DirentNew) Inode() uint32 { return d.InodeNumber } - -// RecordSize implements Dirent.RecordSize. -func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } - -// FileName implements Dirent.FileName. -func (d *DirentNew) FileName() string { - return string(d.FileNameRaw[:d.NameLength]) -} - -// FileType implements Dirent.FileType. -func (d *DirentNew) FileType() (fs.InodeType, bool) { - if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { - return inodeType, true - } - - panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go deleted file mode 100644 index d4b19e086..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import "gvisor.dev/gvisor/pkg/sentry/fs" - -// DirentOld represents the old directory entry struct which does not contain -// the file type. This emulates Linux's ext4_dir_entry struct. -// -// Note: This struct can be of variable size on disk. The one described below -// is of maximum size and the FileName beyond NameLength bytes might contain -// garbage. -// -// +marshal -type DirentOld struct { - InodeNumber uint32 - RecordLength uint16 - NameLength uint16 - FileNameRaw [MaxFileName]byte `marshal:"unaligned"` -} - -// Compiles only if DirentOld implements Dirent. -var _ Dirent = (*DirentOld)(nil) - -// Inode implements Dirent.Inode. -func (d *DirentOld) Inode() uint32 { return d.InodeNumber } - -// RecordSize implements Dirent.RecordSize. -func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } - -// FileName implements Dirent.FileName. -func (d *DirentOld) FileName() string { - return string(d.FileNameRaw[:d.NameLength]) -} - -// FileType implements Dirent.FileType. -func (d *DirentOld) FileType() (fs.InodeType, bool) { - return fs.Anonymous, false -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go deleted file mode 100644 index 3486864dc..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestDirentSize tests that the dirent structs are of the correct -// size. -func TestDirentSize(t *testing.T) { - var dOld DirentOld - assertSize(t, &dOld, DirentSize) - var dNew DirentNew - assertSize(t, &dNew, DirentSize) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go deleted file mode 100644 index 0834e9ba8..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package disklayout provides Linux ext file system's disk level structures -// which can be directly read into from the underlying device. Structs aim to -// emulate structures `exactly` how they are layed out on disk. 
-// -// This library aims to be compatible with all ext(2/3/4) systems so it -// provides a generic interface for all major structures and various -// implementations (for different versions). The user code is responsible for -// using appropriate implementations based on the underlying device. -// -// Interfacing all major structures here serves a few purposes: -// - Abstracts away the complexity of the underlying structure from client -// code. The client only has to figure out versioning on set up and then -// can use these as black boxes and pass it higher up the stack. -// - Having pointer receivers forces the user to use pointers to these -// heavy structs. Hence, prevents the client code from unintentionally -// copying these by value while passing the interface around. -// - Version-based implementation selection is resolved on set up hence -// avoiding per call overhead of choosing implementation. -// - All interface methods are pretty light weight (do not take in any -// parameters by design). Passing pointer arguments to interface methods -// can lead to heap allocation as the compiler won't be able to perform -// escape analysis on an unknown implementation at compile time. -// -// Notes: -// - All structures on disk are in little-endian order. Only jbd2 (journal) -// structures are in big-endian order. -// - All OS dependent fields in these structures will be interpretted using -// the Linux version of that field. -// - The suffix `Lo` in field names stands for lower bits of that field. -// - The suffix `Hi` in field names stands for upper bits of that field. -// - The suffix `Raw` has been added to indicate that the field is not split -// into Lo and Hi fields and also to resolve name collision with the -// respective interface. 
-package disklayout diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go deleted file mode 100644 index b13999bfc..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" -) - -// Extents were introduced in ext4 and provide huge performance gains in terms -// data locality and reduced metadata block usage. Extents are organized in -// extent trees. The root node is contained in inode.BlocksRaw. -// -// Terminology: -// - Physical Block: -// Filesystem data block which is addressed normally wrt the entire -// filesystem (addressed with 48 bits). -// -// - File Block: -// Data block containing *only* file data and addressed wrt to the file -// with only 32 bits. The (i)th file block contains file data from -// byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()). - -const ( - // ExtentHeaderSize is the size of the header of an extent tree node. - ExtentHeaderSize = 12 - - // ExtentEntrySize is the size of an entry in an extent tree node. - // This size is the same for both leaf and internal nodes. - ExtentEntrySize = 12 - - // ExtentMagic is the magic number which must be present in the header. - ExtentMagic = 0xf30a -) - -// ExtentEntryPair couples an in-memory ExtendNode with the ExtentEntry that -// points to it. 
We want to cache these structs in memory to avoid repeated -// disk reads. -// -// Note: This struct itself does not represent an on-disk struct. -type ExtentEntryPair struct { - // Entry points to the child node on disk. - Entry ExtentEntry - // Node points to child node in memory. Is nil if the current node is a leaf. - Node *ExtentNode -} - -// ExtentNode represents an extent tree node. For internal nodes, all Entries -// will be ExtendIdxs. For leaf nodes, they will all be Extents. -// -// Note: This struct itself does not represent an on-disk struct. -type ExtentNode struct { - Header ExtentHeader - Entries []ExtentEntryPair -} - -// ExtentEntry represents an extent tree node entry. The entry can either be -// an ExtentIdx or Extent itself. This exists to simplify navigation logic. -type ExtentEntry interface { - marshal.Marshallable - - // FileBlock returns the first file block number covered by this entry. - FileBlock() uint32 - - // PhysicalBlock returns the child physical block that this entry points to. - PhysicalBlock() uint64 -} - -// ExtentHeader emulates the ext4_extent_header struct in ext4. Each extent -// tree node begins with this and is followed by `NumEntries` number of: -// - Extent if `Depth` == 0 -// - ExtentIdx otherwise -// -// +marshal -type ExtentHeader struct { - // Magic in the extent magic number, must be 0xf30a. - Magic uint16 - - // NumEntries indicates the number of valid entries following the header. - NumEntries uint16 - - // MaxEntries that could follow the header. Used while adding entries. - MaxEntries uint16 - - // Height represents the distance of this node from the farthest leaf. Please - // note that Linux incorrectly calls this `Depth` (which means the distance - // of the node from the root). - Height uint16 - _ uint32 -} - -// ExtentIdx emulates the ext4_extent_idx struct in ext4. Only present in -// internal nodes. Sorted in ascending order based on FirstFileBlock since -// Linux does a binary search on this. 
This points to a block containing the -// child node. -// -// +marshal -type ExtentIdx struct { - FirstFileBlock uint32 - ChildBlockLo uint32 - ChildBlockHi uint16 - _ uint16 -} - -// Compiles only if ExtentIdx implements ExtentEntry. -var _ ExtentEntry = (*ExtentIdx)(nil) - -// FileBlock implements ExtentEntry.FileBlock. -func (ei *ExtentIdx) FileBlock() uint32 { - return ei.FirstFileBlock -} - -// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the -// physical block number of the child block. -func (ei *ExtentIdx) PhysicalBlock() uint64 { - return (uint64(ei.ChildBlockHi) << 32) | uint64(ei.ChildBlockLo) -} - -// Extent represents the ext4_extent struct in ext4. Only present in leaf -// nodes. Sorted in ascending order based on FirstFileBlock since Linux does a -// binary search on this. This points to an array of data blocks containing the -// file data. It covers `Length` data blocks starting from `StartBlock`. -// -// +marshal -type Extent struct { - FirstFileBlock uint32 - Length uint16 - StartBlockHi uint16 - StartBlockLo uint32 -} - -// Compiles only if Extent implements ExtentEntry. -var _ ExtentEntry = (*Extent)(nil) - -// FileBlock implements ExtentEntry.FileBlock. -func (e *Extent) FileBlock() uint32 { - return e.FirstFileBlock -} - -// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the -// physical block number of the first data block this extent covers. -func (e *Extent) PhysicalBlock() uint64 { - return (uint64(e.StartBlockHi) << 32) | uint64(e.StartBlockLo) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go deleted file mode 100644 index c96002e19..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestExtentSize tests that the extent structs are of the correct -// size. -func TestExtentSize(t *testing.T) { - var h ExtentHeader - assertSize(t, &h, ExtentHeaderSize) - var i ExtentIdx - assertSize(t, &i, ExtentEntrySize) - var e Extent - assertSize(t, &e, ExtentEntrySize) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go deleted file mode 100644 index ef25040a9..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode.go +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -// Special inodes. See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#special-inodes. -const ( - // RootDirInode is the inode number of the root directory inode. 
- RootDirInode = 2 -) - -// The Inode interface must be implemented by structs representing ext inodes. -// The inode stores all the metadata pertaining to the file (except for the -// file name which is held by the directory entry). It does NOT expose all -// fields and should be extended if need be. -// -// Some file systems (e.g. FAT) use the directory entry to store all this -// information. Ext file systems do not so that they can support hard links. -// However, ext4 cheats a little bit and duplicates the file type in the -// directory entry for performance gains. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. -type Inode interface { - marshal.Marshallable - - // Mode returns the linux file mode which is majorly used to extract - // information like: - // - File permissions (read/write/execute by user/group/others). - // - Sticky, set UID and GID bits. - // - File type. - // - // Masks to extract this information are provided in pkg/abi/linux/file.go. - Mode() linux.FileMode - - // UID returns the owner UID. - UID() auth.KUID - - // GID returns the owner GID. - GID() auth.KGID - - // Size returns the size of the file in bytes. - Size() uint64 - - // InodeSize returns the size of this inode struct in bytes. - // In ext2 and ext3, the inode struct and inode disk record size was fixed at - // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. - // However, accessing any field beyond the 128 bytes marker must be verified - // using this method. - InodeSize() uint16 - - // AccessTime returns the last access time. Shows when the file was last read. - // - // If InExtendedAttr is set, then this should NOT be used because the - // underlying field is used to store the extended attribute value checksum. - AccessTime() time.Time - - // ChangeTime returns the last change time. Shows when the file meta data - // (like permissions) was last changed. 
- // - // If InExtendedAttr is set, then this should NOT be used because the - // underlying field is used to store the lower 32 bits of the attribute - // value’s reference count. - ChangeTime() time.Time - - // ModificationTime returns the last modification time. Shows when the file - // content was last modified. - // - // If InExtendedAttr is set, then this should NOT be used because - // the underlying field contains the number of the inode that owns the - // extended attribute. - ModificationTime() time.Time - - // DeletionTime returns the deletion time. Inodes are marked as deleted by - // writing to the underlying field. FS tools can restore files until they are - // actually overwritten. - DeletionTime() time.Time - - // LinksCount returns the number of hard links to this inode. - // - // Normally there is an upper limit on the number of hard links: - // - ext2/ext3 = 32,000 - // - ext4 = 65,000 - // - // This implies that an ext4 directory cannot have more than 64,998 - // subdirectories because each subdirectory will have a hard link to the - // directory via the `..` entry. The directory has hard link via the `.` entry - // of its own. And finally the inode is initiated with 1 hard link (itself). - // - // The underlying value is reset to 1 if all the following hold: - // - Inode is a directory. - // - SbDirNlink is enabled. - // - Number of hard links is incremented past 64,999. - // Hard link value of 1 for a directory would indicate that the number of hard - // links is unknown because a directory can have minimum 2 hard links (itself - // and `.` entry). - LinksCount() uint16 - - // Flags returns InodeFlags which represents the inode flags. - Flags() InodeFlags - - // Data returns the underlying inode.i_block array as a slice so it's - // modifiable. This field is special and is used to store various kinds of - // things depending on the filesystem version and inode type. The underlying - // field name in Linux is a little misleading. 
- // - In ext2/ext3, it contains the block map. - // - In ext4, it contains the extent tree root node. - // - For inline files, it contains the file contents. - // - For symlinks, it contains the link path (if it fits here). - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. - Data() []byte -} - -// Inode flags. This is not comprehensive and flags which were not used in -// the Linux kernel have been excluded. -const ( - // InSync indicates that all writes to the file must be synchronous. - InSync = 0x8 - - // InImmutable indicates that this file is immutable. - InImmutable = 0x10 - - // InAppend indicates that this file can only be appended to. - InAppend = 0x20 - - // InNoDump indicates that teh dump(1) utility should not dump this file. - InNoDump = 0x40 - - // InNoAccessTime indicates that the access time of this inode must not be - // updated. - InNoAccessTime = 0x80 - - // InIndex indicates that this directory has hashed indexes. - InIndex = 0x1000 - - // InJournalData indicates that file data must always be written through a - // journal device. - InJournalData = 0x4000 - - // InDirSync indicates that all the directory entiry data must be written - // synchronously. - InDirSync = 0x10000 - - // InTopDir indicates that this inode is at the top of the directory hierarchy. - InTopDir = 0x20000 - - // InHugeFile indicates that this is a huge file. - InHugeFile = 0x40000 - - // InExtents indicates that this inode uses extents. - InExtents = 0x80000 - - // InExtendedAttr indicates that this inode stores a large extended attribute - // value in its data blocks. - InExtendedAttr = 0x200000 - - // InInline indicates that this inode has inline data. - InInline = 0x10000000 - - // InReserved indicates that this inode is reserved for the ext4 library. - InReserved = 0x80000000 -) - -// InodeFlags represents all possible combinations of inode flags. 
It aims to -// cover the bit masks and provide a more user-friendly interface. -type InodeFlags struct { - Sync bool - Immutable bool - Append bool - NoDump bool - NoAccessTime bool - Index bool - JournalData bool - DirSync bool - TopDir bool - HugeFile bool - Extents bool - ExtendedAttr bool - Inline bool - Reserved bool -} - -// ToInt converts inode flags back to its 32-bit rep. -func (f InodeFlags) ToInt() uint32 { - var res uint32 - - if f.Sync { - res |= InSync - } - if f.Immutable { - res |= InImmutable - } - if f.Append { - res |= InAppend - } - if f.NoDump { - res |= InNoDump - } - if f.NoAccessTime { - res |= InNoAccessTime - } - if f.Index { - res |= InIndex - } - if f.JournalData { - res |= InJournalData - } - if f.DirSync { - res |= InDirSync - } - if f.TopDir { - res |= InTopDir - } - if f.HugeFile { - res |= InHugeFile - } - if f.Extents { - res |= InExtents - } - if f.ExtendedAttr { - res |= InExtendedAttr - } - if f.Inline { - res |= InInline - } - if f.Reserved { - res |= InReserved - } - - return res -} - -// InodeFlagsFromInt converts the integer representation of inode flags to -// a InodeFlags struct. -func InodeFlagsFromInt(f uint32) InodeFlags { - return InodeFlags{ - Sync: f&InSync > 0, - Immutable: f&InImmutable > 0, - Append: f&InAppend > 0, - NoDump: f&InNoDump > 0, - NoAccessTime: f&InNoAccessTime > 0, - Index: f&InIndex > 0, - JournalData: f&InJournalData > 0, - DirSync: f&InDirSync > 0, - TopDir: f&InTopDir > 0, - HugeFile: f&InHugeFile > 0, - Extents: f&InExtents > 0, - ExtendedAttr: f&InExtendedAttr > 0, - Inline: f&InInline > 0, - Reserved: f&InReserved > 0, - } -} - -// These masks define how users can view/modify inode flags. The rest of the -// flags are for internal kernel usage only. 
-const ( - InUserReadFlagMask = 0x4BDFFF - InUserWriteFlagMask = 0x4B80FF -) diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go deleted file mode 100644 index a4503f5cf..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import "gvisor.dev/gvisor/pkg/sentry/kernel/time" - -// InodeNew represents ext4 inode structure which can be bigger than -// OldInodeSize. The actual size of this struct should be determined using -// inode.ExtraInodeSize. Accessing any field here should be verified with the -// actual size. The extra space between the end of the inode struct and end of -// the inode record can be used to store extended attr. -// -// If the TimeExtra fields are in scope, the lower 2 bits of those are used -// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits -// are used to provide nanoscond precision. Hence, these timestamps will now -// overflow in May 2446. -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. 
-// -// +marshal -type InodeNew struct { - InodeOld - - ExtraInodeSize uint16 - ChecksumHi uint16 - ChangeTimeExtra uint32 - ModificationTimeExtra uint32 - AccessTimeExtra uint32 - CreationTime uint32 - CreationTimeExtra uint32 - VersionHi uint32 - ProjectID uint32 -} - -// Compiles only if InodeNew implements Inode. -var _ Inode = (*InodeNew)(nil) - -// fromExtraTime decodes the extra time and constructs the kernel time struct -// with nanosecond precision. -func fromExtraTime(lo int32, extra uint32) time.Time { - // See description above InodeNew for format. - seconds := (int64(extra&0x3) << 32) + int64(lo) - nanoseconds := int64(extra >> 2) - return time.FromUnix(seconds, nanoseconds) -} - -// Only override methods which change due to ext4 specific fields. - -// Size implements Inode.Size. -func (in *InodeNew) Size() uint64 { - return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) -} - -// InodeSize implements Inode.InodeSize. -func (in *InodeNew) InodeSize() uint16 { - return OldInodeSize + in.ExtraInodeSize -} - -// ChangeTime implements Inode.ChangeTime. -func (in *InodeNew) ChangeTime() time.Time { - // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. - if in.ExtraInodeSize >= 8 { - return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) - } - - return in.InodeOld.ChangeTime() -} - -// ModificationTime implements Inode.ModificationTime. -func (in *InodeNew) ModificationTime() time.Time { - // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. - if in.ExtraInodeSize >= 12 { - return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) - } - - return in.InodeOld.ModificationTime() -} - -// AccessTime implements Inode.AccessTime. -func (in *InodeNew) AccessTime() time.Time { - // Apply new timestamp logic if inode.AccessTimeExtra is in scope. 
- if in.ExtraInodeSize >= 16 { - return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) - } - - return in.InodeOld.AccessTime() -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go deleted file mode 100644 index e6b28babf..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -const ( - // OldInodeSize is the inode size in ext2/ext3. - OldInodeSize = 128 -) - -// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. -// Inode struct size and record size are both 128 bytes for this. -// -// All fields representing time are in seconds since the epoch. Which means that -// they will overflow in January 2038. -// -// +marshal -type InodeOld struct { - ModeRaw uint16 - UIDLo uint16 - SizeLo uint32 - - // The time fields are signed integers because they could be negative to - // represent time before the epoch. - AccessTimeRaw int32 - ChangeTimeRaw int32 - ModificationTimeRaw int32 - DeletionTimeRaw int32 - - GIDLo uint16 - LinksCountRaw uint16 - BlocksCountLo uint32 - FlagsRaw uint32 - VersionLo uint32 // This is OS dependent. 
- DataRaw [60]byte - Generation uint32 - FileACLLo uint32 - SizeHi uint32 - ObsoFaddr uint32 - - // OS dependent fields have been inlined here. - BlocksCountHi uint16 - FileACLHi uint16 - UIDHi uint16 - GIDHi uint16 - ChecksumLo uint16 - _ uint16 -} - -// Compiles only if InodeOld implements Inode. -var _ Inode = (*InodeOld)(nil) - -// Mode implements Inode.Mode. -func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } - -// UID implements Inode.UID. -func (in *InodeOld) UID() auth.KUID { - return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) -} - -// GID implements Inode.GID. -func (in *InodeOld) GID() auth.KGID { - return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) -} - -// Size implements Inode.Size. -func (in *InodeOld) Size() uint64 { - // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. - return uint64(in.SizeLo) -} - -// InodeSize implements Inode.InodeSize. -func (in *InodeOld) InodeSize() uint16 { return OldInodeSize } - -// AccessTime implements Inode.AccessTime. -func (in *InodeOld) AccessTime() time.Time { - return time.FromUnix(int64(in.AccessTimeRaw), 0) -} - -// ChangeTime implements Inode.ChangeTime. -func (in *InodeOld) ChangeTime() time.Time { - return time.FromUnix(int64(in.ChangeTimeRaw), 0) -} - -// ModificationTime implements Inode.ModificationTime. -func (in *InodeOld) ModificationTime() time.Time { - return time.FromUnix(int64(in.ModificationTimeRaw), 0) -} - -// DeletionTime implements Inode.DeletionTime. -func (in *InodeOld) DeletionTime() time.Time { - return time.FromUnix(int64(in.DeletionTimeRaw), 0) -} - -// LinksCount implements Inode.LinksCount. -func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } - -// Flags implements Inode.Flags. -func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } - -// Data implements Inode.Data. 
-func (in *InodeOld) Data() []byte { return in.DataRaw[:] } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go deleted file mode 100644 index 90744e956..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "fmt" - "strconv" - "testing" - - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -// TestInodeSize tests that the inode structs are of the correct size. -func TestInodeSize(t *testing.T) { - var iOld InodeOld - assertSize(t, &iOld, OldInodeSize) - - // This was updated from 156 bytes to 160 bytes in Oct 2015. - var iNew InodeNew - assertSize(t, &iNew, 160) -} - -// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in -// ext4 inode structs are decoded correctly. -// -// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. -func TestTimestampSeconds(t *testing.T) { - type timestampTest struct { - // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. - // If this is set then the 32-bit time is negative. - msbSet bool - - // lowerBound tells if we should take the lowest possible value of - // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to - // false it tells to take the highest possible value. 
- lowerBound bool - - // extraBits is InodeNew.[X]TimeExtra. - extraBits uint32 - - // want is the kernel time struct that is expected. - want time.Time - } - - tests := []timestampTest{ - // 1901-12-13 - { - msbSet: true, - lowerBound: true, - extraBits: 0, - want: time.FromUnix(int64(-0x80000000), 0), - }, - - // 1969-12-31 - { - msbSet: true, - lowerBound: false, - extraBits: 0, - want: time.FromUnix(int64(-1), 0), - }, - - // 1970-01-01 - { - msbSet: false, - lowerBound: true, - extraBits: 0, - want: time.FromUnix(int64(0), 0), - }, - - // 2038-01-19 - { - msbSet: false, - lowerBound: false, - extraBits: 0, - want: time.FromUnix(int64(0x7fffffff), 0), - }, - - // 2038-01-19 - { - msbSet: true, - lowerBound: true, - extraBits: 1, - want: time.FromUnix(int64(0x80000000), 0), - }, - - // 2106-02-07 - { - msbSet: true, - lowerBound: false, - extraBits: 1, - want: time.FromUnix(int64(0xffffffff), 0), - }, - - // 2106-02-07 - { - msbSet: false, - lowerBound: true, - extraBits: 1, - want: time.FromUnix(int64(0x100000000), 0), - }, - - // 2174-02-25 - { - msbSet: false, - lowerBound: false, - extraBits: 1, - want: time.FromUnix(int64(0x17fffffff), 0), - }, - - // 2174-02-25 - { - msbSet: true, - lowerBound: true, - extraBits: 2, - want: time.FromUnix(int64(0x180000000), 0), - }, - - // 2242-03-16 - { - msbSet: true, - lowerBound: false, - extraBits: 2, - want: time.FromUnix(int64(0x1ffffffff), 0), - }, - - // 2242-03-16 - { - msbSet: false, - lowerBound: true, - extraBits: 2, - want: time.FromUnix(int64(0x200000000), 0), - }, - - // 2310-04-04 - { - msbSet: false, - lowerBound: false, - extraBits: 2, - want: time.FromUnix(int64(0x27fffffff), 0), - }, - - // 2310-04-04 - { - msbSet: true, - lowerBound: true, - extraBits: 3, - want: time.FromUnix(int64(0x280000000), 0), - }, - - // 2378-04-22 - { - msbSet: true, - lowerBound: false, - extraBits: 3, - want: time.FromUnix(int64(0x2ffffffff), 0), - }, - - // 2378-04-22 - { - msbSet: false, - lowerBound: true, - extraBits: 
3, - want: time.FromUnix(int64(0x300000000), 0), - }, - - // 2446-05-10 - { - msbSet: false, - lowerBound: false, - extraBits: 3, - want: time.FromUnix(int64(0x37fffffff), 0), - }, - } - - lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 - upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 - lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 - upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 - - get32BitTime := func(test timestampTest) int32 { - if test.msbSet { - if test.lowerBound { - return lowerMSB1 - } - - return upperMSB1 - } - - if test.lowerBound { - return lowerMSB0 - } - - return upperMSB0 - } - - getTestName := func(test timestampTest) string { - return fmt.Sprintf( - "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", - strconv.FormatInt(int64(test.extraBits), 2), - test.msbSet, - test.lowerBound, - ) - } - - for _, test := range tests { - t.Run(getTestName(test), func(t *testing.T) { - if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { - t.Errorf("Expected: %v, Got: %v", test.want, got) - } - }) - } -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go deleted file mode 100644 index 70948ebe9..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" -) - -const ( - // SbOffset is the absolute offset at which the superblock is placed. - SbOffset = 1024 -) - -// SuperBlock should be implemented by structs representing the ext superblock. -// The superblock holds a lot of information about the enclosing filesystem. -// This interface aims to provide access methods to important information held -// by the superblock. It does NOT expose all fields of the superblock, only the -// ones necessary. This can be expanded when need be. -// -// Location and replication: -// - The superblock is located at offset 1024 in block group 0. -// - Redundant copies of the superblock and group descriptors are kept in -// all groups if SbSparse feature flag is NOT set. If it is set, the -// replicas only exist in groups whose group number is either 0 or a -// power of 3, 5, or 7. -// - There is also a sparse superblock feature v2 in which there are just -// two replicas saved in the block groups pointed by sb.s_backup_bgs. -// -// Replicas should eventually be updated if the superblock is updated. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. -type SuperBlock interface { - marshal.Marshallable - - // InodesCount returns the total number of inodes in this filesystem. - InodesCount() uint32 - - // BlocksCount returns the total number of data blocks in this filesystem. - BlocksCount() uint64 - - // FreeBlocksCount returns the number of free blocks in this filesystem. - FreeBlocksCount() uint64 - - // FreeInodesCount returns the number of free inodes in this filesystem. - FreeInodesCount() uint32 - - // MountCount returns the number of mounts since the last fsck. - MountCount() uint16 - - // MaxMountCount returns the number of mounts allowed beyond which a fsck is - // needed. 
- MaxMountCount() uint16 - - // FirstDataBlock returns the absolute block number of the first data block, - // which contains the super block itself. - // - // If the filesystem has 1kb data blocks then this should return 1. For all - // other configurations, this typically returns 0. - FirstDataBlock() uint32 - - // BlockSize returns the size of one data block in this filesystem. - // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that - // the smallest block size is 1kb. - BlockSize() uint64 - - // BlocksPerGroup returns the number of data blocks in a block group. - BlocksPerGroup() uint32 - - // ClusterSize returns block cluster size (set during mkfs time by admin). - // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that - // the smallest cluster size is 1kb. - // - // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature - // is NOT set and consequently BlockSize() = ClusterSize() in that case. - ClusterSize() uint64 - - // ClustersPerGroup returns: - // - number of clusters per group if bigalloc is enabled. - // - BlocksPerGroup() otherwise. - ClustersPerGroup() uint32 - - // InodeSize returns the size of the inode disk record size in bytes. Use this - // to iterate over inode arrays on disk. - // - // In ext2 and ext3: - // - Each inode had a disk record of 128 bytes. - // - The inode struct size was fixed at 128 bytes. - // - // In ext4 its possible to allocate larger on-disk inodes: - // - Inode disk record size = sb.s_inode_size (function return value). - // = 256 (default) - // - Inode struct size = 128 + inode.i_extra_isize. - // = 128 + 32 = 160 (default) - InodeSize() uint16 - - // InodesPerGroup returns the number of inodes in a block group. - InodesPerGroup() uint32 - - // BgDescSize returns the size of the block group descriptor struct. - // - // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor - // is only 32 bytes long. 
- // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST - // 64 bytes. It might be bigger than that. - BgDescSize() uint16 - - // CompatibleFeatures returns the CompatFeatures struct which holds all the - // compatible features this fs supports. - CompatibleFeatures() CompatFeatures - - // IncompatibleFeatures returns the CompatFeatures struct which holds all the - // incompatible features this fs supports. - IncompatibleFeatures() IncompatFeatures - - // ReadOnlyCompatibleFeatures returns the CompatFeatures struct which holds all the - // readonly compatible features this fs supports. - ReadOnlyCompatibleFeatures() RoCompatFeatures - - // Magic() returns the magic signature which must be 0xef53. - Magic() uint16 - - // Revision returns the superblock revision. Superblock struct fields from - // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. - Revision() SbRevision -} - -// SbRevision is the type for superblock revisions. -type SbRevision uint32 - -// Super block revisions. -const ( - // OldRev is the good old (original) format. - OldRev SbRevision = 0 - - // DynamicRev is v2 format w/ dynamic inode sizes. - DynamicRev SbRevision = 1 -) - -// Superblock compatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbDirPrealloc indicates directory preallocation. - SbDirPrealloc = 0x1 - - // SbHasJournal indicates the presence of a journal. jbd2 should only work - // with this being set. - SbHasJournal = 0x4 - - // SbExtAttr indicates extended attributes support. - SbExtAttr = 0x8 - - // SbResizeInode indicates that the fs has reserved GDT blocks (right after - // group descriptors) for fs expansion. - SbResizeInode = 0x10 - - // SbDirIndex indicates that the fs has directory indices. - SbDirIndex = 0x20 - - // SbSparseV2 stands for Sparse superblock version 2. - SbSparseV2 = 0x200 -) - -// CompatFeatures represents a superblock's compatible feature set. 
If the -// kernel does not understand any of these feature, it can still read/write -// to this fs. -type CompatFeatures struct { - DirPrealloc bool - HasJournal bool - ExtAttr bool - ResizeInode bool - DirIndex bool - SparseV2 bool -} - -// ToInt converts superblock compatible features back to its 32-bit rep. -func (f CompatFeatures) ToInt() uint32 { - var res uint32 - - if f.DirPrealloc { - res |= SbDirPrealloc - } - if f.HasJournal { - res |= SbHasJournal - } - if f.ExtAttr { - res |= SbExtAttr - } - if f.ResizeInode { - res |= SbResizeInode - } - if f.DirIndex { - res |= SbDirIndex - } - if f.SparseV2 { - res |= SbSparseV2 - } - - return res -} - -// CompatFeaturesFromInt converts the integer representation of superblock -// compatible features to CompatFeatures struct. -func CompatFeaturesFromInt(f uint32) CompatFeatures { - return CompatFeatures{ - DirPrealloc: f&SbDirPrealloc > 0, - HasJournal: f&SbHasJournal > 0, - ExtAttr: f&SbExtAttr > 0, - ResizeInode: f&SbResizeInode > 0, - DirIndex: f&SbDirIndex > 0, - SparseV2: f&SbSparseV2 > 0, - } -} - -// Superblock incompatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbDirentFileType indicates that directory entries record the file type. - // We should use struct DirentNew for dirents then. - SbDirentFileType = 0x2 - - // SbRecovery indicates that the filesystem needs recovery. - SbRecovery = 0x4 - - // SbJournalDev indicates that the filesystem has a separate journal device. - SbJournalDev = 0x8 - - // SbMetaBG indicates that the filesystem is using Meta block groups. Moves - // the group descriptors from the congested first block group into the first - // group of each metablock group to increase the maximum block groups limit - // and hence support much larger filesystems. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. - SbMetaBG = 0x10 - - // SbExtents indicates that the filesystem uses extents. 
Must be set in ext4 - // filesystems. - SbExtents = 0x40 - - // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. - // Hence can support 2^64 data blocks. - SbIs64Bit = 0x80 - - // SbMMP indicates that this filesystem has multiple mount protection. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. - SbMMP = 0x100 - - // SbFlexBg indicates that this filesystem has flexible block groups. Several - // block groups are tied into one logical block group so that all the metadata - // for the block groups (bitmaps and inode tables) are close together for - // faster loading. Consequently, large files will be continuous on disk. - // However, this does not affect the placement of redundant superblocks and - // group descriptors. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. - SbFlexBg = 0x200 - - // SbLargeDir shows that large directory enabled. Directory htree can be 3 - // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. - SbLargeDir = 0x4000 - - // SbInlineData allows inline data in inodes for really small files. - SbInlineData = 0x8000 - - // SbEncrypted indicates that this fs contains encrypted inodes. - SbEncrypted = 0x10000 -) - -// IncompatFeatures represents a superblock's incompatible feature set. If the -// kernel does not understand any of these feature, it should refuse to mount. -type IncompatFeatures struct { - DirentFileType bool - Recovery bool - JournalDev bool - MetaBG bool - Extents bool - Is64Bit bool - MMP bool - FlexBg bool - LargeDir bool - InlineData bool - Encrypted bool -} - -// ToInt converts superblock incompatible features back to its 32-bit rep. 
-func (f IncompatFeatures) ToInt() uint32 { - var res uint32 - - if f.DirentFileType { - res |= SbDirentFileType - } - if f.Recovery { - res |= SbRecovery - } - if f.JournalDev { - res |= SbJournalDev - } - if f.MetaBG { - res |= SbMetaBG - } - if f.Extents { - res |= SbExtents - } - if f.Is64Bit { - res |= SbIs64Bit - } - if f.MMP { - res |= SbMMP - } - if f.FlexBg { - res |= SbFlexBg - } - if f.LargeDir { - res |= SbLargeDir - } - if f.InlineData { - res |= SbInlineData - } - if f.Encrypted { - res |= SbEncrypted - } - - return res -} - -// IncompatFeaturesFromInt converts the integer representation of superblock -// incompatible features to IncompatFeatures struct. -func IncompatFeaturesFromInt(f uint32) IncompatFeatures { - return IncompatFeatures{ - DirentFileType: f&SbDirentFileType > 0, - Recovery: f&SbRecovery > 0, - JournalDev: f&SbJournalDev > 0, - MetaBG: f&SbMetaBG > 0, - Extents: f&SbExtents > 0, - Is64Bit: f&SbIs64Bit > 0, - MMP: f&SbMMP > 0, - FlexBg: f&SbFlexBg > 0, - LargeDir: f&SbLargeDir > 0, - InlineData: f&SbInlineData > 0, - Encrypted: f&SbEncrypted > 0, - } -} - -// Superblock readonly compatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbSparse indicates sparse superblocks. Only groups with number either 0 or - // a power of 3, 5, or 7 will have redundant copies of the superblock and - // block descriptors. - SbSparse = 0x1 - - // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. - SbLargeFile = 0x2 - - // SbHugeFile indicates that this fs contains files whose sizes are - // represented in units of logicals blocks, not 512-byte sectors. - SbHugeFile = 0x8 - - // SbGdtCsum indicates that group descriptors have checksums. - SbGdtCsum = 0x10 - - // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a - // 32,000 subdirectory limit. - SbDirNlink = 0x20 - - // SbExtraIsize indicates that large inodes exist on this filesystem. 
- SbExtraIsize = 0x40 - - // SbHasSnapshot indicates the existence of a snapshot. - SbHasSnapshot = 0x80 - - // SbQuota enables usage tracking for all quota types. - SbQuota = 0x100 - - // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation - // unit becomes a cluster rather than a data block. Then block bitmaps track - // clusters, not data blocks. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. - SbBigalloc = 0x200 - - // SbMetadataCsum indicates that the fs supports metadata checksumming. - SbMetadataCsum = 0x400 - - // SbReadOnly marks this filesystem as readonly. Should refuse to mount in - // read/write mode. - SbReadOnly = 0x1000 -) - -// RoCompatFeatures represents a superblock's readonly compatible feature set. -// If the kernel does not understand any of these feature, it can still mount -// readonly. But if the user wants to mount read/write, the kernel should -// refuse to mount. -type RoCompatFeatures struct { - Sparse bool - LargeFile bool - HugeFile bool - GdtCsum bool - DirNlink bool - ExtraIsize bool - HasSnapshot bool - Quota bool - Bigalloc bool - MetadataCsum bool - ReadOnly bool -} - -// ToInt converts superblock readonly compatible features to its 32-bit rep. -func (f RoCompatFeatures) ToInt() uint32 { - var res uint32 - - if f.Sparse { - res |= SbSparse - } - if f.LargeFile { - res |= SbLargeFile - } - if f.HugeFile { - res |= SbHugeFile - } - if f.GdtCsum { - res |= SbGdtCsum - } - if f.DirNlink { - res |= SbDirNlink - } - if f.ExtraIsize { - res |= SbExtraIsize - } - if f.HasSnapshot { - res |= SbHasSnapshot - } - if f.Quota { - res |= SbQuota - } - if f.Bigalloc { - res |= SbBigalloc - } - if f.MetadataCsum { - res |= SbMetadataCsum - } - if f.ReadOnly { - res |= SbReadOnly - } - - return res -} - -// RoCompatFeaturesFromInt converts the integer representation of superblock -// readonly compatible features to RoCompatFeatures struct. 
-func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { - return RoCompatFeatures{ - Sparse: f&SbSparse > 0, - LargeFile: f&SbLargeFile > 0, - HugeFile: f&SbHugeFile > 0, - GdtCsum: f&SbGdtCsum > 0, - DirNlink: f&SbDirNlink > 0, - ExtraIsize: f&SbExtraIsize > 0, - HasSnapshot: f&SbHasSnapshot > 0, - Quota: f&SbQuota > 0, - Bigalloc: f&SbBigalloc > 0, - MetadataCsum: f&SbMetadataCsum > 0, - ReadOnly: f&SbReadOnly > 0, - } -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go deleted file mode 100644 index 4dc6080fb..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if -// RevLevel = DynamicRev and 64-bit feature is disabled. -// -// +marshal -type SuperBlock32Bit struct { - // We embed the old superblock struct here because the 32-bit version is just - // an extension of the old version. 
- SuperBlockOld - - FirstInode uint32 - InodeSizeRaw uint16 - BlockGroupNumber uint16 - FeatureCompat uint32 - FeatureIncompat uint32 - FeatureRoCompat uint32 - UUID [16]byte - VolumeName [16]byte - LastMounted [64]byte - AlgoUsageBitmap uint32 - PreallocBlocks uint8 - PreallocDirBlocks uint8 - ReservedGdtBlocks uint16 - JournalUUID [16]byte - JournalInum uint32 - JournalDev uint32 - LastOrphan uint32 - HashSeed [4]uint32 - DefaultHashVersion uint8 - JnlBackupType uint8 - BgDescSizeRaw uint16 - DefaultMountOpts uint32 - FirstMetaBg uint32 - MkfsTime uint32 - JnlBlocks [17]uint32 -} - -// Compiles only if SuperBlock32Bit implements SuperBlock. -var _ SuperBlock = (*SuperBlock32Bit)(nil) - -// Only override methods which change based on the additional fields above. -// Not overriding SuperBlock.BgDescSize because it would still return 32 here. - -// InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlock32Bit) InodeSize() uint16 { - return sb.InodeSizeRaw -} - -// CompatibleFeatures implements SuperBlock.CompatibleFeatures. -func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { - return CompatFeaturesFromInt(sb.FeatureCompat) -} - -// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. -func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { - return IncompatFeaturesFromInt(sb.FeatureIncompat) -} - -// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. -func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { - return RoCompatFeaturesFromInt(sb.FeatureRoCompat) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go deleted file mode 100644 index 2c9039327..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly -// 1024 bytes (smallest possible block size) and hence the superblock always -// fits in no more than one data block. Should only be used when the 64-bit -// feature is set. -// -// +marshal -type SuperBlock64Bit struct { - // We embed the 32-bit struct here because 64-bit version is just an extension - // of the 32-bit version. 
- SuperBlock32Bit - - BlocksCountHi uint32 - ReservedBlocksCountHi uint32 - FreeBlocksCountHi uint32 - MinInodeSize uint16 - WantInodeSize uint16 - Flags uint32 - RaidStride uint16 - MmpInterval uint16 - MmpBlock uint64 - RaidStripeWidth uint32 - LogGroupsPerFlex uint8 - ChecksumType uint8 - _ uint16 - KbytesWritten uint64 - SnapshotInum uint32 - SnapshotID uint32 - SnapshotRsrvBlocksCount uint64 - SnapshotList uint32 - ErrorCount uint32 - FirstErrorTime uint32 - FirstErrorInode uint32 - FirstErrorBlock uint64 - FirstErrorFunction [32]byte - FirstErrorLine uint32 - LastErrorTime uint32 - LastErrorInode uint32 - LastErrorLine uint32 - LastErrorBlock uint64 - LastErrorFunction [32]byte - MountOpts [64]byte - UserQuotaInum uint32 - GroupQuotaInum uint32 - OverheadBlocks uint32 - BackupBgs [2]uint32 - EncryptAlgos [4]uint8 - EncryptPwSalt [16]uint8 - LostFoundInode uint32 - ProjectQuotaInode uint32 - ChecksumSeed uint32 - WtimeHi uint8 - MtimeHi uint8 - MkfsTimeHi uint8 - LastCheckHi uint8 - FirstErrorTimeHi uint8 - LastErrorTimeHi uint8 - _ [2]uint8 - Encoding uint16 - EncodingFlags uint16 - _ [95]uint32 - Checksum uint32 -} - -// Compiles only if SuperBlock64Bit implements SuperBlock. -var _ SuperBlock = (*SuperBlock64Bit)(nil) - -// Only override methods which change based on the 64-bit feature. - -// BlocksCount implements SuperBlock.BlocksCount. -func (sb *SuperBlock64Bit) BlocksCount() uint64 { - return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) -} - -// FreeBlocksCount implements SuperBlock.FreeBlocksCount. -func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { - return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) -} - -// BgDescSize implements SuperBlock.BgDescSize. 
-func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go deleted file mode 100644 index e4709f23c..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlockOld implements SuperBlock and represents the old version of the -// superblock struct. Should be used only if RevLevel = OldRev. -// -// +marshal -type SuperBlockOld struct { - InodesCountRaw uint32 - BlocksCountLo uint32 - ReservedBlocksCount uint32 - FreeBlocksCountLo uint32 - FreeInodesCountRaw uint32 - FirstDataBlockRaw uint32 - LogBlockSize uint32 - LogClusterSize uint32 - BlocksPerGroupRaw uint32 - ClustersPerGroupRaw uint32 - InodesPerGroupRaw uint32 - Mtime uint32 - Wtime uint32 - MountCountRaw uint16 - MaxMountCountRaw uint16 - MagicRaw uint16 - State uint16 - Errors uint16 - MinorRevLevel uint16 - LastCheck uint32 - CheckInterval uint32 - CreatorOS uint32 - RevLevel uint32 - DefResUID uint16 - DefResGID uint16 -} - -// Compiles only if SuperBlockOld implements SuperBlock. -var _ SuperBlock = (*SuperBlockOld)(nil) - -// InodesCount implements SuperBlock.InodesCount. -func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } - -// BlocksCount implements SuperBlock.BlocksCount. 
-func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } - -// FreeBlocksCount implements SuperBlock.FreeBlocksCount. -func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } - -// FreeInodesCount implements SuperBlock.FreeInodesCount. -func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } - -// MountCount implements SuperBlock.MountCount. -func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } - -// MaxMountCount implements SuperBlock.MaxMountCount. -func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } - -// FirstDataBlock implements SuperBlock.FirstDataBlock. -func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } - -// BlockSize implements SuperBlock.BlockSize. -func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } - -// BlocksPerGroup implements SuperBlock.BlocksPerGroup. -func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } - -// ClusterSize implements SuperBlock.ClusterSize. -func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } - -// ClustersPerGroup implements SuperBlock.ClustersPerGroup. -func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } - -// InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlockOld) InodeSize() uint16 { return OldInodeSize } - -// InodesPerGroup implements SuperBlock.InodesPerGroup. -func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } - -// BgDescSize implements SuperBlock.BgDescSize. -func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } - -// CompatibleFeatures implements SuperBlock.CompatibleFeatures. -func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } - -// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. 
-func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } - -// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. -func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } - -// Magic implements SuperBlock.Magic. -func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } - -// Revision implements SuperBlock.Revision. -func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go deleted file mode 100644 index b734b6987..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestSuperBlockSize tests that the superblock structs are of the correct -// size. 
-func TestSuperBlockSize(t *testing.T) { - var sbOld SuperBlockOld - assertSize(t, &sbOld, 84) - var sb32 SuperBlock32Bit - assertSize(t, &sb32, 336) - var sb64 SuperBlock64Bit - assertSize(t, &sb64, 1024) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go deleted file mode 100644 index a4bc08411..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "reflect" - "testing" - - "gvisor.dev/gvisor/pkg/marshal" -) - -func assertSize(t *testing.T, v marshal.Marshallable, want int) { - t.Helper() - - if got := v.SizeBytes(); got != want { - t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) - } -} diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go deleted file mode 100644 index 38fb7962b..000000000 --- a/pkg/sentry/fsimpl/ext/ext.go +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ext implements readonly ext(2/3/4) filesystems. -package ext - -import ( - "errors" - "fmt" - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// Name is the name of this filesystem. -const Name = "ext" - -// FilesystemType implements vfs.FilesystemType. -// -// +stateify savable -type FilesystemType struct{} - -// getDeviceFd returns an io.ReaderAt to the underlying device. -// Currently there are two ways of mounting an ext(2/3/4) fs: -// 1. Specify a mount with our internal special MountType in the OCI spec. -// 2. Expose the device to the container and mount it from application layer. -func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) { - if opts.InternalData == nil { - // User mount call. - // TODO(b/134676337): Open the device specified by `source` and return that. - panic("unimplemented") - } - - // GetFilesystem call originated from within the sentry. 
- devFd, ok := opts.InternalData.(int) - if !ok { - return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") - } - - if devFd < 0 { - return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd) - } - - // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership - // of the file descriptor and hence will not close it when it is garbage - // collected. - return fd.NewReadWriter(devFd), nil -} - -// isCompatible checks if the superblock has feature sets which are compatible. -// We only need to check the superblock incompatible feature set since we are -// mounting readonly. We will also need to check readonly compatible feature -// set when mounting for read/write. -func isCompatible(sb disklayout.SuperBlock) bool { - // Please note that what is being checked is limited based on the fact that we - // are mounting readonly and that we are not journaling. When mounting - // read/write or with a journal, this must be reevaluated. - incompatFeatures := sb.IncompatibleFeatures() - if incompatFeatures.MetaBG { - log.Warningf("ext fs: meta block groups are not supported") - return false - } - if incompatFeatures.MMP { - log.Warningf("ext fs: multiple mount protection is not supported") - return false - } - if incompatFeatures.Encrypted { - log.Warningf("ext fs: encrypted inodes not supported") - return false - } - if incompatFeatures.InlineData { - log.Warningf("ext fs: inline files not supported") - return false - } - return true -} - -// Name implements vfs.FilesystemType.Name. -func (FilesystemType) Name() string { - return Name -} - -// Release implements vfs.FilesystemType.Release. -func (FilesystemType) Release(ctx context.Context) {} - -// GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
-func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - // TODO(b/134676337): Ensure that the user is mounting readonly. If not, - // EACCESS should be returned according to mount(2). Filesystem independent - // flags (like readonly) are currently not available in pkg/sentry/vfs. - - devMinor, err := vfsObj.GetAnonBlockDevMinor() - if err != nil { - return nil, nil, err - } - - dev, err := getDeviceFd(source, opts) - if err != nil { - return nil, nil, err - } - - fs := filesystem{ - dev: dev, - inodeCache: make(map[uint32]*inode), - devMinor: devMinor, - } - fs.vfsfs.Init(vfsObj, &fsType, &fs) - fs.sb, err = readSuperBlock(dev) - if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - - if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { - // mount(2) specifies that EINVAL should be returned if the superblock is - // invalid. - fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EINVAL - } - - // Refuse to mount if the filesystem is incompatible. - if !isCompatible(fs.sb) { - fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EINVAL - } - - fs.bgs, err = readBlockGroups(dev, fs.sb) - if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - - rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) - if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - rootInode.incRef() - - return &fs.vfsfs, &newDentry(rootInode).vfsd, nil -} diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go deleted file mode 100644 index db712e71f..000000000 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ /dev/null @@ -1,926 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "fmt" - "io" - "os" - "path" - "sort" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/test/testutil" - "gvisor.dev/gvisor/pkg/usermem" -) - -const ( - assetsDir = "pkg/sentry/fsimpl/ext/assets" -) - -var ( - ext2ImagePath = path.Join(assetsDir, "tiny.ext2") - ext3ImagePath = path.Join(assetsDir, "tiny.ext3") - ext4ImagePath = path.Join(assetsDir, "tiny.ext4") -) - -// setUp opens imagePath as an ext Filesystem and returns all necessary -// elements required to run tests. If error is non-nil, it also returns a tear -// down function which must be called after the test is run for clean up. -func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { - localImagePath, err := testutil.FindFile(imagePath) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err) - } - - f, err := os.Open(localImagePath) - if err != nil { - return nil, nil, nil, nil, err - } - - ctx := contexttest.Context(t) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. 
- vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(ctx); err != nil { - t.Fatalf("VFS init: %v", err) - } - vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - InternalData: int(f.Fd()), - }, - }) - if err != nil { - f.Close() - return nil, nil, nil, nil, err - } - - root := mntns.Root() - root.IncRef() - - tearDown := func() { - root.DecRef(ctx) - - if err := f.Close(); err != nil { - t.Fatalf("tearDown failed: %v", err) - } - } - return ctx, vfsObj, &root, tearDown, nil -} - -// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and -// vfs.FilesystemImpl.StatFSAt which are not implemented in -// vfs.VirtualFilesystem yet. - -// TestSeek tests vfs.FileDescriptionImpl.Seek functionality. -func TestSeek(t *testing.T) { - type seekTest struct { - name string - image string - path string - } - - tests := []seekTest{ - { - name: "ext4 root dir seek", - image: ext4ImagePath, - path: "/", - }, - { - name: "ext3 root dir seek", - image: ext3ImagePath, - path: "/", - }, - { - name: "ext2 root dir seek", - image: ext2ImagePath, - path: "/", - }, - { - name: "ext4 reg file seek", - image: ext4ImagePath, - path: "/file.txt", - }, - { - name: "ext3 reg file seek", - image: ext3ImagePath, - path: "/file.txt", - }, - { - name: "ext2 reg file seek", - image: ext2ImagePath, - path: "/file.txt", - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fd, err := vfsfs.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } - 
- if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { - t.Errorf("expected seek position 0, got %d and error %v", n, err) - } - - stat, err := fd.Stat(ctx, vfs.StatOptions{}) - if err != nil { - t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) - } - - // We should be able to seek beyond the end of file. - size := int64(stat.Size) - if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size, n, err) - } - - // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); !linuxerr.Equals(linuxerr.EINVAL, err) { - t.Errorf("expected error EINVAL but got %v", err) - } - - if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) - } - - // Make sure negative offsets work with SEEK_CUR. - if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) - } - - // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); !linuxerr.Equals(linuxerr.EINVAL, err) { - t.Errorf("expected error EINVAL but got %v", err) - } - - // Make sure SEEK_END works with regular files. - if _, ok := fd.Impl().(*regularFileFD); ok { - // Seek back to 0. - if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) - } - - // Seek forward beyond EOF. - if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) - } - - // EINVAL should be returned if the resulting offset is negative. 
- if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); !linuxerr.Equals(linuxerr.EINVAL, err) { - t.Errorf("expected error EINVAL but got %v", err) - } - } - }) - } -} - -// TestStatAt tests filesystem.StatAt functionality. -func TestStatAt(t *testing.T) { - type statAtTest struct { - name string - image string - path string - want linux.Statx - } - - tests := []statAtTest{ - { - name: "ext4 statx small file", - image: ext4ImagePath, - path: "/file.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13, - }, - }, - { - name: "ext3 statx small file", - image: ext3ImagePath, - path: "/file.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13, - }, - }, - { - name: "ext2 statx small file", - image: ext2ImagePath, - path: "/file.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13, - }, - }, - { - name: "ext4 statx big file", - image: ext4ImagePath, - path: "/bigfile.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13042, - }, - }, - { - name: "ext3 statx big file", - image: ext3ImagePath, - path: "/bigfile.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13042, - }, - }, - { - name: "ext2 statx big file", - image: ext2ImagePath, - path: "/bigfile.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13042, - }, - }, - { - name: "ext4 statx symlink file", - image: ext4ImagePath, - path: "/symlink.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0777 | linux.ModeSymlink, - Size: 8, - }, - }, - { - name: "ext3 statx symlink file", - image: ext3ImagePath, - path: "/symlink.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - 
UID: 0, - GID: 0, - Mode: 0777 | linux.ModeSymlink, - Size: 8, - }, - }, - { - name: "ext2 statx symlink file", - image: ext2ImagePath, - path: "/symlink.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0777 | linux.ModeSymlink, - Size: 8, - }, - }, - } - - // Ignore the fields that are not supported by filesystem.StatAt yet and - // those which are likely to change as the image does. - ignoredFields := map[string]bool{ - "Attributes": true, - "AttributesMask": true, - "Atime": true, - "Blocks": true, - "Btime": true, - "Ctime": true, - "DevMajor": true, - "DevMinor": true, - "Ino": true, - "Mask": true, - "Mtime": true, - "RdevMajor": true, - "RdevMinor": true, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - got, err := vfsfs.StatAt(ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, - &vfs.StatOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err) - } - - cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { - _, ok := ignoredFields[p.String()] - return ok - }, cmp.Ignore()) - if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" { - t.Errorf("stat mismatch (-want +got):\n%s", diff) - } - }) - } -} - -// TestRead tests the read functionality for vfs file descriptions. 
-func TestRead(t *testing.T) { - type readTest struct { - name string - image string - absPath string - } - - tests := []readTest{ - { - name: "ext4 read small file", - image: ext4ImagePath, - absPath: "/file.txt", - }, - { - name: "ext3 read small file", - image: ext3ImagePath, - absPath: "/file.txt", - }, - { - name: "ext2 read small file", - image: ext2ImagePath, - absPath: "/file.txt", - }, - { - name: "ext4 read big file", - image: ext4ImagePath, - absPath: "/bigfile.txt", - }, - { - name: "ext3 read big file", - image: ext3ImagePath, - absPath: "/bigfile.txt", - }, - { - name: "ext2 read big file", - image: ext2ImagePath, - absPath: "/bigfile.txt", - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fd, err := vfsfs.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } - - // Get a local file descriptor and compare its functionality with a vfs file - // description for the same file. - localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath)) - if err != nil { - t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err) - } - - f, err := os.Open(localFile) - if err != nil { - t.Fatalf("os.Open failed for %s: %v", localFile, err) - } - defer f.Close() - - // Read the entire file by reading one byte repeatedly. Doing this stress - // tests the underlying file reader implementation. - got := make([]byte, 1) - want := make([]byte, 1) - for { - n, err := f.Read(want) - fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) - - if diff := cmp.Diff(got, want); diff != "" { - t.Errorf("file data mismatch (-want +got):\n%s", diff) - } - - // Make sure there is no more file data left after getting EOF. 
- if n == 0 || err == io.EOF { - if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { - t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) - } - - break - } - - if err != nil { - t.Fatalf("read failed: %v", err) - } - } - }) - } -} - -// iterDirentsCb is a simple callback which just keeps adding the dirents to an -// internal list. Implements vfs.IterDirentsCallback. -type iterDirentsCb struct { - dirents []vfs.Dirent -} - -// Compiles only if iterDirentCb implements vfs.IterDirentsCallback. -var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil) - -// newIterDirentsCb is the iterDirent -func newIterDirentCb() *iterDirentsCb { - return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)} -} - -// Handle implements vfs.IterDirentsCallback.Handle. -func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error { - cb.dirents = append(cb.dirents, dirent) - return nil -} - -// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. 
-func TestIterDirents(t *testing.T) { - type iterDirentTest struct { - name string - image string - path string - want []vfs.Dirent - } - - wantDirents := []vfs.Dirent{ - { - Name: ".", - Type: linux.DT_DIR, - }, - { - Name: "..", - Type: linux.DT_DIR, - }, - { - Name: "lost+found", - Type: linux.DT_DIR, - }, - { - Name: "file.txt", - Type: linux.DT_REG, - }, - { - Name: "bigfile.txt", - Type: linux.DT_REG, - }, - { - Name: "symlink.txt", - Type: linux.DT_LNK, - }, - } - tests := []iterDirentTest{ - { - name: "ext4 root dir iteration", - image: ext4ImagePath, - path: "/", - want: wantDirents, - }, - { - name: "ext3 root dir iteration", - image: ext3ImagePath, - path: "/", - want: wantDirents, - }, - { - name: "ext2 root dir iteration", - image: ext2ImagePath, - path: "/", - want: wantDirents, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fd, err := vfsfs.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } - - cb := &iterDirentsCb{} - if err = fd.IterDirents(ctx, cb); err != nil { - t.Fatalf("dir fd.IterDirents() failed: %v", err) - } - - sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name }) - sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name }) - - // Ignore the inode number and offset of dirents because those are likely to - // change as the underlying image changes. 
- cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { - return p.String() == "Ino" || p.String() == "NextOff" - }, cmp.Ignore()) - if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" { - t.Errorf("dirents mismatch (-want +got):\n%s", diff) - } - }) - } -} - -// TestRootDir tests that the root directory inode is correctly initialized and -// returned from setUp. -func TestRootDir(t *testing.T) { - type inodeProps struct { - Mode linux.FileMode - UID auth.KUID - GID auth.KGID - Size uint64 - InodeSize uint16 - Links uint16 - Flags disklayout.InodeFlags - } - - type rootDirTest struct { - name string - image string - wantInode inodeProps - } - - tests := []rootDirTest{ - { - name: "ext4 root dir", - image: ext4ImagePath, - wantInode: inodeProps{ - Mode: linux.ModeDirectory | 0755, - Size: 0x400, - InodeSize: 0x80, - Links: 3, - Flags: disklayout.InodeFlags{Extents: true}, - }, - }, - { - name: "ext3 root dir", - image: ext3ImagePath, - wantInode: inodeProps{ - Mode: linux.ModeDirectory | 0755, - Size: 0x400, - InodeSize: 0x80, - Links: 3, - }, - }, - { - name: "ext2 root dir", - image: ext2ImagePath, - wantInode: inodeProps{ - Mode: linux.ModeDirectory | 0755, - Size: 0x400, - InodeSize: 0x80, - Links: 3, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - _, _, vd, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - d, ok := vd.Dentry().Impl().(*dentry) - if !ok { - t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl()) - } - - // Offload inode contents into local structs for comparison. 
- gotInode := inodeProps{ - Mode: d.inode.diskInode.Mode(), - UID: d.inode.diskInode.UID(), - GID: d.inode.diskInode.GID(), - Size: d.inode.diskInode.Size(), - InodeSize: d.inode.diskInode.InodeSize(), - Links: d.inode.diskInode.LinksCount(), - Flags: d.inode.diskInode.Flags(), - } - - if diff := cmp.Diff(gotInode, test.wantInode); diff != "" { - t.Errorf("inode mismatch (-want +got):\n%s", diff) - } - }) - } -} - -// TestFilesystemInit tests that the filesystem superblock and block group -// descriptors are correctly read in and initialized. -func TestFilesystemInit(t *testing.T) { - // sb only contains the immutable properties of the superblock. - type sb struct { - InodesCount uint32 - BlocksCount uint64 - MaxMountCount uint16 - FirstDataBlock uint32 - BlockSize uint64 - BlocksPerGroup uint32 - ClusterSize uint64 - ClustersPerGroup uint32 - InodeSize uint16 - InodesPerGroup uint32 - BgDescSize uint16 - Magic uint16 - Revision disklayout.SbRevision - CompatFeatures disklayout.CompatFeatures - IncompatFeatures disklayout.IncompatFeatures - RoCompatFeatures disklayout.RoCompatFeatures - } - - // bg only contains the immutable properties of the block group descriptor. 
- type bg struct { - InodeTable uint64 - BlockBitmap uint64 - InodeBitmap uint64 - ExclusionBitmap uint64 - Flags disklayout.BGFlags - } - - type fsInitTest struct { - name string - image string - wantSb sb - wantBgs []bg - } - - tests := []fsInitTest{ - { - name: "ext4 filesystem init", - image: ext4ImagePath, - wantSb: sb{ - InodesCount: 0x10, - BlocksCount: 0x40, - MaxMountCount: 0xffff, - FirstDataBlock: 0x1, - BlockSize: 0x400, - BlocksPerGroup: 0x2000, - ClusterSize: 0x400, - ClustersPerGroup: 0x2000, - InodeSize: 0x80, - InodesPerGroup: 0x10, - BgDescSize: 0x40, - Magic: linux.EXT_SUPER_MAGIC, - Revision: disklayout.DynamicRev, - CompatFeatures: disklayout.CompatFeatures{ - ExtAttr: true, - ResizeInode: true, - DirIndex: true, - }, - IncompatFeatures: disklayout.IncompatFeatures{ - DirentFileType: true, - Extents: true, - Is64Bit: true, - FlexBg: true, - }, - RoCompatFeatures: disklayout.RoCompatFeatures{ - Sparse: true, - LargeFile: true, - HugeFile: true, - DirNlink: true, - ExtraIsize: true, - MetadataCsum: true, - }, - }, - wantBgs: []bg{ - { - InodeTable: 0x23, - BlockBitmap: 0x3, - InodeBitmap: 0x13, - Flags: disklayout.BGFlags{ - InodeZeroed: true, - }, - }, - }, - }, - { - name: "ext3 filesystem init", - image: ext3ImagePath, - wantSb: sb{ - InodesCount: 0x10, - BlocksCount: 0x40, - MaxMountCount: 0xffff, - FirstDataBlock: 0x1, - BlockSize: 0x400, - BlocksPerGroup: 0x2000, - ClusterSize: 0x400, - ClustersPerGroup: 0x2000, - InodeSize: 0x80, - InodesPerGroup: 0x10, - BgDescSize: 0x20, - Magic: linux.EXT_SUPER_MAGIC, - Revision: disklayout.DynamicRev, - CompatFeatures: disklayout.CompatFeatures{ - ExtAttr: true, - ResizeInode: true, - DirIndex: true, - }, - IncompatFeatures: disklayout.IncompatFeatures{ - DirentFileType: true, - }, - RoCompatFeatures: disklayout.RoCompatFeatures{ - Sparse: true, - LargeFile: true, - }, - }, - wantBgs: []bg{ - { - InodeTable: 0x5, - BlockBitmap: 0x3, - InodeBitmap: 0x4, - Flags: disklayout.BGFlags{ - InodeZeroed: true, 
- }, - }, - }, - }, - { - name: "ext2 filesystem init", - image: ext2ImagePath, - wantSb: sb{ - InodesCount: 0x10, - BlocksCount: 0x40, - MaxMountCount: 0xffff, - FirstDataBlock: 0x1, - BlockSize: 0x400, - BlocksPerGroup: 0x2000, - ClusterSize: 0x400, - ClustersPerGroup: 0x2000, - InodeSize: 0x80, - InodesPerGroup: 0x10, - BgDescSize: 0x20, - Magic: linux.EXT_SUPER_MAGIC, - Revision: disklayout.DynamicRev, - CompatFeatures: disklayout.CompatFeatures{ - ExtAttr: true, - ResizeInode: true, - DirIndex: true, - }, - IncompatFeatures: disklayout.IncompatFeatures{ - DirentFileType: true, - }, - RoCompatFeatures: disklayout.RoCompatFeatures{ - Sparse: true, - LargeFile: true, - }, - }, - wantBgs: []bg{ - { - InodeTable: 0x5, - BlockBitmap: 0x3, - InodeBitmap: 0x4, - Flags: disklayout.BGFlags{ - InodeZeroed: true, - }, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - _, _, vd, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fs, ok := vd.Mount().Filesystem().Impl().(*filesystem) - if !ok { - t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl()) - } - - // Offload superblock and block group descriptors contents into - // local structs for comparison. 
- totalFreeInodes := uint32(0) - totalFreeBlocks := uint64(0) - gotSb := sb{ - InodesCount: fs.sb.InodesCount(), - BlocksCount: fs.sb.BlocksCount(), - MaxMountCount: fs.sb.MaxMountCount(), - FirstDataBlock: fs.sb.FirstDataBlock(), - BlockSize: fs.sb.BlockSize(), - BlocksPerGroup: fs.sb.BlocksPerGroup(), - ClusterSize: fs.sb.ClusterSize(), - ClustersPerGroup: fs.sb.ClustersPerGroup(), - InodeSize: fs.sb.InodeSize(), - InodesPerGroup: fs.sb.InodesPerGroup(), - BgDescSize: fs.sb.BgDescSize(), - Magic: fs.sb.Magic(), - Revision: fs.sb.Revision(), - CompatFeatures: fs.sb.CompatibleFeatures(), - IncompatFeatures: fs.sb.IncompatibleFeatures(), - RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(), - } - gotNumBgs := len(fs.bgs) - gotBgs := make([]bg, gotNumBgs) - for i := 0; i < gotNumBgs; i++ { - gotBgs[i].InodeTable = fs.bgs[i].InodeTable() - gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap() - gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap() - gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap() - gotBgs[i].Flags = fs.bgs[i].Flags() - - totalFreeInodes += fs.bgs[i].FreeInodesCount() - totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount()) - } - - if diff := cmp.Diff(gotSb, test.wantSb); diff != "" { - t.Errorf("superblock mismatch (-want +got):\n%s", diff) - } - - if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" { - t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff) - } - - if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" { - t.Errorf("total free inodes mismatch (-want +got):\n%s", diff) - } - - if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" { - t.Errorf("total free blocks mismatch (-want +got):\n%s", diff) - } - }) - } -} diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go deleted file mode 100644 index 778460107..000000000 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - "sort" - - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/syserror" -) - -// extentFile is a type of regular file which uses extents to store file data. -// -// +stateify savable -type extentFile struct { - regFile regularFile - - // root is the root extent node. This lives in the 60 byte diskInode.Data(). - // Immutable. - root disklayout.ExtentNode -} - -// Compiles only if extentFile implements io.ReaderAt. -var _ io.ReaderAt = (*extentFile)(nil) - -// newExtentFile is the extent file constructor. It reads the entire extent -// tree into memory. -// TODO(b/134676337): Build extent tree on demand to reduce memory usage. -func newExtentFile(args inodeArgs) (*extentFile, error) { - file := &extentFile{} - file.regFile.impl = file - file.regFile.inode.init(args, &file.regFile) - err := file.buildExtTree() - if err != nil { - return nil, err - } - return file, nil -} - -// buildExtTree builds the extent tree by reading it from disk by doing -// running a simple DFS. It first reads the root node from the inode struct in -// memory. Then it recursively builds the rest of the tree by reading it off -// disk. -// -// Precondition: inode flag InExtents must be set. 
-func (f *extentFile) buildExtTree() error { - rootNodeData := f.regFile.inode.diskInode.Data() - - f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize]) - - // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. - if f.root.Header.NumEntries > 4 { - // read(2) specifies that EINVAL should be returned if the file is unsuitable - // for reading. - return syserror.EINVAL - } - - f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries) - for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { - var curEntry disklayout.ExtentEntry - if f.root.Header.Height == 0 { - // Leaf node. - curEntry = &disklayout.Extent{} - } else { - // Internal node. - curEntry = &disklayout.ExtentIdx{} - } - curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize]) - f.root.Entries[i].Entry = curEntry - } - - // If this node is internal, perform DFS. - if f.root.Header.Height > 0 { - for i := uint16(0); i < f.root.Header.NumEntries; i++ { - var err error - if f.root.Entries[i].Node, err = f.buildExtTreeFromDisk(f.root.Entries[i].Entry); err != nil { - return err - } - } - } - - return nil -} - -// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively -// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to -// by the ExtentEntry. 
-func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) { - var header disklayout.ExtentHeader - off := entry.PhysicalBlock() * f.regFile.inode.blkSize - err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header) - if err != nil { - return nil, err - } - - entries := make([]disklayout.ExtentEntryPair, header.NumEntries) - for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { - var curEntry disklayout.ExtentEntry - if header.Height == 0 { - // Leaf node. - curEntry = &disklayout.Extent{} - } else { - // Internal node. - curEntry = &disklayout.ExtentIdx{} - } - - err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry) - if err != nil { - return nil, err - } - entries[i].Entry = curEntry - } - - // If this node is internal, perform DFS. - if header.Height > 0 { - for i := uint16(0); i < header.NumEntries; i++ { - var err error - entries[i].Node, err = f.buildExtTreeFromDisk(entries[i].Entry) - if err != nil { - return nil, err - } - } - } - - return &disklayout.ExtentNode{header, entries}, nil -} - -// ReadAt implements io.ReaderAt.ReadAt. -func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) { - if len(dst) == 0 { - return 0, nil - } - - if off < 0 { - return 0, syserror.EINVAL - } - - if uint64(off) >= f.regFile.inode.diskInode.Size() { - return 0, io.EOF - } - - n, err := f.read(&f.root, uint64(off), dst) - if n < len(dst) && err == nil { - err = io.EOF - } - return n, err -} - -// read is the recursive step of extentFile.ReadAt which traverses the extent -// tree from the node passed and reads file data. -func (f *extentFile) read(node *disklayout.ExtentNode, off uint64, dst []byte) (int, error) { - // Perform a binary search for the node covering bytes starting at r.fileOff. - // A highly fragmented filesystem can have upto 340 entries and so linear - // search should be avoided. 
Finds the first entry which does not cover the - // file block we want and subtracts 1 to get the desired index. - fileBlk := uint32(off / f.regFile.inode.blkSize) - n := len(node.Entries) - found := sort.Search(n, func(i int) bool { - return node.Entries[i].Entry.FileBlock() > fileBlk - }) - 1 - - // We should be in this recursive step only if the data we want exists under - // the current node. - if found < 0 { - panic("searching for a file block in an extent entry which does not cover it") - } - - read := 0 - toRead := len(dst) - var curR int - var err error - for i := found; i < n && read < toRead; i++ { - if node.Header.Height == 0 { - curR, err = f.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), off, dst[read:]) - } else { - curR, err = f.read(node.Entries[i].Node, off, dst[read:]) - } - - read += curR - off += uint64(curR) - if err != nil { - return read, err - } - } - - return read, nil -} - -// readFromExtent reads file data from the extent. It takes advantage of the -// sequential nature of extents and reads file data from multiple blocks in one -// call. -// -// A non-nil error indicates that this is a partial read and there is probably -// more to read from this extent. The caller should propagate the error upward -// and not move to the next extent in the tree. -// -// A subsequent call to extentReader.Read should continue reading from where we -// left off as expected. -func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byte) (int, error) { - curFileBlk := uint32(off / f.regFile.inode.blkSize) - exFirstFileBlk := ex.FileBlock() - exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive. - - // We should be in this recursive step only if the data we want exists under - // the current extent. 
- if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk { - panic("searching for a file block in an extent which does not cover it") - } - - curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock() - readStart := curPhyBlk*f.regFile.inode.blkSize + (off % f.regFile.inode.blkSize) - - endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length) - extentEnd := endPhyBlk * f.regFile.inode.blkSize // This is exclusive. - - toRead := int(extentEnd - readStart) - if len(dst) < toRead { - toRead = len(dst) - } - - n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart)) - if n < toRead { - return n, syserror.EIO - } - return n, nil -} diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go deleted file mode 100644 index 985f76ac0..000000000 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "bytes" - "math/rand" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" -) - -const ( - // mockExtentBlkSize is the mock block size used for testing. - // No block has more than 1 header + 4 entries. 
- mockExtentBlkSize = uint64(64) -) - -// The tree described below looks like: -// -// 0.{Head}[Idx][Idx] -// / \ -// / \ -// 1.{Head}[Ext][Ext] 2.{Head}[Idx] -// / | \ -// [Phy] [Phy, Phy] 3.{Head}[Ext] -// | -// [Phy, Phy, Phy] -// -// Legend: -// - Head = ExtentHeader -// - Idx = ExtentIdx -// - Ext = Extent -// - Phy = Physical Block -// -// Please note that ext4 might not construct extent trees looking like this. -// This is purely for testing the tree traversal logic. -var ( - node3 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 1, - MaxEntries: 4, - Height: 0, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.Extent{ - FirstFileBlock: 3, - Length: 3, - StartBlockLo: 6, - }, - Node: nil, - }, - }, - } - - node2 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 1, - MaxEntries: 4, - Height: 1, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.ExtentIdx{ - FirstFileBlock: 3, - ChildBlockLo: 2, - }, - Node: node3, - }, - }, - } - - node1 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 2, - MaxEntries: 4, - Height: 0, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.Extent{ - FirstFileBlock: 0, - Length: 1, - StartBlockLo: 3, - }, - Node: nil, - }, - { - Entry: &disklayout.Extent{ - FirstFileBlock: 1, - Length: 2, - StartBlockLo: 4, - }, - Node: nil, - }, - }, - } - - node0 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 2, - MaxEntries: 4, - Height: 2, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.ExtentIdx{ - FirstFileBlock: 0, - ChildBlockLo: 0, - }, - Node: node1, - }, - { - Entry: &disklayout.ExtentIdx{ - FirstFileBlock: 3, - ChildBlockLo: 1, - }, - Node: node2, - }, - }, - } -) - -// TestExtentReader stress tests extentReader 
functionality. It performs random -// length reads from all possible positions in the extent tree. -func TestExtentReader(t *testing.T) { - mockExtentFile, want := extentTreeSetUp(t, node0) - n := len(want) - - for from := 0; from < n; from++ { - got := make([]byte, n-from) - - if read, err := mockExtentFile.ReadAt(got, int64(from)); err != nil { - t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err) - } - - if diff := cmp.Diff(got, want[from:]); diff != "" { - t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff) - } - } -} - -// TestBuildExtentTree tests the extent tree building logic. -func TestBuildExtentTree(t *testing.T) { - mockExtentFile, _ := extentTreeSetUp(t, node0) - - opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{}) - if diff := cmp.Diff(&mockExtentFile.root, node0, opt); diff != "" { - t.Errorf("extent tree mismatch (-want +got):\n%s", diff) - } -} - -// extentTreeSetUp writes the passed extent tree to a mock disk as an extent -// tree. It also constucts a mock extent file with the same tree built in it. -// It also writes random data file data and returns it. 
-func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []byte) { - t.Helper() - - mockDisk := make([]byte, mockExtentBlkSize*10) - mockExtentFile := &extentFile{} - args := inodeArgs{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), - }, - }, - blkSize: mockExtentBlkSize, - } - mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile) - - fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize) - - if err := mockExtentFile.buildExtTree(); err != nil { - t.Fatalf("inode.buildExtTree failed: %v", err) - } - return mockExtentFile, fileData -} - -// writeTree writes the tree represented by `root` to the inode and disk. It -// also writes random file data on disk. -func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { - rootData := in.diskInode.Data() - root.Header.MarshalBytes(rootData) - off := root.Header.SizeBytes() - for _, ep := range root.Entries { - ep.Entry.MarshalBytes(rootData[off:]) - off += ep.Entry.SizeBytes() - } - - var fileData []byte - for _, ep := range root.Entries { - if root.Header.Height == 0 { - fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...) - } else { - fileData = append(fileData, writeTreeToDisk(disk, ep)...) - } - } - return fileData -} - -// writeTreeToDisk is the recursive step for writeTree which writes the tree -// on the disk only. Also writes random file data on disk. 
-func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { - nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:] - curNode.Node.Header.MarshalBytes(nodeData) - off := curNode.Node.Header.SizeBytes() - for _, ep := range curNode.Node.Entries { - ep.Entry.MarshalBytes(nodeData[off:]) - off += ep.Entry.SizeBytes() - } - - var fileData []byte - for _, ep := range curNode.Node.Entries { - if curNode.Node.Header.Height == 0 { - fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...) - } else { - fileData = append(fileData, writeTreeToDisk(disk, ep)...) - } - } - return fileData -} - -// writeFileDataToExtent writes random bytes to the blocks on disk that the -// passed extent points to. -func writeFileDataToExtent(disk []byte, ex *disklayout.Extent) []byte { - phyExStartBlk := ex.PhysicalBlock() - phyExStartOff := phyExStartBlk * mockExtentBlkSize - phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize - rand.Read(disk[phyExStartOff:phyExEndOff]) - return disk[phyExStartOff:phyExEndOff] -} - -// getNumPhyBlks returns the number of physical blocks covered under the node. -func getNumPhyBlks(node *disklayout.ExtentNode) uint32 { - var res uint32 - for _, ep := range node.Entries { - if node.Header.Height == 0 { - res += uint32(ep.Entry.(*disklayout.Extent).Length) - } else { - res += getNumPhyBlks(ep.Node) - } - } - return res -} diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go deleted file mode 100644 index 90b086468..000000000 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// fileDescription is embedded by ext implementations of -// vfs.FileDescriptionImpl. -type fileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - vfs.LockFD -} - -func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) -} - -func (fd *fileDescription) inode() *inode { - return fd.vfsfd.Dentry().Impl().(*dentry).inode -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - var stat linux.Statx - fd.inode().statTo(&stat) - return stat, nil -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if opts.Stat.Mask == 0 { - return nil - } - return syserror.EPERM -} - -// SetStat implements vfs.FileDescriptionImpl.StatFS. -func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { - var stat linux.Statfs - fd.filesystem().statTo(&stat) - return stat, nil -} - -// Sync implements vfs.FileDescriptionImpl.Sync. 
-func (fd *fileDescription) Sync(ctx context.Context) error { - return nil -} diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go deleted file mode 100644 index d4fc484a2..000000000 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ /dev/null @@ -1,555 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "errors" - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" -) - -var ( - // errResolveDirent indicates that the vfs.ResolvingPath.Component() does - // not exist on the dentry tree but does exist on disk. So it has to be read in - // using the in-memory dirent and added to the dentry tree. Usually indicates - // the need to lock filesystem.mu for writing. - errResolveDirent = errors.New("resolve path component using dirent") -) - -// filesystem implements vfs.FilesystemImpl. -// -// +stateify savable -type filesystem struct { - vfsfs vfs.Filesystem - - // mu serializes changes to the Dentry tree. - mu sync.RWMutex `state:"nosave"` - - // dev represents the underlying fs device. 
It does not require protection - // because io.ReaderAt permits concurrent read calls to it. It translates to - // the pread syscall which passes on the read request directly to the device - // driver. Device drivers are intelligent in serving multiple concurrent read - // requests in the optimal order (taking locality into consideration). - dev io.ReaderAt - - // inodeCache maps absolute inode numbers to the corresponding Inode struct. - // Inodes should be removed from this once their reference count hits 0. - // - // Protected by mu because most additions (see IterDirents) and all removals - // from this corresponds to a change in the dentry tree. - inodeCache map[uint32]*inode - - // sb represents the filesystem superblock. Immutable after initialization. - sb disklayout.SuperBlock - - // bgs represents all the block group descriptors for the filesystem. - // Immutable after initialization. - bgs []disklayout.BlockGroup - - // devMinor is this filesystem's device minor number. Immutable after - // initialization. - devMinor uint32 -} - -// Compiles only if filesystem implements vfs.FilesystemImpl. -var _ vfs.FilesystemImpl = (*filesystem)(nil) - -// stepLocked resolves rp.Component() in parent directory vfsd. The write -// parameter passed tells if the caller has acquired filesystem.mu for writing -// or not. If set to true, an existing inode on disk can be added to the dentry -// tree if not present already. -// -// stepLocked is loosely analogous to fs/namei.c:walk_component(). -// -// Preconditions: -// * filesystem.mu must be locked (for writing if write param is true). -// * !rp.Done(). -// * inode == vfsd.Impl().(*Dentry).inode. 
-func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { - return nil, nil, err - } - - for { - name := rp.Component() - if name == "." { - rp.Advance() - return vfsd, inode, nil - } - d := vfsd.Impl().(*dentry) - if name == ".." { - isRoot, err := rp.CheckRoot(ctx, vfsd) - if err != nil { - return nil, nil, err - } - if isRoot || d.parent == nil { - rp.Advance() - return vfsd, inode, nil - } - if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { - return nil, nil, err - } - rp.Advance() - return &d.parent.vfsd, d.parent.inode, nil - } - - dir := inode.impl.(*directory) - child, ok := dir.childCache[name] - if !ok { - // We may need to instantiate a new dentry for this child. - childDirent, ok := dir.childMap[name] - if !ok { - // The underlying inode does not exist on disk. - return nil, nil, syserror.ENOENT - } - - if !write { - // filesystem.mu must be held for writing to add to the dentry tree. - return nil, nil, errResolveDirent - } - - // Create and add the component's dirent to the dentry tree. - fs := rp.Mount().Filesystem().Impl().(*filesystem) - childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode()) - if err != nil { - return nil, nil, err - } - // incRef because this is being added to the dentry tree. - childInode.incRef() - child = newDentry(childInode) - child.parent = d - child.name = name - dir.childCache[name] = child - } - if err := rp.CheckMount(ctx, &child.vfsd); err != nil { - return nil, nil, err - } - if child.inode.isSymlink() && rp.ShouldFollowSymlink() { - if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { - return nil, nil, err - } - continue - } - rp.Advance() - return &child.vfsd, child.inode, nil - } -} - -// walkLocked resolves rp to an existing file. 
The write parameter -// passed tells if the caller has acquired filesystem.mu for writing or not. -// If set to true, additions can be made to the dentry tree while walking. -// If errResolveDirent is returned, the walk needs to be continued with an -// upgraded filesystem.mu. -// -// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). -// -// Preconditions: -// * filesystem.mu must be locked (for writing if write param is true). -func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Done() { - var err error - vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) - if err != nil { - return nil, nil, err - } - } - if rp.MustBeDir() && !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - return vfsd, inode, nil -} - -// walkParentLocked resolves all but the last path component of rp to an -// existing directory. It does not check that the returned directory is -// searchable by the provider of rp. The write parameter passed tells if the -// caller has acquired filesystem.mu for writing or not. If set to true, -// additions can be made to the dentry tree while walking. -// If errResolveDirent is returned, the walk needs to be continued with an -// upgraded filesystem.mu. -// -// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). -// -// Preconditions: -// * filesystem.mu must be locked (for writing if write param is true). -// * !rp.Done(). -func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Final() { - var err error - vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) - if err != nil { - return nil, nil, err - } - } - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - return vfsd, inode, nil -} - -// walk resolves rp to an existing file. 
If parent is set to true, it resolves -// the rp till the parent of the last component which should be an existing -// directory. If parent is false then resolves rp entirely. Attemps to resolve -// the path as far as it can with a read lock and upgrades the lock if needed. -func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { - var ( - vfsd *vfs.Dentry - inode *inode - err error - ) - - // Try walking with the hopes that all dentries have already been pulled out - // of disk. This reduces congestion (allows concurrent walks). - fs.mu.RLock() - if parent { - vfsd, inode, err = walkParentLocked(ctx, rp, false) - } else { - vfsd, inode, err = walkLocked(ctx, rp, false) - } - fs.mu.RUnlock() - - if err == errResolveDirent { - // Upgrade lock and continue walking. Lock upgrading in the middle of the - // walk is fine as this is a read only filesystem. - fs.mu.Lock() - if parent { - vfsd, inode, err = walkParentLocked(ctx, rp, true) - } else { - vfsd, inode, err = walkLocked(ctx, rp, true) - } - fs.mu.Unlock() - } - - return vfsd, inode, err -} - -// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in. -// It creates a new one with the given inode number if one does not exist. -// The caller must increment the ref count if adding this to the dentry tree. -// -// Precondition: must be holding fs.mu for writing. -func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) { - if in, ok := fs.inodeCache[inodeNum]; ok { - return in, nil - } - - in, err := newInode(fs, inodeNum) - if err != nil { - return nil, err - } - - fs.inodeCache[inodeNum] = in - return in, nil -} - -// statTo writes the statfs fields to the output parameter. 
-func (fs *filesystem) statTo(stat *linux.Statfs) { - stat.Type = uint64(fs.sb.Magic()) - stat.BlockSize = int64(fs.sb.BlockSize()) - stat.Blocks = fs.sb.BlocksCount() - stat.BlocksFree = fs.sb.FreeBlocksCount() - stat.BlocksAvailable = fs.sb.FreeBlocksCount() - stat.Files = uint64(fs.sb.InodesCount()) - stat.FilesFree = uint64(fs.sb.FreeInodesCount()) - stat.NameLength = disklayout.MaxFileName - stat.FragmentSize = int64(fs.sb.BlockSize()) - // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. -} - -// AccessAt implements vfs.Filesystem.Impl.AccessAt. -func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - return inode.checkPermissions(rp.Credentials(), ats) -} - -// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. -func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - - if opts.CheckSearchable { - if !inode.isDir() { - return nil, syserror.ENOTDIR - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { - return nil, err - } - } - - inode.incRef() - return vfsd, nil -} - -// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. -func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(ctx, rp, true) - if err != nil { - return nil, err - } - inode.incRef() - return vfsd, nil -} - -// OpenAt implements vfs.FilesystemImpl.OpenAt. -func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - vfsd, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - - // EROFS is returned if write access is needed. 
- if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { - return nil, syserror.EROFS - } - return inode.open(rp, vfsd, &opts) -} - -// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. -func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return "", err - } - symlink, ok := inode.impl.(*symlink) - if !ok { - return "", syserror.EINVAL - } - return symlink.target, nil -} - -// StatAt implements vfs.FilesystemImpl.StatAt. -func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return linux.Statx{}, err - } - var stat linux.Statx - inode.statTo(&stat) - return stat, nil -} - -// StatFSAt implements vfs.FilesystemImpl.StatFSAt. -func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, _, err := fs.walk(ctx, rp, false); err != nil { - return linux.Statfs{}, err - } - - var stat linux.Statfs - fs.statTo(&stat) - return stat, nil -} - -// Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release(ctx context.Context) { - fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) -} - -// Sync implements vfs.FilesystemImpl.Sync. -func (fs *filesystem) Sync(ctx context.Context) error { - // This is a readonly filesystem for now. - return nil -} - -// The vfs.FilesystemImpl functions below return EROFS because their respective -// man pages say that EROFS must be returned if the path resolves to a file on -// this read-only filesystem. - -// LinkAt implements vfs.FilesystemImpl.LinkAt. 
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - if rp.Done() { - return syserror.EEXIST - } - - if _, _, err := fs.walk(ctx, rp, true); err != nil { - return err - } - - return syserror.EROFS -} - -// MkdirAt implements vfs.FilesystemImpl.MkdirAt. -func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - if rp.Done() { - return syserror.EEXIST - } - - if _, _, err := fs.walk(ctx, rp, true); err != nil { - return err - } - - return syserror.EROFS -} - -// MknodAt implements vfs.FilesystemImpl.MknodAt. -func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - if rp.Done() { - return syserror.EEXIST - } - - _, _, err := fs.walk(ctx, rp, true) - if err != nil { - return err - } - - return syserror.EROFS -} - -// RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if rp.Done() { - return syserror.ENOENT - } - - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - return syserror.EROFS -} - -// RmdirAt implements vfs.FilesystemImpl.RmdirAt. -func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - if !inode.isDir() { - return syserror.ENOTDIR - } - - return syserror.EROFS -} - -// SetStatAt implements vfs.FilesystemImpl.SetStatAt. -func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - return syserror.EROFS -} - -// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - if rp.Done() { - return syserror.EEXIST - } - - _, _, err := fs.walk(ctx, rp, true) - if err != nil { - return err - } - - return syserror.EROFS -} - -// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. -func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - if inode.isDir() { - return syserror.EISDIR - } - - return syserror.EROFS -} - -// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { - return nil, err - } - - // TODO(b/134676337): Support sockets. - return nil, syserror.ECONNREFUSED -} - -// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. -func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - return nil, syserror.ENOTSUP -} - -// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. -func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return "", err - } - return "", syserror.ENOTSUP -} - -// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. -func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - return syserror.ENOTSUP -} - -// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. 
-func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - return syserror.ENOTSUP -} - -// PrependPath implements vfs.FilesystemImpl.PrependPath. -func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { - fs.mu.RLock() - defer fs.mu.RUnlock() - return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) -} - -// MountOptions implements vfs.FilesystemImpl.MountOptions. -func (fs *filesystem) MountOptions() string { - return "" -} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go deleted file mode 100644 index 4a555bf72..000000000 --- a/pkg/sentry/fsimpl/ext/inode.go +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "fmt" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// inode represents an ext inode. -// -// inode uses the same inheritance pattern that pkg/sentry/vfs structures use. -// This has been done to increase memory locality. 
-// -// Implementations: -// inode -- -// |-- dir -// |-- symlink -// |-- regular-- -// |-- extent file -// |-- block map file -// -// +stateify savable -type inode struct { - // refs is a reference count. refs is accessed using atomic memory operations. - refs int64 - - // fs is the containing filesystem. - fs *filesystem - - // inodeNum is the inode number of this inode on disk. This is used to - // identify inodes within the ext filesystem. - inodeNum uint32 - - // blkSize is the fs data block size. Same as filesystem.sb.BlockSize(). - blkSize uint64 - - // diskInode gives us access to the inode struct on disk. Immutable. - diskInode disklayout.Inode - - locks vfs.FileLocks - - // This is immutable. The first field of the implementations must have inode - // as the first field to ensure temporality. - impl interface{} -} - -// incRef increments the inode ref count. -func (in *inode) incRef() { - atomic.AddInt64(&in.refs, 1) -} - -// tryIncRef tries to increment the ref count. Returns true if successful. -func (in *inode) tryIncRef() bool { - for { - refs := atomic.LoadInt64(&in.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) { - return true - } - } -} - -// decRef decrements the inode ref count and releases the inode resources if -// the ref count hits 0. -// -// Precondition: Must have locked filesystem.mu. -func (in *inode) decRef() { - if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { - delete(in.fs.inodeCache, in.inodeNum) - } else if refs < 0 { - panic("ext.inode.decRef() called without holding a reference") - } -} - -// newInode is the inode constructor. Reads the inode off disk. Identifies -// inodes based on the absolute inode number on disk. 
-func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { - if inodeNum == 0 { - panic("inode number 0 on ext filesystems is not possible") - } - - inodeRecordSize := fs.sb.InodeSize() - var diskInode disklayout.Inode - if inodeRecordSize == disklayout.OldInodeSize { - diskInode = &disklayout.InodeOld{} - } else { - diskInode = &disklayout.InodeNew{} - } - - // Calculate where the inode is actually placed. - inodesPerGrp := fs.sb.InodesPerGroup() - blkSize := fs.sb.BlockSize() - inodeTableOff := fs.bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize - inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp)) - - if err := readFromDisk(fs.dev, int64(inodeOff), diskInode); err != nil { - return nil, err - } - - // Build the inode based on its type. - args := inodeArgs{ - fs: fs, - inodeNum: inodeNum, - blkSize: blkSize, - diskInode: diskInode, - } - - switch diskInode.Mode().FileType() { - case linux.ModeSymlink: - f, err := newSymlink(args) - if err != nil { - return nil, err - } - return &f.inode, nil - case linux.ModeRegular: - f, err := newRegularFile(args) - if err != nil { - return nil, err - } - return &f.inode, nil - case linux.ModeDirectory: - f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType) - if err != nil { - return nil, err - } - return &f.inode, nil - default: - // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices. - return nil, syserror.EINVAL - } -} - -type inodeArgs struct { - fs *filesystem - inodeNum uint32 - blkSize uint64 - diskInode disklayout.Inode -} - -func (in *inode) init(args inodeArgs, impl interface{}) { - in.fs = args.fs - in.inodeNum = args.inodeNum - in.blkSize = args.blkSize - in.diskInode = args.diskInode - in.impl = impl -} - -// open creates and returns a file description for the dentry passed in. 
-func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(opts) - if err := in.checkPermissions(rp.Credentials(), ats); err != nil { - return nil, err - } - mnt := rp.Mount() - switch in.impl.(type) { - case *regularFile: - var fd regularFileFD - fd.LockFD.Init(&in.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - case *directory: - // Can't open directories writably. This check is not necessary for a read - // only filesystem but will be required when write is implemented. - if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR - } - var fd directoryFD - fd.LockFD.Init(&in.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - case *symlink: - if opts.Flags&linux.O_PATH == 0 { - // Can't open symlinks without O_PATH. - return nil, syserror.ELOOP - } - var fd symlinkFD - fd.LockFD.Init(&in.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - default: - panic(fmt.Sprintf("unknown inode type: %T", in.impl)) - } -} - -func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID()) -} - -// statTo writes the statx fields to the output parameter. 
-func (in *inode) statTo(stat *linux.Statx) { - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | - linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | - linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME - stat.Blksize = uint32(in.blkSize) - stat.Mode = uint16(in.diskInode.Mode()) - stat.Nlink = uint32(in.diskInode.LinksCount()) - stat.UID = uint32(in.diskInode.UID()) - stat.GID = uint32(in.diskInode.GID()) - stat.Ino = uint64(in.inodeNum) - stat.Size = in.diskInode.Size() - stat.Atime = in.diskInode.AccessTime().StatxTimestamp() - stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() - stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() - stat.DevMajor = linux.UNNAMED_MAJOR - stat.DevMinor = in.fs.devMinor - // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks - // (including metadata blocks) required to represent this file. -} - -// getBGNum returns the block group number that a given inode belongs to. -func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { - return (inodeNum - 1) / inodesPerGrp -} - -// getBGOff returns the offset at which the given inode lives in the block -// group's inode table, i.e. the index of the inode in the inode table. -func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 { - return (inodeNum - 1) % inodesPerGrp -} diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go deleted file mode 100644 index 5ad9befcd..000000000 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// regularFile represents a regular file's inode. This too follows the -// inheritance pattern prevelant in the vfs layer described in -// pkg/sentry/vfs/README.md. -// -// +stateify savable -type regularFile struct { - inode inode - - // This is immutable. The first field of fileReader implementations must be - // regularFile to ensure temporality. - // io.ReaderAt is more strict than io.Reader in the sense that a partial read - // is always accompanied by an error. If a read spans past the end of file, a - // partial read (within file range) is done and io.EOF is returned. - impl io.ReaderAt -} - -// newRegularFile is the regularFile constructor. It figures out what kind of -// file this is and initializes the fileReader. -func newRegularFile(args inodeArgs) (*regularFile, error) { - if args.diskInode.Flags().Extents { - file, err := newExtentFile(args) - if err != nil { - return nil, err - } - return &file.regFile, nil - } - - file, err := newBlockMapFile(args) - if err != nil { - return nil, err - } - return &file.regFile, nil -} - -func (in *inode) isRegular() bool { - _, ok := in.impl.(*regularFile) - return ok -} - -// directoryFD represents a directory file description. It implements -// vfs.FileDescriptionImpl. 
-// -// +stateify savable -type regularFileFD struct { - fileDescription - vfs.LockFD - - // off is the file offset. off is accessed using atomic memory operations. - off int64 - - // offMu serializes operations that may mutate off. - offMu sync.Mutex `state:"nosave"` -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release(context.Context) {} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - safeReader := safemem.FromIOReaderAt{ - ReaderAt: fd.inode().impl.(*regularFile).impl, - Offset: offset, - } - - // Copies data from disk directly into usermem without any intermediate - // allocations (if dst is converted into BlockSeq such that it does not need - // safe copying). - return dst.CopyOutFrom(ctx, safeReader) -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.offMu.Lock() - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - // write(2) specifies that EBADF must be returned if the fd is not open for - // writing. - return 0, syserror.EBADF -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.offMu.Lock() - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
-func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - return syserror.ENOTDIR -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.offMu.Lock() - defer fd.offMu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as specified. - case linux.SEEK_CUR: - offset += fd.off - case linux.SEEK_END: - offset += int64(fd.inode().diskInode.Size()) - default: - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - fd.off = offset - return offset, nil -} - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - // TODO(b/134676337): Implement mmap(2). - return syserror.ENODEV -} diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go deleted file mode 100644 index 5e2bcc837..000000000 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// symlink represents a symlink inode. 
-// -// +stateify savable -type symlink struct { - inode inode - target string // immutable -} - -// newSymlink is the symlink constructor. It reads out the symlink target from -// the inode (however it might have been stored). -func newSymlink(args inodeArgs) (*symlink, error) { - var link []byte - - // If the symlink target is lesser than 60 bytes, its stores in inode.Data(). - // Otherwise either extents or block maps will be used to store the link. - size := args.diskInode.Size() - if size < 60 { - link = args.diskInode.Data()[:size] - } else { - // Create a regular file out of this inode and read out the target. - regFile, err := newRegularFile(args) - if err != nil { - return nil, err - } - - link = make([]byte, size) - if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size { - return nil, err - } - } - - file := &symlink{target: string(link)} - file.inode.init(args, file) - return file, nil -} - -func (in *inode) isSymlink() bool { - _, ok := in.impl.(*symlink) - return ok -} - -// symlinkFD represents a symlink file description and implements -// vfs.FileDescriptionImpl. which may only be used if open options contains -// O_PATH. For this reason most of the functions return EBADF. -// -// +stateify savable -type symlinkFD struct { - fileDescription - vfs.NoLockFD -} - -// Compiles only if symlinkFD implements vfs.FileDescriptionImpl. -var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *symlinkFD) Release(context.Context) {} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EBADF -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EBADF -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. 
-func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EBADF -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EBADF -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - return syserror.ENOTDIR -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.EBADF -} - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return syserror.EBADF -} diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go deleted file mode 100644 index 58ef7b9b8..000000000 --- a/pkg/sentry/fsimpl/ext/utils.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - - "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/syserror" -) - -// readFromDisk performs a binary read from disk into the given struct from -// the absolute offset provided. 
-func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error { - n := v.SizeBytes() - buf := make([]byte, n) - if read, _ := dev.ReadAt(buf, abOff); read < int(n) { - return syserror.EIO - } - - v.UnmarshalBytes(buf) - return nil -} - -// readSuperBlock reads the SuperBlock from block group 0 in the underlying -// device. There are three versions of the superblock. This function identifies -// and returns the correct version. -func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) { - var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} - if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { - return nil, err - } - if sb.Revision() == disklayout.OldRev { - return sb, nil - } - - sb = &disklayout.SuperBlock32Bit{} - if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { - return nil, err - } - if !sb.IncompatibleFeatures().Is64Bit { - return sb, nil - } - - sb = &disklayout.SuperBlock64Bit{} - if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { - return nil, err - } - return sb, nil -} - -// blockGroupsCount returns the number of block groups in the ext fs. -func blockGroupsCount(sb disklayout.SuperBlock) uint64 { - blocksCount := sb.BlocksCount() - blocksPerGroup := uint64(sb.BlocksPerGroup()) - - // Round up the result. float64 can compromise precision so do it manually. - return (blocksCount + blocksPerGroup - 1) / blocksPerGroup -} - -// readBlockGroups reads the block group descriptor table from block group 0 in -// the underlying device. 
-func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { - bgCount := blockGroupsCount(sb) - bgdSize := uint64(sb.BgDescSize()) - is64Bit := sb.IncompatibleFeatures().Is64Bit - bgds := make([]disklayout.BlockGroup, bgCount) - - for i, off := uint64(0), uint64(sb.FirstDataBlock()+1)*sb.BlockSize(); i < bgCount; i, off = i+1, off+bgdSize { - if is64Bit { - bgds[i] = &disklayout.BlockGroup64Bit{} - } else { - bgds[i] = &disklayout.BlockGroup32Bit{} - } - - if err := readFromDisk(dev, int64(off), bgds[i]); err != nil { - return nil, err - } - } - return bgds, nil -} diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index 077bf9307..d404edaf0 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -252,11 +252,11 @@ func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { } if !conn.connected { - return nil, syserror.ENOTCONN + return nil, linuxerr.ENOTCONN } if conn.connInitError { - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } fut, err := conn.callFuture(t, r) @@ -306,7 +306,7 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes conn.mu.Unlock() // we checked connected before, // this must be due to aborted connection. 
- return nil, syserror.ECONNABORTED + return nil, linuxerr.ECONNABORTED } conn.mu.Unlock() diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 5d2bae14e..dab1e779d 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -18,6 +18,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -122,7 +123,7 @@ func (fd *DeviceFD) Release(ctx context.Context) { func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } return 0, syserror.ENOSYS @@ -132,7 +133,7 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } // We require that any Read done on this filesystem have a sane minimum @@ -149,7 +150,7 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R // If the read buffer is too small, error out. if dst.NumBytes() < int64(minBuffSize) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.mu.Lock() @@ -234,7 +235,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. 
if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } return 0, syserror.ENOSYS @@ -251,12 +252,12 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs. func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } // Return ENODEV if the filesystem is umounted. if fd.fs.umounted { - return 0, syserror.ENODEV + return 0, linuxerr.ENODEV } var cn, n int64 @@ -293,7 +294,7 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt // Assert that the header isn't read into the writeBuf yet. if fd.writeCursor >= hdrLen { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // We don't have the full common response header yet. @@ -322,7 +323,7 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt if !ok { // Server sent us a response for a request we never sent, // or for which we already received a reply (e.g. aborted), an unlikely event. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } delete(fd.completions, hdr.Unique) @@ -391,7 +392,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } return 0, syserror.ENOSYS @@ -434,7 +435,7 @@ func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUS if !ok { // A response for a request we never sent, // or for which we already received a reply (e.g. aborted). 
- return syserror.EINVAL + return linuxerr.EINVAL } delete(fd.completions, respHdr.Unique) diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 47794810c..172cbd88f 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -122,30 +122,30 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt deviceDescriptorStr, ok := mopts["fd"] if !ok { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option fd missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "fd") deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) if err != nil { ctx.Debugf("fusefs.FilesystemType.GetFilesystem: invalid fd: %q (%v)", deviceDescriptorStr, err) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fuseFDGeneric := kernelTask.GetFileVFS2(int32(deviceDescriptor)) if fuseFDGeneric == nil { - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } defer fuseFDGeneric.DecRef(ctx) fuseFD, ok := fuseFDGeneric.Impl().(*DeviceFD) if !ok { log.Warningf("%s.GetFilesystem: device FD is %T, not a FUSE device", fsType.Name, fuseFDGeneric) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Parse and set all the other supported FUSE mount options. 
@@ -155,17 +155,17 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), uidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.uid = kuid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option user_id missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if gidStr, ok := mopts["group_id"]; ok { @@ -173,17 +173,17 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), gidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.gid = kgid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option group_id missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if modeStr, ok := mopts["rootmode"]; ok { @@ -191,12 +191,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.rootMode = linux.FileMode(mode) } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option rootmode missing") - 
return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Set the maxInFlightRequests option. @@ -207,7 +207,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if maxRead < fuseMinMaxRead { maxRead = fuseMinMaxRead @@ -230,7 +230,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Check for unparsed options. if len(mopts) != 0 { log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Create a new FUSE filesystem. @@ -259,7 +259,7 @@ func newFUSEFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, fsTyp conn, err := newFUSEConnection(ctx, fuseFD, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fs := &filesystem{ @@ -376,7 +376,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a creds.RealKGID != i.fs.opts.gid || creds.EffectiveKGID != i.fs.opts.gid || creds.SavedKGID != i.fs.opts.gid { - return syserror.EACCES + return linuxerr.EACCES } } @@ -394,10 +394,10 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr isDir := i.InodeAttrs.Mode().IsDir() // return error if specified to open directory but inode is not a directory. 
if !isDir && opts.Mode.IsDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { - return nil, syserror.EOVERFLOW + return nil, linuxerr.EOVERFLOW } var fd *fileDescription @@ -419,7 +419,7 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // Build the request. @@ -513,7 +513,7 @@ func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } in := linux.FUSECreateIn{ CreateMeta: linux.FUSECreateMeta{ @@ -553,7 +553,7 @@ func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) err kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) - return syserror.EINVAL + return linuxerr.EINVAL } in := linux.FUSEUnlinkIn{Name: name} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) @@ -597,7 +597,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) res, err := i.fs.conn.Call(kernelTask, req) @@ -627,13 +627,13 @@ func (i *inode) Getlink(ctx context.Context, mnt 
*vfs.Mount) (vfs.VirtualDentry, // Readlink implements kernfs.Inode.Readlink. func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if i.Mode().FileType()&linux.S_IFLNK == 0 { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } if len(i.link) == 0 { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") - return "", syserror.EINVAL + return "", linuxerr.EINVAL } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) res, err := i.fs.conn.Call(kernelTask, req) @@ -729,7 +729,7 @@ func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOp task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return linux.FUSEAttr{}, syserror.EINVAL + return linux.FUSEAttr{}, linuxerr.EINVAL } creds := auth.CredentialsFromContext(ctx) @@ -834,7 +834,7 @@ func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return syserror.EINVAL + return linuxerr.EINVAL } // We should retain the original file type when assigning new mode. 
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go index 66ea889f9..35d0ab6f4 100644 --- a/pkg/sentry/fsimpl/fuse/read_write.go +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -39,7 +40,7 @@ func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off ui t := kernel.TaskFromContext(ctx) if t == nil { log.Warningf("fusefs.Read: couldn't get kernel task from context") - return nil, 0, syserror.EINVAL + return nil, 0, linuxerr.EINVAL } // Round up to a multiple of page size. @@ -155,7 +156,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, t := kernel.TaskFromContext(ctx) if t == nil { log.Warningf("fusefs.Read: couldn't get kernel task from context") - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // One request cannnot exceed either maxWrite or maxPages. diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go index 5bdd096c3..6c4de3507 100644 --- a/pkg/sentry/fsimpl/fuse/regular_file.go +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -39,14 +40,14 @@ type regularFileFD struct { // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 
if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } size := dst.NumBytes() @@ -56,7 +57,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } else if size > math.MaxUint32 { // FUSE only supports uint32 for size. // Overflow. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // TODO(gvisor.dev/issue/3678): Add direct IO support. @@ -143,14 +144,14 @@ func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // final offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } inode := fd.inode() @@ -171,11 +172,11 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off if srclen > math.MaxUint32 { // FUSE only supports uint32 for size. // Overflow. - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } if end := offset + srclen; end < offset { // Overflow. 
- return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } srclen, err = vfs.CheckLimit(ctx, offset, srclen) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 177e42649..5c48a9fee 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refsvfs2" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isDir() bool { @@ -297,7 +297,7 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in switch whence { case linux.SEEK_SET: if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls @@ -309,13 +309,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in case linux.SEEK_CUR: offset += fd.off if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index af2b773c3..05b776c2e 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -39,26 +39,14 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // Snapshot current syncable dentries and special file FDs. 
- fs.renameMu.RLock() fs.syncMu.Lock() ds := make([]*dentry, 0, len(fs.syncableDentries)) for d := range fs.syncableDentries { - // It's safe to use IncRef here even though fs.syncableDentries doesn't - // hold references since we hold fs.renameMu. Note that we can't use - // TryIncRef since cached dentries at zero references should still be - // synced. - d.IncRef() ds = append(ds, d) } - fs.renameMu.RUnlock() sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { - // As above, fs.specialFileFDs doesn't hold references. However, unlike - // dentries, an FD that has reached zero references can't be - // resurrected, so we can use TryIncRef. - if sffd.vfsfd.TryIncRef() { - sffds = append(sffds, sffd) - } + sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -68,9 +56,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync syncable dentries. for _, d := range ds { - err := d.syncCachedFile(ctx, true /* forFilesystemSync */) - d.DecRef(ctx) - if err != nil { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) if retErr == nil { retErr = err @@ -81,9 +67,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - err := sffd.sync(ctx, true /* forFilesystemSync */) - sffd.vfsfd.DecRef(ctx) - if err != nil { + if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) if retErr == nil { retErr = err @@ -147,6 +131,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. 
+// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -159,6 +144,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp ** putDentrySlice(*dsp) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -187,7 +173,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[] // Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, bool, error) { if !d.isDir() { - return nil, false, syserror.ENOTDIR + return nil, false, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err @@ -245,7 +231,7 @@ afterSymlink: // * dentry at name has been revalidated func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } if child, ok := parent.children[name]; ok || parent.isSynthetic() { if child == nil { @@ -303,7 +289,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving } } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -331,7 +317,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -360,7 +346,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } name := rp.Component() if name == "." || name == ".." 
{ - return syserror.EEXIST + return linuxerr.EEXIST } if parent.isDeleted() { return syserror.ENOENT @@ -373,20 +359,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir defer parent.dirMu.Unlock() if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } // Check for existence only if caching information is available. Otherwise, // don't check for existence just yet. We will check for existence if the // checks for writability fail below. Existence check is done by the creation // RPCs themselves. if child, ok := parent.children[name]; ok && child != nil { - return syserror.EEXIST + return linuxerr.EEXIST } checkExistence := func() error { if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } else if child != nil { - return syserror.EEXIST + return linuxerr.EEXIST } return nil } @@ -413,7 +399,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } if parent.isSynthetic() { if createInSyntheticDir == nil { - return syserror.EPERM + return linuxerr.EPERM } if err := createInSyntheticDir(parent, name); err != nil { return err @@ -470,10 +456,10 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b name := rp.Component() if dir { if name == "." { - return syserror.EINVAL + return linuxerr.EINVAL } if name == ".." { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } else { if name == "." || name == ".." { @@ -540,8 +526,8 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child.syntheticChildren != 0 { // This is definitely not an empty directory, irrespective of // fs.opts.interop. - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTEMPTY + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil. 
+ return linuxerr.ENOTEMPTY } // If InteropModeShared is in effect and the first call to // PrepareDeleteDentry above succeeded, then child wasn't @@ -550,13 +536,13 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // still exist) would be a waste of time. if child.cachedMetadataAuthoritative() { if !child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTDIR + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. + return linuxerr.ENOTDIR } for _, grandchild := range child.children { if grandchild != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTEMPTY + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. + return linuxerr.ENOTEMPTY } } } @@ -565,14 +551,14 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } else { // child must be a non-directory file. if child != nil && child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return syserror.EISDIR } if rp.MustBeDir() { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } if parent.isSynthetic() { @@ -583,7 +569,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return err } @@ -601,7 +587,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if child != nil { - vfsObj.CommitDeleteDentry(ctx, &child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above. 
child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- @@ -643,7 +629,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -675,11 +661,11 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error { if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) if d.isDir() { - return syserror.EPERM + return linuxerr.EPERM } gid := auth.KGID(atomic.LoadUint32(&d.gid)) uid := auth.KUID(atomic.LoadUint32(&d.uid)) @@ -691,7 +677,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.ENOENT } if d.nlink == math.MaxUint32 { - return syserror.EMLINK + return linuxerr.EMLINK } if err := parent.file.link(ctx, d.file, childName); err != nil { return err @@ -735,7 +721,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, func(parent *dentry, name string) error { if !opts.ForSyntheticMountpoint { // Can't create non-synthetic files in synthetic directories. - return syserror.EPERM + return linuxerr.EPERM } parent.createSyntheticChildLocked(&createSyntheticOpts{ name: name, @@ -765,7 +751,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v switch { case err == nil: // Step succeeded, another file exists. - return syserror.EEXIST + return linuxerr.EEXIST case !linuxerr.Equals(linuxerr.ENOENT, err): // Unexpected error. 
return err @@ -794,7 +780,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return nil } // Retain error from gofer if synthetic file cannot be created internally. - return syserror.EPERM + return linuxerr.EPERM }, nil) } @@ -805,7 +791,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // support, and it isn't clear that there's any way to implement this in // 9P. if opts.Flags&linux.O_TMPFILE != 0 { - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } mayCreate := opts.Flags&linux.O_CREAT != 0 mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) @@ -828,7 +814,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return nil, syserror.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if !start.cachedMetadataAuthoritative() { // Refresh dentry's attributes before opening. @@ -866,7 +852,7 @@ afterTrailingSymlink: if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { if parent.isSynthetic() { parent.dirMu.Unlock() - return nil, syserror.EPERM + return nil, linuxerr.EPERM } fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) parent.dirMu.Unlock() @@ -877,7 +863,7 @@ afterTrailingSymlink: return nil, err } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } // Open existing child or follow symlink. 
if child.isSymlink() && rp.ShouldFollowSymlink() { @@ -892,7 +878,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if rp.MustBeDir() && !child.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } child.IncRef() defer child.DecRef(ctx) @@ -943,7 +929,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, syserror.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if !d.isSynthetic() { if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { @@ -963,10 +949,10 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return &fd.vfsfd, nil case linux.S_IFLNK: // Can't open symlinks without O_PATH, which is handled at the VFS layer. - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP case linux.S_IFSOCK: if d.isSynthetic() { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } if d.fs.iopts.OpenSocketsByConnecting { return d.openSocketByConnecting(ctx, opts) @@ -999,7 +985,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) if err != nil { @@ -1020,7 +1006,7 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // We assume that the server silently inserts O_NONBLOCK in the open flags // for all named pipes (because all existing gofers do this). 
@@ -1188,7 +1174,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st return "", err } if !d.isSymlink() { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } return d.readlink(ctx, rp.Mount()) } @@ -1205,24 +1191,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if fs.opts.interop == InteropModeShared && opts.Flags&linux.RENAME_NOREPLACE != 0 { // Requires 9P support to synchronize with other remote filesystem // users. - return syserror.EINVAL + return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -1261,7 +1247,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { - return syserror.EINVAL + return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { @@ -1270,7 +1256,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -1291,7 +1277,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var replacedVFSD *vfs.Dentry if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { @@ -1299,11 +1285,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return 
syserror.EISDIR } if genericIsAncestorDentry(replaced, renamed) { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } } @@ -1508,7 +1494,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return d.endpoint, nil } } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 496e31e34..25d2e39d6 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -319,7 +319,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mfp := pgalloc.MemoryFileProviderFromContext(ctx) if mfp == nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } mopts := vfs.GenericParseMountOptions(opts.Data) @@ -355,7 +355,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts.interop = InteropModeShared default: ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } @@ -366,7 +366,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // In Linux, dfltuid is interpreted as a UID and is converted to a KUID // in the caller's user namespace, but goferfs isn't @@ -379,7 +379,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt dfltgid, err := 
strconv.ParseUint(dfltgidstr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.dfltgid = auth.KGID(dfltgid) } @@ -391,7 +391,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt msize, err := strconv.ParseUint(msizestr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: %s=%s", moptMsize, msizestr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.msize = uint32(msize) } @@ -410,7 +410,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxCachedDentries, err := strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.maxCachedDentries = maxCachedDentries } @@ -434,14 +434,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Check for unparsed options. if len(mopts) != 0 { ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Handle internal options. iopts, ok := opts.InternalData.(InternalFilesystemOptions) if opts.InternalData != nil && !ok { ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // If !ok, iopts being the zero value is correct. 
@@ -504,7 +504,7 @@ func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int trans, ok := mopts[moptTransport] if !ok || trans != transportModeFD { ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } delete(mopts, moptTransport) @@ -512,28 +512,28 @@ func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int rfdstr, ok := mopts[moptReadFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } delete(mopts, moptReadFD) rfd, err := strconv.Atoi(rfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } wfdstr, ok := mopts[moptWriteFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } delete(mopts, moptWriteFD) wfd, err := strconv.Atoi(wfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } if rfd != wfd { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } return rfd, nil } @@ -582,10 +582,10 @@ func (fs *filesystem) Release(ctx context.Context) { d.dataMu.Unlock() // Close host FDs if they exist. 
if d.readFD >= 0 { - unix.Close(int(d.readFD)) + _ = unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { - unix.Close(int(d.writeFD)) + _ = unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 @@ -947,10 +947,10 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. // Precondition: d.metadataMu must be locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { - d.metadataMu.Unlock() panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) } atomic.StoreUint32(&d.mode, uint32(attr.Mode)) @@ -989,6 +989,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { d.handleMu.RLock() @@ -1020,6 +1021,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // Preconditions: // * !d.isSynthetic(). // * d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { // Use d.readFile or d.writeFile, which represent 9P FIDs that have been // opened, in preference to d.file, which represents a 9P fid that has not. @@ -1044,7 +1046,8 @@ func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) if handleMuRLocked { - d.handleMu.RUnlock() // must be released before updateFromP9AttrsLocked() + // handleMu must be released before updateFromP9AttrsLocked(). + d.handleMu.RUnlock() // +checklocksforce: complex case. 
} if err != nil { return err @@ -1091,7 +1094,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM + return linuxerr.EPERM } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { @@ -1111,7 +1114,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs case linux.S_IFDIR: return syserror.EISDIR default: - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -1158,6 +1161,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if !d.isSynthetic() { if stat.Mask != 0 { + if stat.Mask&linux.STATX_SIZE != 0 { + // d.dataMu must be held around the update to both the remote + // file's size and d.size to serialize with writeback (which + // might otherwise write data back up to the old d.size after + // the remote file has been truncated). + d.dataMu.Lock() + } if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, @@ -1177,13 +1187,16 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs MTimeSeconds: uint64(stat.Mtime.Sec), MTimeNanoSeconds: uint64(stat.Mtime.Nsec), }); err != nil { + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } return err } if stat.Mask&linux.STATX_SIZE != 0 { // d.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. 
- d.updateSizeLocked(stat.Size) + d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above } } if d.fs.opts.interop == InteropModeShared { @@ -1246,6 +1259,14 @@ func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate // Preconditions: d.metadataMu must be locked. func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() + d.updateSizeAndUnlockDataMuLocked(newSize) +} + +// Preconditions: d.metadataMu and d.dataMu must be locked. +// +// Postconditions: d.dataMu is unlocked. +// +checklocksrelease:d.dataMu +func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { oldSize := d.size atomic.StoreUint64(&d.size, newSize) // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings @@ -1254,9 +1275,9 @@ func (d *dentry) updateSizeLocked(newSize uint64) { // contents beyond the new d.size. (We are still holding d.metadataMu, // so we can't race with Write or another truncate.) d.dataMu.Unlock() - if d.size < oldSize { + if newSize < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) - newpgend, _ := hostarch.PageRoundUp(d.size) + newpgend, _ := hostarch.PageRoundUp(newSize) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ @@ -1272,8 +1293,8 @@ func (d *dentry) updateSizeLocked(newSize uint64) { // truncated pages have been removed from the remote file, they // should be dropped without being written back. d.dataMu.Lock() - d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) - d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) + d.cache.Truncate(newSize, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) d.dataMu.Unlock() } } @@ -1289,7 +1310,7 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats // to the remote filesystem. This is inconsistent with Linux's 9p client, // but consistent with other filesystems (e.g. FUSE). 
if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) kuid := auth.KUID(atomic.LoadUint32(&d.uid)) @@ -1470,7 +1491,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo if d.isDeleted() { d.watches.HandleDeletion(ctx) } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return } // If d still has inotify watches and it is not deleted or invalidated, it @@ -1498,7 +1519,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo delete(d.parent.children, d.name) d.parent.dirMu.Unlock() } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: see above. return } @@ -1527,7 +1548,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } - d.fs.evictCachedDentryLocked(ctx) + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } @@ -1544,6 +1565,7 @@ func (d *dentry) removeFromCacheLocked() { // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { for fs.cachedDentriesLen != 0 { fs.evictCachedDentryLocked(ctx) @@ -1552,6 +1574,7 @@ func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { // Preconditions: // * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { fs.cacheMu.Lock() victim := fs.cachedDentries.Back() @@ -1588,7 +1611,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // will try to acquire fs.renameMu (which we have already acquired). 
Hence, // fs.renameMu will synchronize the destroy attempts. victim.cachingMu.Unlock() - victim.destroyLocked(ctx) + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. } // destroyLocked destroys the dentry. @@ -1598,6 +1621,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // * d.refs == 0. // * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. +// +checklocks:d.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: @@ -1631,18 +1655,18 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.dataMu.Unlock() // Clunk open fids and close open host FDs. if !d.readFile.isNil() { - d.readFile.close(ctx) + _ = d.readFile.close(ctx) } if !d.writeFile.isNil() && d.readFile != d.writeFile { - d.writeFile.close(ctx) + _ = d.writeFile.close(ctx) } d.readFile = p9file{} d.writeFile = p9file{} if d.readFD >= 0 { - unix.Close(int(d.readFD)) + _ = unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { - unix.Close(int(d.writeFD)) + _ = unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 @@ -1704,7 +1728,7 @@ func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size ui func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.file.isNil() { - return "", syserror.ENODATA + return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err @@ -1714,7 +1738,7 @@ func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vf func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.file.isNil() { - return syserror.EPERM + return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err @@ -1724,7 +1748,7 @@ func (d *dentry) 
setXattr(ctx context.Context, creds *auth.Credentials, opts *vf func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.file.isNil() { - return syserror.EPERM + return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 340fea813..947dbe05f 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" @@ -34,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -79,17 +79,22 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } - // Skip flushing if there are client-buffered writes, since (as with the - // VFS1 client) we don't flush buffered writes on close anyway. d := fd.dentry() - if d.fs.opts.interop != InteropModeExclusive { - return nil - } - d.dataMu.RLock() - haveDirtyPages := !d.dirty.IsEmpty() - d.dataMu.RUnlock() - if haveDirtyPages { - return nil + if d.fs.opts.interop == InteropModeExclusive { + // d may have dirty pages that we won't write back now (and wouldn't + // have in VFS1), making a flushf RPC ineffective. If this is the case, + // skip the flushf. + // + // Note that it's also possible to have dirty pages under other interop + // modes if forcePageCache is in effect; we conservatively assume that + // applications have some way of tolerating this and still want the + // flushf. 
+ d.dataMu.RLock() + haveDirtyPages := !d.dirty.IsEmpty() + d.dataMu.RUnlock() + if haveDirtyPages { + return nil + } } d.handleMu.RLock() defer d.handleMu.RUnlock() @@ -124,14 +129,14 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs }() if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } // Check for reading at EOF before calling into MM (but not under @@ -194,14 +199,14 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() @@ -297,7 +302,7 @@ func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64 pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange @@ -652,20 +657,20 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 offset += size case linux.SEEK_DATA: if offset > size { - return 0, syserror.ENXIO + return 0, linuxerr.ENXIO } // Use offset as specified. 
case linux.SEEK_HOLE: if offset > size { - return 0, syserror.ENXIO + return 0, linuxerr.ENXIO } offset = size } default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return offset, nil } @@ -695,7 +700,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // All mappings require a host FD to be coherent with other // filesystem users. if atomic.LoadInt32(&d.mmapFD) < 0 { - return syserror.ENODEV + return linuxerr.ENODEV } default: panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) @@ -707,14 +712,8 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } -func (d *dentry) mayCachePages() bool { - if d.fs.opts.forcePageCache { - return true - } - if d.fs.opts.interop == InteropModeShared { - return false - } - return atomic.LoadInt32(&d.mmapFD) >= 0 +func (fs *filesystem) mayCachePagesInMemoryFile() bool { + return fs.opts.forcePageCache || fs.opts.interop != InteropModeShared } // AddMapping implements memmap.Mappable.AddMapping. @@ -726,7 +725,7 @@ func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar host for _, r := range mapped { d.pf.hostFileMapper.IncRefOn(r) } - if d.mayCachePages() { + if d.fs.mayCachePagesInMemoryFile() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. mf := d.fs.mfp.MemoryFile() @@ -745,7 +744,7 @@ func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar h for _, r := range unmapped { d.pf.hostFileMapper.DecRefOn(r) } - if d.mayCachePages() { + if d.fs.mayCachePagesInMemoryFile() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. 
diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 8f81f0822..226790a11 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -247,16 +247,16 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF if found && !d.isSynthetic() { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. } - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. continue } // Note that synthetic dentries will always fails the comparison check // below. if !found || d.qidPath != stats[i].QID.Path { - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. @@ -298,7 +298,7 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // The file at this path hasn't changed. Just update cached metadata. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. d.metadataMu.Unlock() } @@ -354,6 +354,7 @@ func (r *revalidateState) add(name string, d *dentry) { r.dentries = append(r.dentries, d) } +// +checklocksignore func (r *revalidateState) lockAllMetadata() { for _, d := range r.dentries { d.metadataMu.Lock() @@ -372,6 +373,7 @@ func (r *revalidateState) popFront() *dentry { // reset releases all metadata locks and resets all fields to allow this // instance to be reused. +// +checklocksignore func (r *revalidateState) reset() { if r.locked { // Unlock any remaining dentries. 
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 3d7b5506e..4b59c1c3c 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -42,6 +42,11 @@ import ( type specialFileFD struct { fileDescription + // releaseMu synchronizes the closing of fd.handle with fd.sync(). It's safe + // to access fd.handle without locking for operations that require a ref to + // be held by the caller, e.g. vfs.FileDescriptionImpl implementations. + releaseMu sync.RWMutex `state:"nosave"` + // handle is used for file I/O. handle is immutable. handle handle `state:"nosave"` @@ -117,7 +122,10 @@ func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } + fd.releaseMu.Lock() fd.handle.close(ctx) + fd.releaseMu.Unlock() + fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) @@ -184,14 +192,14 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs }() if fd.seekable && offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if d := fd.dentry(); d.cachedMetadataAuthoritative() { @@ -264,14 +272,14 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // offset should be ignored by PWrite. func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if fd.seekable && offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. 
if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() @@ -355,7 +363,7 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if !fd.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } fd.mu.Lock() defer fd.mu.Unlock() @@ -373,6 +381,13 @@ func (fd *specialFileFD) Sync(ctx context.Context) error { } func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { + // Locks to ensure it didn't race with fd.Release(). + fd.releaseMu.RLock() + defer fd.releaseMu.RUnlock() + + if !fd.handle.isOpen() { + return nil + } err := func() error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index 2ec819f86..dbd834c67 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -41,7 +41,7 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { d.haveTarget = true d.target = target } - d.dataMu.Unlock() + d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. } return target, err } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 2dbfbdecf..89aa7b3d9 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -42,6 +42,36 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// These are the modes that are stored with virtualOwner. +const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID + +// +stateify savable +type virtualOwner struct { + // This field is initialized at creation time and is immutable. + enabled bool + + // mu protects the fields below and they can be accessed using atomic memory + // operations. 
+ mu sync.Mutex `state:"nosave"` + uid uint32 + gid uint32 + // mode is also stored, otherwise setting the host file to `0000` could remove + // access to the file. + mode uint32 +} + +func (v *virtualOwner) atomicUID() uint32 { + return atomic.LoadUint32(&v.uid) +} + +func (v *virtualOwner) atomicGID() uint32 { + return atomic.LoadUint32(&v.gid) +} + +func (v *virtualOwner) atomicMode() uint32 { + return atomic.LoadUint32(&v.mode) +} + // inode implements kernfs.Inode. // // +stateify savable @@ -98,6 +128,11 @@ type inode struct { // Event queue for blocking operations. queue waiter.Queue + // virtualOwner caches ownership and permission information to override the + // underlying file owner and permission. This is used to allow the unstrusted + // application to change these fields without affecting the host. + virtualOwner virtualOwner + // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data // read from the pipe from previous calls to inode.beforeSave(). haveBuf // and buf are protected by bufMu. haveBuf is accessed using atomic memory @@ -115,7 +150,7 @@ func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fil // be memory-mappable. if !seekable && fileType == unix.S_IFREG { ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) - return nil, syserror.ESPIPE + return nil, linuxerr.ESPIPE } i := &inode{ @@ -147,7 +182,7 @@ func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fil type NewFDOptions struct { // If Savable is true, the host file descriptor may be saved/restored by // numeric value; the sandbox API requires a corresponding host FD with the - // same numeric value to be provieded at time of restore. + // same numeric value to be provided at time of restore. Savable bool // If IsTTY is true, the file descriptor is a TTY. @@ -157,6 +192,12 @@ type NewFDOptions struct { // the new file description will inherit flags from hostFD. 
HaveFlags bool Flags uint32 + + // VirtualOwner allow the host file to have owner and permissions different + // than the underlying host file. + VirtualOwner bool + UID auth.KUID + GID auth.KGID } // NewFD returns a vfs.FileDescription representing the given host file @@ -168,8 +209,8 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) } // Retrieve metadata. - var s unix.Stat_t - if err := unix.Fstat(hostFD, &s); err != nil { + var stat unix.Stat_t + if err := unix.Fstat(hostFD, &stat); err != nil { return nil, err } @@ -183,11 +224,19 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) flags = uint32(flagsInt) } - d := &kernfs.Dentry{} - i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY) + fileType := linux.FileMode(stat.Mode).FileType() + i, err := newInode(ctx, fs, hostFD, opts.Savable, fileType, opts.IsTTY) if err != nil { return nil, err } + if opts.VirtualOwner { + i.virtualOwner.enabled = true + i.virtualOwner.uid = uint32(opts.UID) + i.virtualOwner.gid = uint32(opts.GID) + i.virtualOwner.mode = stat.Mode + } + + d := &kernfs.Dentry{} d.Init(&fs.Filesystem, i) // i.open will take a reference on d. @@ -196,15 +245,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, // etc.), and use the offset from the host fd otherwise when importing. - return i.open(ctx, d, mnt, flags) -} - -// ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { - return NewFD(ctx, mnt, hostFD, &NewFDOptions{ - Savable: true, - IsTTY: isTTY, - }) + return i.open(ctx, d, mnt, fileType, flags) } // filesystemType implements vfs.FilesystemType. 
@@ -270,7 +311,7 @@ func (fs *filesystem) MountOptions() string { // CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + if err := i.stat(&s); err != nil { return err } return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) @@ -279,7 +320,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a // Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + if err := i.stat(&s); err != nil { // Retrieving the mode from the host fd using fstat(2) should not fail. // If the syscall does not succeed, something is fundamentally wrong. panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) @@ -290,10 +331,10 @@ func (i *inode) Mode() linux.FileMode { // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { - return linux.Statx{}, syserror.EINVAL + return linux.Statx{}, linuxerr.EINVAL } if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { - return linux.Statx{}, syserror.EINVAL + return linux.Statx{}, linuxerr.EINVAL } fs := vfsfs.Impl().(*filesystem) @@ -306,7 +347,7 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp // Fallback to fstat(2), if statx(2) is not supported on the host. // // TODO(b/151263641): Remove fallback. - return i.fstat(fs) + return i.statxFromStat(fs) } if err != nil { return linux.Statx{}, err @@ -330,19 +371,35 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp // device numbers. 
ls.Mask |= s.Mask & linux.STATX_ALL if s.Mask&linux.STATX_TYPE != 0 { - ls.Mode |= s.Mode & linux.S_IFMT + if i.virtualOwner.enabled { + ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT + } else { + ls.Mode |= s.Mode & linux.S_IFMT + } } if s.Mask&linux.STATX_MODE != 0 { - ls.Mode |= s.Mode &^ linux.S_IFMT + if i.virtualOwner.enabled { + ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT + } else { + ls.Mode |= s.Mode &^ linux.S_IFMT + } } if s.Mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } if s.Mask&linux.STATX_UID != 0 { - ls.UID = s.Uid + if i.virtualOwner.enabled { + ls.UID = i.virtualOwner.atomicUID() + } else { + ls.UID = s.Uid + } } if s.Mask&linux.STATX_GID != 0 { - ls.GID = s.Gid + if i.virtualOwner.enabled { + ls.GID = i.virtualOwner.atomicGID() + } else { + ls.GID = s.Gid + } } if s.Mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) @@ -366,7 +423,7 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp return ls, nil } -// fstat is a best-effort fallback for inode.Stat() if the host does not +// statxFromStat is a best-effort fallback for inode.Stat() if the host does not // support statx(2). // // We ignore the mask and sync flags in opts and simply supply @@ -374,9 +431,9 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp // of a mask or sync flags. fstat(2) does not provide any metadata // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so // those fields remain empty. 
-func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { +func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) { var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + if err := i.stat(&s); err != nil { return linux.Statx{}, err } @@ -400,7 +457,21 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { }, nil } +func (i *inode) stat(stat *unix.Stat_t) error { + if err := unix.Fstat(i.hostFD, stat); err != nil { + return err + } + if i.virtualOwner.enabled { + stat.Uid = i.virtualOwner.atomicUID() + stat.Gid = i.virtualOwner.atomicGID() + stat.Mode = i.virtualOwner.atomicMode() + } + return nil +} + // SetStat implements kernfs.Inode.SetStat. +// +// +checklocksignore func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := &opts.Stat @@ -408,11 +479,22 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if m == 0 { return nil } - if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { - return syserror.EPERM + supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME) + if i.virtualOwner.enabled { + if m&virtualOwnerModes != 0 { + // Take lock if any of the virtual owner fields will be updated. 
+ i.virtualOwner.mu.Lock() + defer i.virtualOwner.mu.Unlock() + } + + supportedModes |= virtualOwnerModes } + if m&^supportedModes != 0 { + return linuxerr.EPERM + } + var hostStat unix.Stat_t - if err := unix.Fstat(i.hostFD, &hostStat); err != nil { + if err := i.stat(&hostStat); err != nil { return err } if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { @@ -420,13 +502,17 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } if m&linux.STATX_MODE != 0 { - if err := unix.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { - return err + if i.virtualOwner.enabled { + i.virtualOwner.mode = uint32(opts.Stat.Mode) + } else { + if err := unix.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { + return err + } } } if m&linux.STATX_SIZE != 0 { if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { - return syserror.EINVAL + return linuxerr.EINVAL } if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err @@ -449,6 +535,14 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre return err } } + if i.virtualOwner.enabled { + if m&linux.STATX_UID != 0 { + i.virtualOwner.uid = opts.Stat.UID + } + if m&linux.STATX_GID != 0 { + i.virtualOwner.gid = opts.Stat.GID + } + } return nil } @@ -471,18 +565,17 @@ func (i *inode) DecRef(ctx context.Context) { func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. 
if i.Mode().FileType() == linux.S_IFSOCK { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } - return i.open(ctx, d, rp.Mount(), opts.Flags) -} - -func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { - var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + var stat unix.Stat_t + if err := i.stat(&stat); err != nil { return nil, err } - fileType := s.Mode & linux.FileTypeMask + fileType := linux.FileMode(stat.Mode).FileType() + return i.open(ctx, d, rp.Mount(), fileType, opts.Flags) +} +func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) { // Constrain flags to a subset we can handle. // // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. @@ -492,7 +585,7 @@ func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flag case unix.S_IFSOCK: if i.isTTY { log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) - return nil, syserror.ENOTTY + return nil, linuxerr.ENOTTY } ep, err := newEndpoint(ctx, i.hostFD, &i.queue) @@ -530,7 +623,7 @@ func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flag default: log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) - return nil, syserror.EPERM + return nil, linuxerr.EPERM } } @@ -585,12 +678,12 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 
if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } i := f.inode if !i.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) @@ -602,7 +695,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } i := f.inode @@ -661,7 +754,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off // PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if !f.inode.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } return f.writeToHostFD(ctx, src, offset, opts.Flags) @@ -701,7 +794,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque hostFD := f.inode.hostFD // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) @@ -722,7 +815,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { i := f.inode if !i.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } f.offsetMu.Lock() @@ -731,17 +824,17 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i switch whence { case linux.SEEK_SET: if offset < 0 { - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } f.offset = offset case linux.SEEK_CUR: // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. 
if offset > math.MaxInt64-f.offset { - return f.offset, syserror.EOVERFLOW + return f.offset, linuxerr.EOVERFLOW } if f.offset+offset < 0 { - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } f.offset += offset @@ -754,10 +847,10 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // Check for overflow. Note that underflow cannot occur, since size >= 0. if offset > math.MaxInt64-size { - return f.offset, syserror.EOVERFLOW + return f.offset, linuxerr.EOVERFLOW } if size+offset < 0 { - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } f.offset = size + offset @@ -774,7 +867,7 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i default: // Invalid whence. - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } return f.offset, nil @@ -791,7 +884,7 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts // NOTE(b/38213152): Technically, some obscure char devices can be memory // mapped, but we only allow regular files. if f.inode.ftype != unix.S_IFREG { - return syserror.ENODEV + return linuxerr.ENODEV } i := f.inode i.CachedMappable.InitFileMapperOnce() diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 8cce36212..709d5747d 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/waiter" @@ -159,7 +158,7 @@ func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMess if n < totalLen && err == nil { // The host only returns a short write if it would otherwise // block (and only for stream sockets). 
- err = syserror.EAGAIN + err = linuxerr.EAGAIN } if n > 0 && !linuxerr.Equals(linuxerr.EAGAIN, err) { // The caller may need to block to send more data, but diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go index e090bb725..292b44c43 100644 --- a/pkg/sentry/fsimpl/host/socket_iovec.go +++ b/pkg/sentry/fsimpl/host/socket_iovec.go @@ -16,8 +16,8 @@ package host import ( "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostfd" - "gvisor.dev/gvisor/pkg/syserror" ) // copyToMulti copies as many bytes from src to dst as possible. @@ -64,9 +64,9 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec if length > maxlen { if truncate { stopLen = maxlen - err = syserror.EAGAIN + err = linuxerr.EAGAIN } else { - return 0, nil, nil, syserror.EMSGSIZE + return 0, nil, nil, linuxerr.EMSGSIZE } } diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 2cf360065..7f6ce4ee5 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -148,7 +148,7 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // Ignore arg[0]. This is the real FD: @@ -189,7 +189,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch pidns := kernel.PIDNamespaceFromContext(ctx) if pidns == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } t.mu.Lock() @@ -213,14 +213,14 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() // to -ENOTTY. 
if linuxerr.Equals(linuxerr.EIO, err) { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } return 0, err } // Check that calling task's process group is in the TTY session. if task.ThreadGroup().Session() != t.session { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } var pgIDP primitive.Int32 @@ -231,19 +231,19 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // pgID must be non-negative. if pgID < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Process group with pgID must exist in this PID namespace. pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } // Check that new process group is in the TTY session. if pg.Session() != t.session { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } t.fgProcessGroup = pg @@ -303,7 +303,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch unimpl.EmitUnimplementedEvent(ctx) fallthrough default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 84b1c3745..9d7526e47 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -71,7 +71,7 @@ func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *D // inode attributes to be changed. Override SetStat() making it call // f.InodeAttrs to allow it. 
func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a @@ -137,5 +137,5 @@ func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. - return syserror.EPERM + return linuxerr.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index e55111af0..8b008dc10 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -248,10 +249,10 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) } default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.off = offset return offset, nil diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 20319ab76..a97473f7d 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -40,7 +40,7 @@ import ( // Postcondition: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // Directory searchable? 
if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { @@ -71,7 +71,7 @@ afterSymlink: return d.parent, nil } if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } d.dirMu.Lock() next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name]) @@ -170,7 +170,7 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP } } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -197,7 +197,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving } } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -215,13 +215,13 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string return err } if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } if _, ok := parent.children[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } if parent.VFSDentry().IsDead() { return syserror.ENOENT @@ -238,7 +238,7 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { parent := d.parent if parent == nil { - return syserror.EBUSY + return linuxerr.EBUSY } if parent.vfsd.IsDead() { return syserror.ENOENT @@ -318,7 +318,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -345,7 +345,7 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements 
vfs.FilesystemImpl.LinkAt. func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -365,7 +365,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.ENOENT } if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -374,7 +374,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. d := vd.Dentry().Impl().(*Dentry) if d.isDir() { - return syserror.EPERM + return linuxerr.EPERM } childI, err := parent.inode.NewLink(ctx, pc, d.inode) @@ -390,7 +390,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -426,7 +426,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. 
func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -512,7 +512,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return nil, syserror.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err @@ -543,7 +543,7 @@ afterTrailingSymlink: return nil, syserror.EISDIR } if len(pc) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } // Determine whether or not we need to create a file. child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) @@ -577,7 +577,7 @@ afterTrailingSymlink: } // Open existing file or follow symlink. if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) @@ -623,7 +623,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } if !d.isSymlink() { fs.mu.RUnlock() - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // Inode.Readlink() cannot be called holding fs locks. @@ -649,13 +649,13 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Only RENAME_NOREPLACE is supported. if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -681,9 +681,9 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newName := rp.Component() if newName == "." || newName == ".." 
{ if noReplace { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir) @@ -693,7 +693,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa case linuxerr.Equals(linuxerr.EEXIST, err): if noReplace { // Won't overwrite existing node since RENAME_NOREPLACE was requested. - return syserror.EEXIST + return linuxerr.EEXIST } dst = dstDir.children[newName] if dst == nil { @@ -752,7 +752,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa fs.deferDecRef(replaced) replaceVFSD = replaced.VFSDentry() } - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) // +checklocksforce: to may be nil, that's okay. return nil } @@ -774,10 +774,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if !d.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if d.inode.HasChildren() { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } virtfs := rp.VirtualFilesystem() parentDentry := d.parent @@ -788,7 +788,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mntns.DecRef(ctx) vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { - return err + return err // +checklocksforce: vfsd is not locked. } if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { @@ -844,7 +844,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -930,7 +930,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. @@ -943,7 +943,7 @@ func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, err } // kernfs currently does not support extended attributes. - return nil, syserror.ENOTSUP + return nil, linuxerr.ENOTSUP } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. @@ -956,7 +956,7 @@ func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", err } // kernfs currently does not support extended attributes. - return "", syserror.ENOTSUP + return "", linuxerr.ENOTSUP } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. @@ -969,7 +969,7 @@ func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return err } // kernfs currently does not support extended attributes. - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. @@ -982,7 +982,7 @@ func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, return err } // kernfs currently does not support extended attributes. - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // PrependPath implements vfs.FilesystemImpl.PrependPath. 
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 3d0866ecf..a42fc79b4 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -61,27 +62,27 @@ type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewLink implements Inode.NewLink. func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // InodeNotDirectory partially implements the Inode interface, specifically the @@ -158,12 +159,12 @@ type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // Getlink implements Inode.Getlink. 
func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { - return vfs.VirtualDentry{}, "", syserror.EINVAL + return vfs.VirtualDentry{}, "", linuxerr.EINVAL } // InodeAttrs partially implements the Inode interface, specifically the @@ -285,7 +286,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut // allowed by kernfs files but does not do anything. If some other behavior is // needed, the embedder should consider extending SetStat. if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM + return linuxerr.EPERM } if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { return syserror.EISDIR @@ -510,7 +511,7 @@ func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } s := &slot{ name: name, @@ -569,7 +570,7 @@ func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { // Unlink implements Inode.Unlink. func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { - return syserror.EPERM + return linuxerr.EPERM } o.mu.Lock() defer o.mu.Unlock() @@ -599,15 +600,15 @@ func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) e // Postcondition: reference on any replaced dentry transferred to caller. 
func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { if !o.writable { - return syserror.EPERM + return linuxerr.EPERM } dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } if !dst.writable { - return syserror.EPERM + return linuxerr.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename @@ -653,7 +654,7 @@ type InodeSymlink struct { // Open implements Inode.Open. func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP } // StaticDirectory is a standard implementation of a directory with static @@ -709,7 +710,7 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *De // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DecRef implements Inode.DecRef. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 6f699c9cd..0e2867d49 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -52,7 +52,7 @@ // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu // (inode implementation locks, if any) -// kernfs.Filesystem.droppedDentriesMu +// kernfs.Filesystem.deferredDecRefsMu package kernfs import ( @@ -76,12 +76,12 @@ import ( type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex `state:"nosave"` + deferredDecRefsMu sync.Mutex `state:"nosave"` - // droppedDentries is a list of dentries waiting to be DecRef()ed. This is + // deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for - // writing. 
Protected by droppedDentriesMu. - droppedDentries []*Dentry + // writing. Protected by deferredDecRefsMu. + deferredDecRefs []refsvfs2.RefCounter // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -131,25 +131,49 @@ type Filesystem struct { // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. // This may be called while Filesystem.mu or Dentry.dirMu is locked. -func (fs *Filesystem) deferDecRef(d *Dentry) { - fs.droppedDentriesMu.Lock() - fs.droppedDentries = append(fs.droppedDentries, d) - fs.droppedDentriesMu.Unlock() +func (fs *Filesystem) deferDecRef(d refsvfs2.RefCounter) { + fs.deferredDecRefsMu.Lock() + fs.deferredDecRefs = append(fs.deferredDecRefs, d) + fs.deferredDecRefsMu.Unlock() +} + +// SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred +// in case Filesystem.mu is held. See comment on Filesystem.mu. +func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) { + if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // Only defer if dentry belongs to this filesystem, since locks cannot cross + // filesystems. + fs.deferDecRef(fd) + return + } + fd.DecRef(ctx) +} + +// SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred +// in case Filesystem.mu is held. See comment on Filesystem.mu. +func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // Only defer if dentry belongs to this filesystem, since locks cannot cross + // filesystems. + fs.deferDecRef(&vd) + return + } + vd.DecRef(ctx) } // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the -// droppedDentries list. See comment on Filesystem.mu. +// deferredDecRefs list. See comment on Filesystem.mu. 
// // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { - fs.droppedDentriesMu.Lock() - for _, d := range fs.droppedDentries { - // Defer the DecRef call so that we are not holding droppedDentriesMu + fs.deferredDecRefsMu.Lock() + for _, d := range fs.deferredDecRefs { + // Defer the DecRef call so that we are not holding deferredDecRefsMu // when DecRef is called. defer d.DecRef(ctx) } - fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. - fs.droppedDentriesMu.Unlock() + fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse. + fs.deferredDecRefsMu.Unlock() } // VFSFilesystem returns the generic vfs filesystem object. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index de046ce1f..609887943 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -95,7 +94,7 @@ type attrs struct { } func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } type readonlyDir struct { @@ -197,15 +196,15 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (k } func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } func (fsType) Name() string { diff --git 
a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index a0736c0d6..4adf76ce6 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -17,9 +17,9 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // StaticSymlink provides an Inode implementation for symlinks that point to @@ -62,5 +62,5 @@ func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go index 11694c392..c91d23b56 100644 --- a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // syntheticDirectory implements kernfs.Inode for a directory created by @@ -65,13 +65,13 @@ func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, // NewFile implements Inode.NewFile. func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. 
func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { if !opts.ForSyntheticMountpoint { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { @@ -84,17 +84,17 @@ func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs // NewLink implements Inode.NewLink. func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // DecRef implements Inode.DecRef. diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 8fd51e9d0..1f85a1f0d 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -52,13 +52,13 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { // Can be copied-up. default: // Can't be copied-up. - return syserror.EPERM + return linuxerr.EPERM } // Ensure that our parent directory is copied-up. if d.parent == nil { // d is a filesystem root with no upper layer. 
- return syserror.EROFS + return linuxerr.EROFS } if err := d.parent.copyUpLocked(ctx); err != nil { return err @@ -272,7 +272,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if upperStat.Mask&linux.STATX_INO == 0 { cleanupUndoCopyUp() - return syserror.EREMOTE + return linuxerr.EREMOTE } atomic.StoreUint32(&d.devMajor, upperStat.DevMajor) atomic.StoreUint32(&d.devMinor, upperStat.DevMinor) diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index df4492346..ad3cdbb56 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isDir() bool { @@ -69,7 +69,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string return nil } // Non-whiteout file in the directory prevents rmdir. - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY })) if err != nil { readdirErr = err @@ -88,7 +88,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string } if stat.RdevMajor != 0 || stat.RdevMinor != 0 { // This file is a real character device, not a whiteout. 
- readdirErr = syserror.ENOTEMPTY + readdirErr = linuxerr.ENOTEMPTY return false } whiteouts[maybeWhiteoutName] = isUpper @@ -256,7 +256,7 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in switch whence { case linux.SEEK_SET: if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls @@ -268,13 +268,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in case linux.SEEK_CUR: offset += fd.off if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 81745bccd..5e89928c5 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -87,7 +87,7 @@ func putDentrySlice(ds *[]*dentry) { // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. // -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -113,7 +113,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]* putDentrySlice(*dsp) } -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -138,7 +138,7 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // * !rp.Done(). 
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, lookupLayer, error) { if !d.isDir() { - return nil, lookupLayerNone, syserror.ENOTDIR + return nil, lookupLayerNone, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, lookupLayerNone, err @@ -246,7 +246,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str return false } if stat.Mask&mask != mask { - lookupErr = syserror.EREMOTE + lookupErr = linuxerr.EREMOTE return false } @@ -366,7 +366,7 @@ func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, nam // Linux's overlayfs tends to return EREMOTE in cases where a file // is unusable for reasons that are not better captured by another // errno. - lookupErr = syserror.EREMOTE + lookupErr = linuxerr.EREMOTE return false } if isWhiteout(&stat) { @@ -438,7 +438,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving d = next } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -458,7 +458,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, d = next } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -480,7 +480,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } name := rp.Component() if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if parent.vfsd.IsDead() { return syserror.ENOENT @@ -495,14 +495,14 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // Determine if a file already exists at name. 
if _, ok := parent.children[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } childLayer, err := fs.lookupLayerLocked(ctx, parent, name) if err != nil { return err } if childLayer.existsInOverlay() { - return syserror.EEXIST + return linuxerr.EEXIST } if !dir && rp.MustBeDir() { @@ -593,7 +593,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -621,11 +621,11 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } old := vd.Dentry().Impl().(*dentry) if old.isDir() { - return syserror.EPERM + return linuxerr.EPERM } if err := old.copyUpLocked(ctx); err != nil { return err @@ -726,7 +726,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { // Disallow attempts to create whiteouts. 
if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { - return syserror.EPERM + return linuxerr.EPERM } vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ @@ -783,7 +783,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return nil, syserror.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if start.isRegularFile() && mayWrite { if err := start.copyUpLocked(ctx); err != nil { @@ -823,7 +823,7 @@ afterTrailingSymlink: } // Open existing child or follow symlink. if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx) @@ -837,7 +837,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if rp.MustBeDir() && !child.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if child.isRegularFile() && mayWrite { if err := child.copyUpLocked(ctx); err != nil { @@ -872,7 +872,7 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * return nil, syserror.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fd := &directoryFD{} fd.LockFD.Init(&d.locks) @@ -1028,19 +1028,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." 
{ if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -1065,7 +1065,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { - return syserror.EINVAL + return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { @@ -1074,7 +1074,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -1100,7 +1100,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { @@ -1108,7 +1108,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.EISDIR } if genericIsAncestorDentry(replaced, renamed) { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } replaced.dirMu.Lock() defer replaced.dirMu.Unlock() @@ -1118,7 +1118,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } } @@ -1286,10 +1286,10 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer rp.Mount().EndWrite() name := rp.Component() if name == "." { - return syserror.EINVAL + return linuxerr.EINVAL } if name == ".." 
{ - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) @@ -1310,7 +1310,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if !child.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err @@ -1536,7 +1536,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EISDIR } if rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) @@ -1659,7 +1659,7 @@ func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Crede // Return EOPNOTSUPP when fetching an overlay attribute. // See fs/overlayfs/super.c:ovl_own_xattr_get(). if isOverlayXattr(opts.Name) { - return "", syserror.EOPNOTSUPP + return "", linuxerr.EOPNOTSUPP } // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get(). @@ -1697,7 +1697,7 @@ func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mo // Return EOPNOTSUPP when setting an overlay attribute. // See fs/overlayfs/super.c:ovl_own_xattr_set(). if isOverlayXattr(opts.Name) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set(). @@ -1742,7 +1742,7 @@ func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs // Linux passes the remove request to xattr_handler->set. // See fs/xattr.c:vfs_removexattr(). 
if isOverlayXattr(name) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } if err := mnt.CheckBeginWrite(); err != nil { diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 454c20d4f..46d9f1f1d 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -40,13 +40,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. @@ -135,7 +135,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts, ok := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !ok { ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } vfsroot := vfs.RootFromContext(ctx) if vfsroot.Ok() { @@ -145,7 +145,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if upperPathname, ok := mopts["upperdir"]; ok { if fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "upperdir") // Linux overlayfs also requires a workdir when upperdir is @@ -154,7 +154,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } 
upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, @@ -181,7 +181,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { if len(fsopts.LowerRoots) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") @@ -189,7 +189,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, @@ -216,21 +216,21 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if len(mopts) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) == 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(fsopts.LowerRoots) > maxLowerLayers { ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Take 
extra references held by the filesystem. @@ -277,13 +277,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if rootStat.Mask&rootStatMask != rootStatMask { root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EREMOTE + return nil, nil, linuxerr.EREMOTE } if isWhiteout(&rootStat) { ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } root.mode = uint32(rootStat.Mode) root.uid = rootStat.UID diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go index 82491a0f8..156ffeaeb 100644 --- a/pkg/sentry/fsimpl/overlay/regular_file.go +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -415,7 +415,7 @@ func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOp // Only permit mmap of regular files, since other file types may have // unpredictable behavior when mmapped (e.g. /dev/zero). if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { - return syserror.ENODEV + return linuxerr.ENODEV } // Get a Mappable for the current top layer. 
diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 278ee3c92..a50510031 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/sentry/fsimpl/kernfs", @@ -16,6 +17,5 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 08aedc2ad..af09195a7 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -152,7 +152,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. if opts.Stat.Mask == 0 { return nil } - return syserror.EPERM + return linuxerr.EPERM } // Open implements kernfs.Inode.Open. 
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index ce8f55b1f..f2697c12d 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -76,7 +76,7 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index c53cc0122..d99f90b36 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -180,7 +181,7 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index d05cc1508..f54811edf 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // taskInode represents the inode for /proc/PID/ directory. @@ -49,7 +49,7 @@ var _ kernfs.Inode = (*taskInode)(nil) func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, fakeCgroupControllers map[string]string) (kernfs.Inode, error) { if task.ExitState() == kernel.TaskExitDead { - return nil, syserror.ESRCH + return nil, linuxerr.ESRCH } contents := map[string]kernfs.Inode{ @@ -65,8 +65,8 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns "io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), "maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}), "mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400), - "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{task: task}), + "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}), + "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}), "net": fs.newTaskNetDir(ctx, task), "ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "net"), @@ -78,7 +78,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns 
"smaps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}), "stat": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), "statm": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}), - "status": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "status": fs.newStatusInode(ctx, task, pidns, fs.NextIno(), 0444), "uid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { @@ -124,7 +124,7 @@ func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.D // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 4718fac7a..dfc0a924e 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -42,12 +42,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, fs *filesystem, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef(ctx) + fs.SafeDecRefFD(ctx, file) return true } @@ -145,7 +145,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(ctx, i.task, fd) { + if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, syserror.ENOENT } return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil @@ -198,6 +198,7 @@ type fdSymlink struct { kernfs.InodeNoopRefCount 
kernfs.InodeSymlink + fs *filesystem task *kernel.Task fd int32 } @@ -206,6 +207,7 @@ var _ kernfs.Inode = (*fdSymlink)(nil) func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ + fs: fs, task: task, fd: fd, } @@ -218,9 +220,9 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if file == nil { return "", syserror.ENOENT } - defer file.DecRef(ctx) + defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) // Note: it's safe to reenter kernfs from Readlink if needed to resolve path. return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) @@ -231,7 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } - defer file.DecRef(ctx) + defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil @@ -239,7 +241,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // Valid implements kernfs.Inode.Valid. func (s *fdSymlink) Valid(ctx context.Context) bool { - return taskFDExists(ctx, s.task, s.fd) + return taskFDExists(ctx, s.fs, s.task, s.fd) } // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. 
@@ -279,10 +281,11 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(ctx, i.task, fd) { + if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ + fs: i.fs, task: i.task, fd: fd, } @@ -316,6 +319,7 @@ func (i *fdInfoDirInode) DecRef(ctx context.Context) { type fdInfoData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task fd int32 } @@ -328,7 +332,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { if file == nil { return syserror.ENOENT } - defer file.DecRef(ctx) + defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt @@ -339,5 +343,5 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Valid implements kernfs.Inode.Valid. func (d *fdInfoData) Valid(ctx context.Context) bool { - return taskFDExists(ctx, d.task, d.fd) + return taskFDExists(ctx, d.fs, d.task, d.fd) } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index b294dfd6a..0ce3ed797 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -70,9 +71,9 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { func checkTaskState(t *kernel.Task) error { switch t.ExitState() { case kernel.TaskExitZombie: - return syserror.EACCES + return linuxerr.EACCES case kernel.TaskExitDead: - return syserror.ESRCH + return linuxerr.ESRCH } return nil } @@ -109,7 +110,7 @@ var _ dynamicInode = (*auxvData)(nil) // Generate implements 
vfs.DynamicBytesSource.Generate. func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.task.ExitState() == kernel.TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } m, err := getMMIncRef(d.task) if err != nil { @@ -159,7 +160,7 @@ var _ dynamicInode = (*cmdlineData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.task.ExitState() == kernel.TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } m, err := getMMIncRef(d.task) if err != nil { @@ -227,7 +228,7 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { if int(arEnvv.Length()) > remaining { end, ok := arEnvv.Start.AddLength(uint64(remaining)) if !ok { - return syserror.EFAULT + return linuxerr.EFAULT } arEnvv.End = end } @@ -325,7 +326,7 @@ func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset in // the file ..." - user_namespaces(7) srclen := src.NumBytes() if srclen >= hostarch.PageSize || offset != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } b := make([]byte, srclen) if _, err := src.CopyIn(ctx, b); err != nil { @@ -345,7 +346,7 @@ func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset in } lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) if len(lines) > maxIDMapLines { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries := make([]auth.IDMapEntry, len(lines)) @@ -353,7 +354,7 @@ func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset in var e auth.IDMapEntry _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) if err != nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries[i] = e } @@ -408,7 +409,7 @@ func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.De // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS // Since we dont implement 
setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH if !kernel.ContextCanTrace(ctx, f.task, true) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } if err := checkTaskState(f.task); err != nil { return nil, err @@ -422,7 +423,7 @@ func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.De // SetStat implements kernfs.Inode.SetStat. func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } var _ vfs.FileDescriptionImpl = (*memFD)(nil) @@ -461,10 +462,10 @@ func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, e case linux.SEEK_CUR: offset += fd.offset default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.offset = offset return offset, nil @@ -485,7 +486,7 @@ func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64 n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) if n > 0 { if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return int64(n), nil } @@ -512,7 +513,7 @@ func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, e // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // Release implements vfs.FileDescriptionImpl.Release. @@ -660,34 +661,119 @@ func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// statusInode implements kernfs.Inode for /proc/[pid]/status. 
// // +stateify savable -type statusData struct { - kernfs.DynamicBytesFile +type statusInode struct { + kernfs.InodeAttrs + kernfs.InodeNoStatFS + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink task *kernel.Task pidns *kernel.PIDNamespace + locks vfs.FileLocks +} + +// statusFD implements vfs.FileDescriptionImpl and vfs.DynamicByteSource for +// /proc/[pid]/status. +// +// +stateify savable +type statusFD struct { + statusFDLowerBase + vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD + + vfsfd vfs.FileDescription + + inode *statusInode + task *kernel.Task + pidns *kernel.PIDNamespace + userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns +} + +// statusFDLowerBase is a dumb hack to ensure that statusFD prefers +// vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptinDefaultImpl +// methods. +// +// +stateify savable +type statusFDLowerBase struct { + vfs.FileDescriptionDefaultImpl +} + +func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode { + // Note: credentials are overridden by taskOwnedInode. + inode := &statusInode{ + task: task, + pidns: pidns, + } + inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm) + return &taskOwnedInode{Inode: inode, owner: task} +} + +// Open implements kernfs.Inode.Open. +func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &statusFD{ + inode: s, + task: s.task, + pidns: s.pidns, + userns: rp.Credentials().UserNamespace, + } + fd.LockFD.Init(&s.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + fd.SetDataSource(fd) + return &fd.vfsfd, nil +} + +// SetStat implements kernfs.Inode.SetStat. 
+func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return linuxerr.EPERM } -var _ dynamicInode = (*statusData)(nil) +// Release implements vfs.FileDescriptionImpl.Release. +func (s *statusFD) Release(ctx context.Context) { +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := s.vfsfd.VirtualDentry().Mount().Filesystem() + return s.inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + return linuxerr.EPERM +} // Generate implements vfs.DynamicBytesSource.Generate. -func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) + ppid := kernel.ThreadID(0) if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) if tracer := s.task.Tracer(); tracer != nil { tpid = s.pidns.IDOfTask(tracer) } fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + + creds := s.task.Credentials() + ruid := creds.RealKUID.In(s.userns).OrOverflow() + euid := creds.EffectiveKUID.In(s.userns).OrOverflow() + suid := creds.SavedKUID.In(s.userns).OrOverflow() + rgid := creds.RealKGID.In(s.userns).OrOverflow() + egid := creds.EffectiveKGID.In(s.userns).OrOverflow() + sgid := creds.SavedKGID.In(s.userns).OrOverflow() var fds int var vss, rss, data uint64 s.task.WithMuLocked(func(t *kernel.Task) { @@ -700,12 +786,26 @@ func (s *statusData) Generate(ctx context.Context, buf 
*bytes.Buffer) error { data = mm.VirtualDataSize() } }) + // Filesystem user/group IDs aren't implemented; effective UID/GID are used + // instead. + fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) + fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid) fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + buf.WriteString("Groups:\t ") + // There is a space between each pair of supplemental GIDs, as well as an + // unconditional trailing space that some applications actually depend on. + var sep string + for _, kgid := range creds.ExtraKGIDs { + fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow()) + sep = " " + } + buf.WriteString(" \n") + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) - creds := s.task.Credentials() fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) @@ -762,7 +862,7 @@ var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { if o.task.ExitState() == kernel.TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) return nil @@ -784,7 +884,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset } if o.task.ExitState() == kernel.TaskExitDead { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } if err := o.task.SetOOMScoreAdj(v); err != nil { return 0, err @@ -802,13 +902,17 @@ type exeSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*exeSymlink)(nil) func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { - inode := &exeSymlink{task: task} + inode := &exeSymlink{ + fs: fs, + task: task, + } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -819,14 +923,14 @@ func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if err != nil { return "", err } - defer exec.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, exec) root := vfs.RootFromContext(ctx) if !root.Ok() { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) vfsObj := exec.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) @@ -836,7 +940,7 @@ func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) // Getlink implements kernfs.Inode.Getlink. 
func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { - return vfs.VirtualDentry{}, "", syserror.EACCES + return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err @@ -847,7 +951,7 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent s.task.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - err = syserror.EACCES + err = linuxerr.EACCES return } @@ -856,7 +960,7 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent // (with locks held). exec = mm.Executable() if exec == nil { - err = syserror.ESRCH + err = linuxerr.ESRCH } }) if err != nil { @@ -878,13 +982,17 @@ type cwdSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*cwdSymlink)(nil) func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { - inode := &cwdSymlink{task: task} + inode := &cwdSymlink{ + fs: fs, + task: task, + } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -895,14 +1003,14 @@ func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if err != nil { return "", err } - defer cwd.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, cwd) root := vfs.RootFromContext(ctx) if !root.Ok() { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) @@ -912,7 +1020,7 @@ func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) // Getlink implements kernfs.Inode.Getlink. 
func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { - return vfs.VirtualDentry{}, "", syserror.EACCES + return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err @@ -920,8 +1028,9 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent cwd := s.task.FSContext().WorkingDirectoryVFS2() if !cwd.Ok() { // It could have raced with process deletion. - return vfs.VirtualDentry{}, "", syserror.ESRCH + return vfs.VirtualDentry{}, "", linuxerr.ESRCH } + // The reference is transferred to the caller. return cwd, "", nil } @@ -931,6 +1040,7 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent type mountInfoData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task } @@ -951,7 +1061,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef(ctx) + defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -962,6 +1072,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { type mountsData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task } @@ -982,7 +1093,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef(ctx) + defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } @@ -1123,7 +1234,7 @@ func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error // exit this file show a task is in no cgroups, which is incorrect. Instead, // once a task has left its cgroups, we return an error. 
if d.task.ExitState() >= kernel.TaskExitInitiated { - return syserror.ESRCH + return linuxerr.ESRCH } d.task.GenerateProcTaskCgroup(buf) diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 045ed7a2d..03bed22a3 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -53,7 +54,7 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { @@ -69,7 +70,7 @@ func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualD // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // +stateify savable @@ -94,7 +95,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) @@ -111,7 +112,7 @@ func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vi // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. 
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // dynamicBytesFileSetAttr implements a special file that allows inode diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 2bc98a94f..99f64a9d8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" ) @@ -209,7 +209,7 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -257,7 +257,7 @@ func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -311,7 +311,7 @@ func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. 
- return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -396,7 +396,7 @@ func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -449,7 +449,7 @@ func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -467,7 +467,7 @@ func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset i // Port numbers must be uint16s. if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { diff --git a/pkg/sentry/fsimpl/proc/yama.go b/pkg/sentry/fsimpl/proc/yama.go index e039ec45e..7240563d7 100644 --- a/pkg/sentry/fsimpl/proc/yama.go +++ b/pkg/sentry/fsimpl/proc/yama.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -56,7 +56,7 @@ func (s *yamaPtraceScope) Generate(ctx context.Context, buf *bytes.Buffer) error func (s *yamaPtraceScope) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // Ignore partial writes. 
- return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -73,7 +73,7 @@ func (s *yamaPtraceScope) Write(ctx context.Context, src usermem.IOSequence, off // We do not support YAMA levels > YAMA_SCOPE_RELATIONAL. if v < linux.YAMA_SCOPE_DISABLED || v > linux.YAMA_SCOPE_RELATIONAL { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } atomic.StoreInt32(s.level, v) diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 9453277b8..9defca936 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -9,10 +9,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 735756280..75934ecd0 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // filesystemType implements vfs.FilesystemType. @@ -102,7 +102,7 @@ type inode struct { // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } // StatFS implements kernfs.Inode.StatFS. 
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 09043b572..1af0a5cbc 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/coverage", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go index b13f141a8..51f0bf3d8 100644 --- a/pkg/sentry/fsimpl/sys/kcov.go +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -17,13 +17,13 @@ package sys import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -85,11 +85,11 @@ func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallAr case linux.KCOV_DISABLE: if arg != 0 { // This arg is unused; it should be 0. 
- return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return 0, fd.kcov.DisableTrace(ctx) default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 14eb10dcd..f322d2747 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -23,12 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -74,7 +74,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } @@ -174,7 +174,7 @@ func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // Open implements kernfs.Inode.Open. 
diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD index 7ce7dc429..e6980a314 100644 --- a/pkg/sentry/fsimpl/timerfd/BUILD +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -8,6 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index cbb8b67c5..655a1c76a 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -69,7 +70,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { var buf [sizeofUint64]byte diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 341b4f904..dc8b9bfeb 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -58,6 +58,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/log", @@ -94,6 +95,7 @@ go_test( ":tmpfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/refs", "//pkg/sentry/contexttest", @@ -101,7 +103,6 @@ go_test( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 3cc63e732..2c29343c1 100644 --- 
a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/contexttest" @@ -30,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Differences from stat_benchmark: @@ -68,7 +68,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent rel = wd } else { // Need to extract the given FD. - return syserror.EBADF + return linuxerr.EBADF } // Lookup the node. @@ -146,7 +146,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { for i := 0; i < b.N; i++ { err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { @@ -341,7 +341,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { for i := 0; i < b.N; i++ { err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index e8d256495..c25494c0b 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - 
"gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -196,10 +196,10 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in case linux.SEEK_CUR: offset += fd.off default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index f0f4297ef..8b04df038 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -45,7 +46,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -70,7 +71,7 @@ afterSymlink: return d.parent, nil } if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } child, ok := dir.childMap[name] if !ok { @@ -112,7 +113,7 @@ func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) } dir, ok := d.inode.impl.(*directory) if !ok { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return dir, nil } @@ -132,7 +133,7 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) d = next } if rp.MustBeDir() && !d.inode.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -161,13 +162,13 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp 
*vfs.ResolvingPath, dir } name := rp.Component() if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } if _, ok := parentDir.childMap[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } if !dir && rp.MustBeDir() { return syserror.ENOENT @@ -220,7 +221,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.inode.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -246,12 +247,12 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) i := d.inode if i.isDir() { - return syserror.EPERM + return linuxerr.EPERM } if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err @@ -260,7 +261,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. 
return syserror.ENOENT } if i.nlink == maxLinks { - return syserror.EMLINK + return linuxerr.EMLINK } i.incLinksLocked() i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) @@ -274,7 +275,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { - return syserror.EMLINK + return linuxerr.EMLINK } parentDir.inode.incLinksLocked() // from child's ".." childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) @@ -300,7 +301,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v case linux.S_IFSOCK: childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir) default: - return syserror.EINVAL + return linuxerr.EINVAL } child := fs.newDentry(childInode) parentDir.insertChildLocked(child, name) @@ -312,7 +313,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_TMPFILE != 0 { // Not yet supported. - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } // Handle O_CREAT and !O_CREAT separately, since in the latter case we @@ -347,7 +348,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return nil, syserror.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } start.IncRef() defer start.DecRef(ctx) @@ -372,7 +373,7 @@ afterTrailingSymlink: return nil, syserror.EISDIR } if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } // Determine whether or not we need to create a file. 
child, ok := parentDir.childMap[name] @@ -401,7 +402,7 @@ afterTrailingSymlink: return fd, nil } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } // Is the file mounted over? if err := rp.CheckMount(ctx, &child.vfsd); err != nil { @@ -418,7 +419,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if rp.MustBeDir() && !child.inode.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } child.IncRef() defer child.DecRef(ctx) @@ -466,13 +467,13 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH, which is handled at the VFS layer. - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP case *namedPipe: return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } @@ -488,7 +489,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } symlink, ok := d.inode.impl.(*symlink) if !ok { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } symlink.inode.touchAtime(rp.Mount()) return symlink.target, nil @@ -506,19 +507,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if opts.Flags&^linux.RENAME_NOREPLACE != 0 { // TODO(b/145974740): Support other renameat2 flags. - return syserror.EINVAL + return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." 
{ if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -541,7 +542,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // mounted filesystem. if renamed.inode.isDir() { if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { - return syserror.EINVAL + return linuxerr.EINVAL } if oldParentDir != newParentDir { // Writability is needed to change renamed's "..". @@ -551,7 +552,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -561,7 +562,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replaced, ok := newParentDir.childMap[newName] if ok { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } replacedDir, ok := replaced.inode.impl.(*directory) if ok { @@ -569,19 +570,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.EISDIR } if len(replacedDir.childMap) != 0 { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if renamed.inode.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } } else { if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks { - return syserror.EMLINK + return linuxerr.EMLINK } } // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can @@ -646,10 +647,10 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } name := rp.Component() if name == "." 
{ - return syserror.EINVAL + return linuxerr.EINVAL } if name == ".." { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } child, ok := parentDir.childMap[name] if !ok { @@ -660,10 +661,10 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } childDir, ok := child.inode.impl.(*directory) if !ok { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if len(childDir.childMap) != 0 { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -766,7 +767,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EISDIR } if rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -806,11 +807,11 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath switch impl := d.inode.impl.(type) { case *socketFile: if impl.ep == nil { - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } return impl.ep, nil default: - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index c45bddff6..0f2ac6144 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -33,7 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -185,7 +185,7 @@ func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { // Can we grow the file? 
if rf.seals&linux.F_SEAL_GROW != 0 { rf.dataMu.Unlock() - return false, syserror.EPERM + return false, linuxerr.EPERM } // We only need to update the file size. atomic.StoreUint64(&rf.size, newSize) @@ -196,7 +196,7 @@ func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { // We are shrinking the file. First check if this is allowed. if rf.seals&linux.F_SEAL_SHRINK != 0 { rf.dataMu.Unlock() - return false, syserror.EPERM + return false, linuxerr.EPERM } // Update the file size. @@ -233,7 +233,7 @@ func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, a // Reject writable mapping if F_SEAL_WRITE is set. if rf.seals&linux.F_SEAL_WRITE != 0 && writable { - return syserror.EPERM + return linuxerr.EPERM } rf.mappings.AddMapping(ms, ar, offset, writable) @@ -366,7 +366,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs fsmetric.TmpfsReads.Increment() if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since @@ -374,7 +374,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -407,7 +407,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // final offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. 
RWF_DSYNC/RWF_SYNC can be ignored since @@ -415,7 +415,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } srclen := src.NumBytes() @@ -432,7 +432,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } if end := offset + srclen; end < offset { // Overflow. - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } srclen, err = vfs.CheckLimit(ctx, offset, srclen) @@ -476,10 +476,10 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( case linux.SEEK_END: offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.off = offset return offset, nil @@ -594,7 +594,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, // Check if seals prevent either file growth or all writes. switch { case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed - return 0, syserror.EPERM + return 0, linuxerr.EPERM case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would // normally grow the file to partially succeed up to the current EOF, @@ -615,7 +615,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, } if end <= rw.off { // Truncation would result in no data being written. 
- return 0, syserror.EPERM + return 0, linuxerr.EPERM } } @@ -684,7 +684,7 @@ exitLoop: func GetSeals(fd *vfs.FileDescription) (uint32, error) { f, ok := fd.Impl().(*regularFileFD) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.dataMu.RLock() @@ -696,7 +696,7 @@ func GetSeals(fd *vfs.FileDescription) (uint32, error) { func AddSeals(fd *vfs.FileDescription, val uint32) error { f, ok := fd.Impl().(*regularFileFD) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.mapsMu.Lock() @@ -706,13 +706,13 @@ func AddSeals(fd *vfs.FileDescription, val uint32) error { if rf.seals&linux.F_SEAL_SEAL != 0 { // Seal applied which prevents addition of any new seals. - return syserror.EPERM + return linuxerr.EPERM } // F_SEAL_WRITE can only be added if there are no active writable maps. if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { if rf.writableMappingPages > 0 { - return syserror.EBUSY + return linuxerr.EBUSY } } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 6b4367c42..f2250c025 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -138,7 +139,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootMode = linux.FileMode(mode & 07777) } @@ -149,12 +150,12 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt uid, err := strconv.ParseUint(uidStr, 10, 32) if err 
!= nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootKUID = kuid } @@ -165,18 +166,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootKGID = kgid } if len(mopts) != 0 { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -396,8 +397,8 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth } // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). - if parentDir != nil && parentDir.inode.mode&linux.S_ISGID == linux.S_ISGID { - kgid = auth.KGID(parentDir.inode.gid) + if parentDir != nil && atomic.LoadUint32(&parentDir.inode.mode)&linux.S_ISGID == linux.S_ISGID { + kgid = auth.KGID(atomic.LoadUint32(&parentDir.inode.gid)) if mode&linux.S_IFDIR == linux.S_IFDIR { mode |= linux.S_ISGID } @@ -527,7 +528,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs. 
return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM + return linuxerr.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { @@ -557,7 +558,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs. case *directory: return syserror.EISDIR default: - return syserror.EINVAL + return linuxerr.EINVAL } } if mask&linux.STATX_UID != 0 { @@ -730,7 +731,7 @@ func checkXattrName(name string) error { if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { return nil } - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index e84452421..930016a3e 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -75,6 +75,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { @@ -90,6 +91,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*d putDentrySlice(*ds) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -114,7 +116,7 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // * !rp.Done(). 
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { @@ -283,7 +285,7 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi Mode: uint32(parentStat.Mode), UID: parentStat.UID, GID: parentStat.GID, - Children: parent.childrenNames, + Children: parent.childrenList, HashAlgorithms: fs.alg.toLinuxHashAlg(), ReadOffset: int64(offset), ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())), @@ -404,6 +406,9 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry var buf bytes.Buffer d.hashMu.RLock() + + d.generateChildrenList() + params := &merkletree.VerifyParams{ Out: &buf, Tree: &fdReader, @@ -412,7 +417,7 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry Mode: uint32(stat.Mode), UID: stat.UID, GID: stat.GID, - Children: d.childrenNames, + Children: d.childrenList, HashAlgorithms: fs.alg.toLinuxHashAlg(), ReadOffset: 0, // Set read size to 0 so only the metadata is verified. @@ -680,7 +685,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving d = next } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -700,7 +705,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, d = next } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -709,7 +714,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { // Verity file system is read-only. 
if ats&vfs.MayWrite != 0 { - return syserror.EROFS + return linuxerr.EROFS } var ds *[]*dentry fs.renameMu.RLock() @@ -732,7 +737,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -759,26 +764,26 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Verity fs is read-only. if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 { - return nil, syserror.EROFS + return nil, linuxerr.EROFS } var ds *[]*dentry @@ -827,7 +832,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // Users should not open the Merkle tree files. Those are for verity fs // use only. 
if strings.Contains(d.name, merklePrefix) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats); err != nil { @@ -836,7 +841,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // Verity fs is read-only. if ats&vfs.MayWrite != 0 { - return nil, syserror.EROFS + return nil, linuxerr.EROFS } // Get the path to the target file. This is only used to provide path @@ -846,11 +851,18 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } + tmpOpts := *opts + + // Open the lowerFD with O_PATH if a symlink is opened for verity. + if tmpOpts.Flags&linux.O_NOFOLLOW != 0 && d.isSymlink() { + tmpOpts.Flags |= linux.O_PATH + } + // Open the file in the underlying file system. lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: d.lowerVD, Start: d.lowerVD, - }, opts) + }, &tmpOpts) // The file should exist, as we succeeded in finding its dentry. If it's // missing, it indicates an unexpected modification to the file system. @@ -888,7 +900,6 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // be called if a verity FD is successfully created. defer merkleReader.DecRef(ctx) - lowerFlags := lowerFD.StatusFlags() lowerFDOpts := lowerFD.Options() var merkleWriter *vfs.FileDescription var parentMerkleWriter *vfs.FileDescription @@ -941,7 +952,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf isDir: d.isDir(), } - if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { return nil, err } lowerFD.IncRef() @@ -970,19 +981,19 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // RenameAt implements vfs.FilesystemImpl.RenameAt. 
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -1022,13 +1033,13 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. @@ -1039,7 +1050,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { return nil, err } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. @@ -1077,13 +1088,13 @@ func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 
func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index c5f59d851..c5fa9855b 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -39,6 +39,7 @@ import ( "encoding/json" "fmt" "math" + "sort" "strconv" "strings" "sync/atomic" @@ -252,7 +253,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt hash, err := hex.DecodeString(encodedRootHash) if err != nil { ctx.Warningf("verity.FilesystemType.GetFilesystem: Failed to decode root hash: %v", err) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootHash = hash } @@ -270,19 +271,19 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Check for unparsed options. if len(mopts) != 0 { ctx.Warningf("verity.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Handle internal options. 
iopts, ok := opts.InternalData.(InternalFilesystemOptions) if len(lowerPathname) == 0 && !ok { ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if len(lowerPathname) != 0 { if ok { ctx.Warningf("verity.FilesystemType.GetFilesystem: unexpected verity configs with specified lower path") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } iopts = InternalFilesystemOptions{ AllowRuntimeEnable: len(rootHash) == 0, @@ -301,7 +302,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { ctx.Infof("verity.FilesystemType.GetFilesystem: lower_path %q must be absolute", lowerPathname) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } var err error mountedLowerVD, err = vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -440,7 +441,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if !d.isDir() { ctx.Warningf("verity root must be a directory") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if !fs.allowRuntimeEnable { @@ -509,6 +510,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return nil, nil, err } + d.generateChildrenList() } d.vfsd.Init(d) @@ -565,6 +567,11 @@ type dentry struct { // populated by enableVerity. childrenNames is also protected by dirMu. childrenNames map[string]struct{} + // childrenList is a complete sorted list of childrenNames. This list + // is generated when verity is enabled, or the first time the file is + // verified in non runtime enable mode. + childrenList []string + // lowerVD is the VirtualDentry in the underlying file system. It is // never modified after initialized. 
lowerVD vfs.VirtualDentry @@ -750,6 +757,17 @@ func (d *dentry) verityEnabled() bool { return !d.fs.allowRuntimeEnable || len(d.hash) != 0 } +// generateChildrenList generates a sorted childrenList from childrenNames, and +// cache it in d for hashing. +func (d *dentry) generateChildrenList() { + if len(d.childrenList) == 0 && len(d.childrenNames) != 0 { + for child := range d.childrenNames { + d.childrenList = append(d.childrenList, child) + } + sort.Strings(d.childrenList) + } +} + // getLowerAt returns the dentry in the underlying file system, which is // represented by filename relative to d. func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) { @@ -858,13 +876,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { // Verity files are read-only. - return syserror.EPERM + return linuxerr.EPERM } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if !fd.d.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } fd.mu.Lock() defer fd.mu.Unlock() @@ -922,14 +940,14 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) case linux.SEEK_END: n = int64(fd.d.size) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset > math.MaxInt64-n { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } offset += n if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.off = offset return offset, nil @@ -963,10 +981,12 @@ func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, ui return nil, 0, err } + fd.d.generateChildrenList() + params := &merkletree.GenerateParams{ TreeReader: &merkleReader, TreeWriter: &merkleWriter, - Children: fd.d.childrenNames, + Children: fd.d.childrenList, HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), Name: fd.d.name, Mode: uint32(stat.Mode), @@ -1008,7 +1028,7 @@ func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, ui default: // TODO(b/167728857): Investigate whether and how we should // enable other types of file. - return nil, 0, syserror.EINVAL + return nil, 0, linuxerr.EINVAL } hash, err := merkletree.Generate(params) return hash, uint64(params.Size), err @@ -1057,7 +1077,7 @@ func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error { // and stores its hash in its parent directory's Merkle tree. 
func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { if !fd.d.fs.allowRuntimeEnable { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } fd.d.fs.verityMu.Lock() @@ -1126,7 +1146,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hostarch.Addr) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var metadata linux.DigestMetadata @@ -1139,7 +1159,7 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hosta // enabled, in which case fd.d.hash should be set. if len(fd.d.hash) == 0 { if fd.d.fs.allowRuntimeEnable { - return 0, syserror.ENODATA + return 0, linuxerr.ENODATA } return 0, fd.d.fs.alertIntegrityViolation("Ioctl measureVerity: no hash found") } @@ -1149,7 +1169,7 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hosta return 0, err } if metadata.DigestSize < uint16(len(fd.d.hash)) { - return 0, syserror.EOVERFLOW + return 0, linuxerr.EOVERFLOW } // Populate the output digest size, since DigestSize is both input and @@ -1179,7 +1199,7 @@ func (fd *fileDescription) verityFlags(ctx context.Context, flags hostarch.Addr) t := kernel.TaskFromContext(ctx) if t == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } _, err := primitive.CopyInt32Out(t, flags, f) return 0, err @@ -1262,7 +1282,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Mode: fd.d.mode, UID: fd.d.uid, GID: fd.d.gid, - Children: fd.d.childrenNames, + Children: fd.d.childrenList, HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), ReadOffset: offset, ReadSize: dst.NumBytes(), @@ -1278,12 +1298,12 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // PWrite implements vfs.FileDescriptionImpl.PWrite. 
func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EROFS + return 0, linuxerr.EROFS } // Write implements vfs.FileDescriptionImpl.Write. func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EROFS + return 0, linuxerr.EROFS } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. @@ -1299,7 +1319,7 @@ func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapO // Check if mmap is allowed on the lower filesystem. if !opts.SentryOwnedContent { - return syserror.ENODEV + return linuxerr.ENODEV } return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } @@ -1434,7 +1454,7 @@ func (r *mmapReadSeeker) ReadAt(p []byte, off int64) (int, error) { // mapped region. readOffset := off - int64(r.Offset) if readOffset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } bs.DropFirst64(uint64(readOffset)) view := bs.TakeFirst64(uint64(len(p))) diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index 65465b814..af041bd50 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -899,7 +899,7 @@ func TestUnmodifiedSymlinkFileReadSucceeds(t *testing.T) { t.Fatalf("SymlinkAt: %v", err) } - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("openVerityAt symlink: %v", err) @@ -1034,7 +1034,7 @@ func TestDeletedSymlinkFileReadFails(t *testing.T) { t.Fatalf("SymlinkAt: %v", err) } - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("openVerityAt 
symlink: %v", err) @@ -1136,7 +1136,7 @@ func TestModifiedSymlinkFileReadFails(t *testing.T) { } // Open symlink file to get the fd for ioctl in new step. - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt symlink: %v", err) } diff --git a/pkg/sentry/hostfd/hostfd_linux.go b/pkg/sentry/hostfd/hostfd_linux.go index e103e7296..0131da22d 100644 --- a/pkg/sentry/hostfd/hostfd_linux.go +++ b/pkg/sentry/hostfd/hostfd_linux.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package hostfd // MaxReadWriteIov is the maximum permitted size of a struct iovec array in a diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 10563e3d7..e4e0dc04f 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -227,6 +227,7 @@ go_library( "//pkg/context", "//pkg/coverage", "//pkg/cpuid", + "//pkg/errors", "//pkg/errors/linuxerr", "//pkg/eventchannel", "//pkg/fspath", @@ -256,6 +257,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/msgqueue", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", @@ -301,6 +303,7 @@ go_test( deps = [ "//pkg/abi", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/contexttest", @@ -312,6 +315,5 @@ go_test( "//pkg/sentry/time", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index d100e58d7..5d86a04f3 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -27,7 +27,7 @@ import ( // +stateify savable type 
abstractEndpoint struct { ep transport.BoundEndpoint - socket refsvfs2.RefCounter + socket refsvfs2.TryRefCounter name string ns *AbstractSocketNamespace } @@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { // its backing socket. type boundEndpoint struct { transport.BoundEndpoint - socket refsvfs2.RefCounter + socket refsvfs2.TryRefCounter } // Release implements transport.BoundEndpoint.Release. @@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.TryRefCounter) error { a.mu.Lock() defer a.mu.Unlock() @@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran // Remove removes the specified socket at name from the abstract socket // namespace, if it has not yet been replaced. 
-func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) { +func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.TryRefCounter) { a.mu.Lock() defer a.mu.Unlock() diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 12180351d..7a1a36454 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -63,6 +63,7 @@ go_library( "//pkg/abi/linux", "//pkg/bits", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index 3325fedcb..fc245c54b 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -16,7 +16,7 @@ package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // Credentials contains information required to authorize privileged operations @@ -203,7 +203,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) { // uid must be mapped. kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return NoID, syserror.EINVAL + return NoID, linuxerr.EINVAL } // If c has CAP_SETUID, then it can use any UID in its user namespace. 
if c.HasCapability(linux.CAP_SETUID) { @@ -214,7 +214,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) { if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { return kuid, nil } - return NoID, syserror.EPERM + return NoID, linuxerr.EPERM } // UseGID checks that c can use gid in its user namespace, then translates it @@ -222,7 +222,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) { func (c *Credentials) UseGID(gid GID) (KGID, error) { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return NoID, syserror.EINVAL + return NoID, linuxerr.EINVAL } if c.HasCapability(linux.CAP_SETGID) { return kgid, nil @@ -230,7 +230,7 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { return kgid, nil } - return NoID, syserror.EPERM + return NoID, linuxerr.EPERM } // SetUID translates the provided uid to the root user namespace and updates c's @@ -239,7 +239,7 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { func (c *Credentials) SetUID(uid UID) error { kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } c.RealKUID = kuid c.EffectiveKUID = kuid @@ -253,7 +253,7 @@ func (c *Credentials) SetUID(uid UID) error { func (c *Credentials) SetGID(gid GID) error { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } c.RealKGID = kgid c.EffectiveKGID = kgid diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 28cbe159d..f06a374a0 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -17,7 +17,7 @@ package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. 
@@ -106,11 +106,11 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er // than once to a uid_map file in a user namespace fails with the error // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) if !ns.uidMapFromParent.IsEmpty() { - return syserror.EPERM + return linuxerr.EPERM } // "At least one line must be written to the file." if len(entries) == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // """ // In order for a process to write to the /proc/[pid]/uid_map @@ -121,12 +121,12 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er // in the user namespace of the process pid. // """ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { - return syserror.EPERM + return linuxerr.EPERM } // "2. The writing process must either be in the user namespace of the process // pid or be in the parent user namespace of the process pid." if c.UserNamespace != ns && c.UserNamespace != ns.parent { - return syserror.EPERM + return linuxerr.EPERM } // """ // 3. (see trySetUIDMap) @@ -145,14 +145,14 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er // parent user namespace to a user ID (group ID) in the user namespace. // """ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { - return syserror.EPERM + return linuxerr.EPERM } // """ // + The writing process must have the same effective user ID as the // process that created the user namespace. // """ if c.EffectiveKUID != ns.owner { - return syserror.EPERM + return linuxerr.EPERM } } // trySetUIDMap leaves data in maps if it fails. @@ -170,11 +170,11 @@ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { // checks for NoID. 
lastID := e.FirstID + e.Length if lastID <= e.FirstID { - return syserror.EINVAL + return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { - return syserror.EINVAL + return linuxerr.EINVAL } // "3. The mapped user IDs (group IDs) must in turn have a mapping in // the parent user namespace." @@ -182,14 +182,14 @@ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { // mappings when it's created, so SetUIDMap would have returned EPERM // without reaching this point if ns is root. if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { - return syserror.EPERM + return linuxerr.EPERM } // If either of these Adds fail, we have an overlapping range. if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { - return syserror.EINVAL + return linuxerr.EINVAL } if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { - return syserror.EINVAL + return linuxerr.EINVAL } } return nil @@ -202,24 +202,24 @@ func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) er ns.mu.Lock() defer ns.mu.Unlock() if !ns.gidMapFromParent.IsEmpty() { - return syserror.EPERM + return linuxerr.EPERM } if len(entries) == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { - return syserror.EPERM + return linuxerr.EPERM } if c.UserNamespace != ns && c.UserNamespace != ns.parent { - return syserror.EPERM + return linuxerr.EPERM } if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { - return syserror.EPERM + return linuxerr.EPERM } // It's correct for this to still be UID. 
if c.EffectiveKUID != ns.owner { - return syserror.EPERM + return linuxerr.EPERM } // "In the case of gid_map, use of the setgroups(2) system call must // first be denied by writing "deny" to the /proc/[pid]/setgroups file @@ -239,20 +239,20 @@ func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { for _, e := range entries { lastID := e.FirstID + e.Length if lastID <= e.FirstID { - return syserror.EINVAL + return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { - return syserror.EINVAL + return linuxerr.EINVAL } if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { - return syserror.EPERM + return linuxerr.EPERM } if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { - return syserror.EINVAL + return linuxerr.EINVAL } if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { - return syserror.EINVAL + return linuxerr.EINVAL } } return nil diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 9dd52c860..40a406f9d 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -17,8 +17,8 @@ package auth import ( "math" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // A UserNamespace represents a user namespace. See user_namespaces(7) for @@ -105,7 +105,7 @@ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { if c.UserNamespace.depth() >= maxUserNamespaceDepth { // "... Calls to unshare(2) or clone(2) that would cause this limit to // be exceeded fail with the error EUSERS." 
- user_namespaces(7) - return nil, syserror.EUSERS + return nil, linuxerr.EUSERS } // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective // user ID or the effective group ID of the caller does not have a mapping @@ -114,10 +114,10 @@ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { // process are mapped to user IDs and group IDs in the user namespace of // the calling process at the time of the call." - unshare(2) if !c.EffectiveKUID.In(c.UserNamespace).Ok() { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } if !c.EffectiveKGID.In(c.UserNamespace).Ok() { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } return &UserNamespace{ parent: c.UserNamespace, diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 6224a0cbd..6b2dd09da 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -8,12 +8,12 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 5d584dc45..473987a79 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -17,12 +17,12 @@ package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -248,7 +248,7 @@ func (a *FileAsync) Signal() linux.Signal { // to send SIGIO. 
func (a *FileAsync) SetSignal(signal linux.Signal) error { if signal != 0 && !signal.IsValid() { - return syserror.EINVAL + return linuxerr.EINVAL } a.mu.Lock() defer a.mu.Unlock() diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 9f7702fcc..eff556a0c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -108,7 +108,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { ctx := context.Background() f.initNoLeakCheck() // Initialize table. - f.fdBitmap = bitmap.BitmapWithSize(uint32(math.MaxUint16)) + f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) for fd, d := range m { if fd < 0 { panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd)) diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index c4cac6b99..2b3e6ef71 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -46,7 +46,7 @@ func (f *FDTable) initNoLeakCheck() { func (f *FDTable) init() { f.initNoLeakCheck() f.InitRefs() - f.fdBitmap = bitmap.BitmapWithSize(uint32(math.MaxUint16)) + f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) } // get gets a file entry. 
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 6c31e082c..cfdea5cf7 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -37,6 +37,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/sentry/memmap", @@ -53,8 +54,8 @@ go_test( library = ":futex", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sync", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 0427cf3f4..f5c364c96 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,6 +20,7 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" @@ -122,7 +123,7 @@ func check(t Target, addr hostarch.Addr, val uint32) error { return err } if cur != val { - return syserror.EAGAIN + return linuxerr.EAGAIN } return nil } @@ -332,7 +333,7 @@ func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { - return Key{}, syserror.EINVAL + return Key{}, linuxerr.EINVAL } if private { return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil @@ -397,8 +398,8 @@ func (m *Manager) Fork() *Manager { } // lockBucket returns a locked bucket for the given key. -func (m *Manager) lockBucket(k *Key) *bucket { - var b *bucket +// +checklocksacquire:b.mu +func (m *Manager) lockBucket(k *Key) (b *bucket) { if k.Kind == KindSharedMappable { b = m.sharedBucket } else { @@ -409,7 +410,9 @@ func (m *Manager) lockBucket(k *Key) *bucket { } // lockBuckets returns locked buckets for the given keys. 
-func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { +// +checklocksacquire:b1.mu +// +checklocksacquire:b2.mu +func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) { // Buckets must be consistently ordered to avoid circular lock // dependencies. We order buckets in m.privateBuckets by index (lowest // index first), and all buckets in m.privateBuckets precede @@ -419,8 +422,8 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { i1 := bucketIndexForAddr(k1.addr()) i2 := bucketIndexForAddr(k2.addr()) - b1 := &m.privateBuckets[i1] - b2 := &m.privateBuckets[i2] + b1 = &m.privateBuckets[i1] + b2 = &m.privateBuckets[i2] switch { case i1 < i2: b1.mu.Lock() @@ -431,19 +434,30 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { default: b1.mu.Lock() } - return b1, b2 + return b1, b2 // +checklocksforce } // At least one of b1 or b2 should be m.sharedBucket. - b1 := m.sharedBucket - b2 := m.sharedBucket + b1 = m.sharedBucket + b2 = m.sharedBucket if k1.Kind != KindSharedMappable { b1 = m.lockBucket(k1) } else if k2.Kind != KindSharedMappable { b2 = m.lockBucket(k2) } m.sharedBucket.mu.Lock() - return b1, b2 + return b1, b2 // +checklocksforce +} + +// unlockBuckets unlocks two buckets. +// +checklocksrelease:b1.mu +// +checklocksrelease:b2.mu +func (m *Manager) unlockBuckets(b1, b2 *bucket) { + b1.mu.Unlock() + if b1 != b2 { + b2.mu.Unlock() + } + return // +checklocksforce } // Wake wakes up to n waiters matching the bitmask on the given addr. 
@@ -476,10 +490,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, c defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) if checkval { if err := check(t, addr, val); err != nil { @@ -526,10 +537,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwa defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) done := 0 cond, err := atomicOp(t, addr2, op) @@ -670,7 +678,7 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint return false, err } if (cur & linux.FUTEX_TID_MASK) == tid { - return false, syserror.EDEADLK + return false, linuxerr.EDEADLK } if (cur & linux.FUTEX_TID_MASK) == 0 { @@ -745,7 +753,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu } if (cur & linux.FUTEX_TID_MASK) != tid { - return syserror.EPERM + return linuxerr.EPERM } var next *Waiter // Who's the next owner? @@ -773,7 +781,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu if prev != cur { // Let user mode handle CAS races. This is different than lock, which // retries when CAS fails. 
- return syserror.EAGAIN + return linuxerr.EAGAIN } return nil } @@ -790,7 +798,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu return err } if prev != cur { - return syserror.EINVAL + return linuxerr.EINVAL } b.wakeWaiterLocked(next) diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index deba44e5c..04c136f87 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -21,8 +21,8 @@ import ( "testing" "unsafe" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" ) @@ -488,7 +488,7 @@ func (t *testMutex) Lock() { // Wait for it to be "not locked". w := NewWaiter() err := t.m.WaitPrepare(w, t.d, t.a, true, testMutexLocked, ^uint32(0)) - if err == unix.EAGAIN { + if linuxerr.Equals(linuxerr.EAGAIN, err) { continue } if err != nil { diff --git a/pkg/sentry/kernel/ipc/BUILD b/pkg/sentry/kernel/ipc/BUILD new file mode 100644 index 000000000..e42a94e15 --- /dev/null +++ b/pkg/sentry/kernel/ipc/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "ipc", + srcs = [ + "object.go", + "registry.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/log", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + ], +) diff --git a/pkg/sentry/kernel/ipc/object.go b/pkg/sentry/kernel/ipc/object.go new file mode 100644 index 000000000..387b35e7e --- /dev/null +++ b/pkg/sentry/kernel/ipc/object.go @@ -0,0 +1,115 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ipc defines functionality and utilities common to sysvipc mechanisms. +// +// Lock ordering: [shm/semaphore/msgqueue].Registry.mu -> Mechanism +package ipc + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// Key is a user-provided identifier for IPC objects. +type Key int32 + +// ID is a kernel identifier for IPC objects. +type ID int32 + +// Object represents an abstract IPC object with fields common to all IPC +// mechanisms. +// +// +stateify savable +type Object struct { + // User namespace which owns the IPC namespace which owns the IPC object. + // Immutable. + UserNS *auth.UserNamespace + + // ID is a kernel identifier for the IPC object. Immutable. + ID ID + + // Key is a user-provided identifier for the IPC object. Immutable. + Key Key + + // Creator is the user who created the IPC object. Immutable. + Creator fs.FileOwner + + // Owner is the current owner of the IPC object. + Owner fs.FileOwner + + // Perms is the access permissions the IPC object. + Perms fs.FilePermissions +} + +// Mechanism represents a SysV mechanism that holds an IPC object. It can also +// be looked at as a container for an ipc.Object, which is by definition a fully +// functional SysV object. +type Mechanism interface { + // Lock behaves the same as Mutex.Lock on the mechanism. + Lock() + + // Unlock behaves the same as Mutex.Unlock on the mechanism. + Unlock() + + // Object returns a pointer to the mechanism's ipc.Object. 
Mechanism.Lock, + // and Mechanism.Unlock should be used when the object is used. + Object() *Object + + // Destroy destroys the mechanism. + Destroy() +} + +// NewObject returns a new, initialized ipc.Object. The newly returned object +// doesn't have a valid ID. When the object is registered, the registry assigns +// it a new unique ID. +func NewObject(un *auth.UserNamespace, key Key, creator, owner fs.FileOwner, perms fs.FilePermissions) *Object { + return &Object{ + UserNS: un, + Key: key, + Creator: creator, + Owner: owner, + Perms: perms, + } +} + +// CheckOwnership verifies whether an IPC object may be accessed using creds as +// an owner. See ipc/util.c:ipcctl_obtain_check() in Linux. +func (o *Object) CheckOwnership(creds *auth.Credentials) bool { + if o.Owner.UID == creds.EffectiveKUID || o.Creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, o.UserNS) +} + +// CheckPermissions verifies whether an IPC object is accessible using creds for +// access described by req. See ipc/util.c:ipcperms() in Linux. +func (o *Object) CheckPermissions(creds *auth.Credentials, req fs.PermMask) bool { + p := o.Perms.Other + if o.Owner.UID == creds.EffectiveKUID { + p = o.Perms.User + } else if creds.InGroup(o.Owner.GID) { + p = o.Perms.Group + } + + if p.SupersetOf(req) { + return true + } + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, o.UserNS) +} diff --git a/pkg/sentry/kernel/ipc/registry.go b/pkg/sentry/kernel/ipc/registry.go new file mode 100644 index 000000000..91de19070 --- /dev/null +++ b/pkg/sentry/kernel/ipc/registry.go @@ -0,0 +1,196 @@ +// Copyright 2021 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipc + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// Registry is similar to Object, but for registries. It represent an abstract +// SysV IPC registry with fields common to all SysV registries. Registry is not +// thread-safe, and should be protected using a mutex. +// +// +stateify savable +type Registry struct { + // UserNS owning the IPC namespace this registry belongs to. Immutable. + UserNS *auth.UserNamespace + + // objects is a map of IDs to IPC mechanisms. + objects map[ID]Mechanism + + // KeysToIDs maps a lookup key to an ID. + keysToIDs map[Key]ID + + // lastIDUsed is used to find the next available ID for object creation. + lastIDUsed ID +} + +// NewRegistry return a new, initialized ipc.Registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + UserNS: userNS, + objects: make(map[ID]Mechanism), + keysToIDs: make(map[Key]ID), + } +} + +// Find uses key to search for and return a SysV mechanism. Find returns an +// error if an object is found by shouldn't be, or if the user doesn't have +// permission to use the object. If no object is found, Find checks create +// flag, and returns an error only if it's false. 
+func (r *Registry) Find(ctx context.Context, key Key, mode linux.FileMode, create, exclusive bool) (Mechanism, error) { + if id, ok := r.keysToIDs[key]; ok { + mech := r.objects[id] + mech.Lock() + defer mech.Unlock() + + obj := mech.Object() + creds := auth.CredentialsFromContext(ctx) + if !obj.CheckPermissions(creds, fs.PermsFromMode(mode)) { + // The [calling process / user] does not have permission to access + // the set, and does not have the CAP_IPC_OWNER capability in the + // user namespace that governs its IPC namespace. + return nil, linuxerr.EACCES + } + + if create && exclusive { + // IPC_CREAT and IPC_EXCL were specified, but an object already + // exists for key. + return nil, linuxerr.EEXIST + } + return mech, nil + } + + if !create { + // No object exists for key and msgflg did not specify IPC_CREAT. + return nil, linuxerr.ENOENT + } + + return nil, nil +} + +// Register adds the given object into Registry.Objects, and assigns it a new +// ID. It returns an error if all IDs are exhausted. +func (r *Registry) Register(m Mechanism) error { + id, err := r.newID() + if err != nil { + return err + } + + obj := m.Object() + obj.ID = id + + r.objects[id] = m + r.keysToIDs[obj.Key] = id + + return nil +} + +// newID finds the first unused ID in the registry, and returns an error if +// non is found. +func (r *Registry) newID() (ID, error) { + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.objects[id] == nil { + r.lastIDUsed = id + return id, nil + } + } + + log.Warningf("ids exhausted, they may be leaking") + + // The man pages for shmget(2) mention that ENOSPC should be used if "All + // possible shared memory IDs have been taken (SHMMNI)". Other SysV + // mechanisms don't have a specific errno for running out of IDs, but they + // return ENOSPC if the max number of objects is exceeded, so we assume that + // it's the same case. 
+ return 0, linuxerr.ENOSPC +} + +// Remove removes the mechanism with the given id from the registry, and calls +// mechanism.Destroy to perform mechanism-specific removal. +func (r *Registry) Remove(id ID, creds *auth.Credentials) error { + mech := r.objects[id] + if mech == nil { + return linuxerr.EINVAL + } + + mech.Lock() + defer mech.Unlock() + + obj := mech.Object() + + // The effective user ID of the calling process must match the creator or + // owner of the [mechanism], or the caller must be privileged. + if !obj.CheckOwnership(creds) { + return linuxerr.EPERM + } + + delete(r.objects, obj.ID) + delete(r.keysToIDs, obj.Key) + mech.Destroy() + + return nil +} + +// ForAllObjects executes a given function for all given objects. +func (r *Registry) ForAllObjects(f func(o Mechanism)) { + for _, o := range r.objects { + f(o) + } +} + +// FindByID returns the mechanism with the given ID, nil if non exists. +func (r *Registry) FindByID(id ID) Mechanism { + return r.objects[id] +} + +// DissociateKey removes the association between a mechanism and its key +// (deletes it from r.keysToIDs), preventing it from being discovered by any new +// process, but not necessarily destroying it. If the given key doesn't exist, +// nothing is changed. +func (r *Registry) DissociateKey(key Key) { + delete(r.keysToIDs, key) +} + +// DissociateID removes the association between a mechanism and its ID (deletes +// it from r.objects). An ID can't be removed unless the associated key is +// removed already, this is done to prevent the users from acquiring nil a +// Mechanism. +// +// Precondition: must be preceded by a call to r.DissociateKey. +func (r *Registry) DissociateID(id ID) { + delete(r.objects, id) +} + +// ObjectCount returns the number of registered objects. +func (r *Registry) ObjectCount() int { + return len(r.objects) +} + +// LastIDUsed returns the last used ID. 
+func (r *Registry) LastIDUsed() ID { + return r.lastIDUsed +} diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 9545bb5ef..0b101b1bb 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" ) @@ -30,6 +31,7 @@ type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace + queues *msgqueue.Registry semaphores *semaphore.Registry shms *shm.Registry } @@ -38,6 +40,7 @@ type IPCNamespace struct { func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { ns := &IPCNamespace{ userNS: userNS, + queues: msgqueue.NewRegistry(userNS), semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } @@ -45,6 +48,11 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { return ns } +// MsgqueueRegistry returns the message queue registry for this namespace. +func (i *IPCNamespace) MsgqueueRegistry() *msgqueue.Registry { + return i.queues +} + // SemaphoreRegistry returns the semaphore set registry for this namespace. 
func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { return i.semaphores diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go index 4b943106b..e8a71bec1 100644 --- a/pkg/sentry/kernel/kcov.go +++ b/pkg/sentry/kernel/kcov.go @@ -22,13 +22,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov @@ -125,19 +125,19 @@ func (kcov *Kcov) InitTrace(size uint64) error { defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_DISABLED { - return syserror.EBUSY + return linuxerr.EBUSY } // To simplify all the logic around mapping, we require that the length of the // shared region is a multiple of the system page size. if (8*size)&(hostarch.PageSize-1) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // We need space for at least two uint64s to hold current position and a // single PC. if size < 2 || size > kcovAreaSizeMax { - return syserror.EINVAL + return linuxerr.EINVAL } kcov.size = size @@ -157,7 +157,7 @@ func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call. if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil { - return syserror.EINVAL + return linuxerr.EINVAL } switch traceKind { @@ -165,13 +165,13 @@ func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { kcov.mode = linux.KCOV_MODE_TRACE_PC case linux.KCOV_TRACE_CMP: // We do not support KCOV_MODE_TRACE_CMP. 
- return syserror.ENOTSUP + return linuxerr.ENOTSUP default: - return syserror.EINVAL + return linuxerr.EINVAL } if kcov.owningTask != nil && kcov.owningTask != t { - return syserror.EBUSY + return linuxerr.EBUSY } kcov.owningTask = t @@ -195,7 +195,7 @@ func (kcov *Kcov) DisableTrace(ctx context.Context) error { } if t != kcov.owningTask { - return syserror.EINVAL + return linuxerr.EINVAL } kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil @@ -237,7 +237,7 @@ func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) erro defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_INIT { - return syserror.EINVAL + return linuxerr.EINVAL } if kcov.mappable == nil { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 352c36ba9..df5160b67 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1299,11 +1299,11 @@ func (k *Kernel) WaitExited() { } // Kill requests that all tasks in k immediately exit as if group exiting with -// status es. Kill does not wait for tasks to exit. -func (k *Kernel) Kill(es ExitStatus) { +// status ws. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(ws linux.WaitStatus) { k.extMu.Lock() defer k.extMu.Unlock() - k.tasks.Kill(es) + k.tasks.Kill(ws) } // Pause requests that all tasks in k temporarily stop executing, and blocks diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go index 2e66ec587..5ffafb0d1 100644 --- a/pkg/sentry/kernel/kernel_opts.go +++ b/pkg/sentry/kernel/kernel_opts.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package kernel // SpecialOpts contains non-standard options for the kernel. 
diff --git a/pkg/sentry/kernel/msgqueue/BUILD b/pkg/sentry/kernel/msgqueue/BUILD new file mode 100644 index 000000000..5ec11e1f6 --- /dev/null +++ b/pkg/sentry/kernel/msgqueue/BUILD @@ -0,0 +1,36 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "message_list", + out = "message_list.go", + package = "msgqueue", + prefix = "msg", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Message", + "Linker": "*Message", + }, +) + +go_library( + name = "msgqueue", + srcs = [ + "message_list.go", + "msgqueue.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", + "//pkg/sentry/kernel/time", + "//pkg/sync", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/msgqueue/msgqueue.go b/pkg/sentry/kernel/msgqueue/msgqueue.go new file mode 100644 index 000000000..3ce926950 --- /dev/null +++ b/pkg/sentry/kernel/msgqueue/msgqueue.go @@ -0,0 +1,220 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package msgqueue implements System V message queues. 
+package msgqueue + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + // System-wide limit for maximum number of queues. + maxQueues = linux.MSGMNI + + // Maximum size of a queue in bytes. + maxQueueBytes = linux.MSGMNB + + // Maximum size of a message in bytes. + maxMessageBytes = linux.MSGMAX +) + +// Registry contains a set of message queues that can be referenced using keys +// or IDs. +// +// +stateify savable +type Registry struct { + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // reg defines basic fields and operations needed for all SysV registries. + reg *ipc.Registry +} + +// NewRegistry returns a new Registry ready to be used. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + reg: ipc.NewRegistry(userNS), + } +} + +// Queue represents a SysV message queue, described by sysvipc(7). +// +// +stateify savable +type Queue struct { + // registry is the registry owning this queue. Immutable. + registry *Registry + + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // dead is set to true when a queue is removed from the registry and should + // not be used. Operations on the queue should check dead, and return + // EIDRM if set to true. + dead bool + + // obj defines basic fields that should be included in all SysV IPC objects. + obj *ipc.Object + + // senders holds a queue of blocked message senders. Senders are notified + // when enough space is available in the queue to insert their message. + senders waiter.Queue + + // receivers holds a queue of blocked receivers. 
Receivers are notified + // when a new message is inserted into the queue and can be received. + receivers waiter.Queue + + // messages is a list of sent messages. + messages msgList + + // sendTime is the last time a msgsnd was performed. + sendTime ktime.Time + + // receiveTime is the last time a msgrcv was performed. + receiveTime ktime.Time + + // changeTime is the last time the queue was modified using msgctl. + changeTime ktime.Time + + // byteCount is the current number of message bytes in the queue. + byteCount uint64 + + // messageCount is the current number of messages in the queue. + messageCount uint64 + + // maxBytes is the maximum allowed number of bytes in the queue, and is also + // used as a limit for the number of total possible messages. + maxBytes uint64 + + // sendPID is the PID of the process that performed the last msgsnd. + sendPID int32 + + // receivePID is the PID of the process that performed the last msgrcv. + receivePID int32 +} + +// Message represents a message exchanged through a Queue via msgsnd(2) and +// msgrcv(2). +// +// +stateify savable +type Message struct { + msgEntry + + // mType is an integer representing the type of the sent message. + mType int64 + + // mText is an untyped block of memory. + mText []byte + + // mSize is the size of mText. + mSize uint64 +} + +// FindOrCreate creates a new message queue or returns an existing one. See +// msgget(2). +func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, mode linux.FileMode, private, create, exclusive bool) (*Queue, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + queue, err := r.reg.Find(ctx, key, mode, create, exclusive) + if err != nil { + return nil, err + } + + if queue != nil { + return queue.(*Queue), nil + } + } + + // Check system-wide limits. 
+ if r.reg.ObjectCount() >= maxQueues { + return nil, linuxerr.ENOSPC + } + + return r.newQueueLocked(ctx, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode)) +} + +// newQueueLocked creates a new queue using the given fields. An error is +// returned if there're no more available identifiers. +// +// Precondition: r.mu must be held. +func (r *Registry) newQueueLocked(ctx context.Context, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions) (*Queue, error) { + q := &Queue{ + registry: r, + obj: ipc.NewObject(r.reg.UserNS, key, creator, creator, perms), + sendTime: ktime.ZeroTime, + receiveTime: ktime.ZeroTime, + changeTime: ktime.NowFromContext(ctx), + maxBytes: maxQueueBytes, + } + + err := r.reg.Register(q) + if err != nil { + return nil, err + } + return q, nil +} + +// Remove removes the queue with the specified ID. All waiters (readers and +// writers) will be awakened and fail. Remove will return an error +// if the ID is invalid, or the user doesn't have privileges. +func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + r.reg.Remove(id, creds) + return nil +} + +// Lock implements ipc.Mechanism.Lock. +func (q *Queue) Lock() { + q.mu.Lock() +} + +// Unlock implements ipc.Mechanism.Unlock. +// +// +checklocksignore +func (q *Queue) Unlock() { + q.mu.Unlock() +} + +// Object implements ipc.Mechanism.Object. +func (q *Queue) Object() *ipc.Object { + return q.obj +} + +// Destroy implements ipc.Mechanism.Destroy. +func (q *Queue) Destroy() { + q.dead = true + + // Notify waiters. Senders and receivers will try to run, and return an + // error (EIDRM). Waiters should remove themselves from the queue after + // waking up. + q.senders.Notify(waiter.EventOut) + q.receivers.Notify(waiter.EventIn) +} + +// ID returns queue's ID. 
+func (q *Queue) ID() ipc.ID { + return q.obj.ID +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index af46b3e08..94ebac7c5 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal/primitive", "//pkg/safemem", diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 6497dc4ba..08786d704 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -17,6 +17,7 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" @@ -112,7 +113,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // read side isn't open yet. if flags.NonBlocking { w.DecRef(ctx) - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { @@ -130,10 +131,10 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi return rw, nil default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { - return syserror.EPIPE + return linuxerr.EPIPE } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 06769931a..85e3ce9f4 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -22,6 +22,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -428,18 +429,18 @@ func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) { // SetFifoSize implements fs.FifoSizer.SetFifoSize. 
func (p *Pipe) SetFifoSize(size int64) (int64, error) { if size < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if size < MinimumPipeSize { size = MinimumPipeSize // Per spec. } if size > MaximumPipeSize { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } p.mu.Lock() defer p.mu.Unlock() if size < p.size { - return 0, syserror.EBUSY + return 0, linuxerr.EBUSY } p.max = size return size, nil diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go index dd60cba24..077c5d596 100644 --- a/pkg/sentry/kernel/pipe/pipe_unsafe.go +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -23,6 +23,8 @@ import ( // concurrent calls cannot deadlock. // // Preconditions: x != y. +// +checklocksacquire:x.mu +// +checklocksacquire:y.mu func lockTwoPipes(x, y *Pipe) { // Lock the two pipes in order of increasing address. if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 3fa5d1d2f..c883a9014 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -86,7 +87,7 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) if n > 0 { p.Notify(waiter.ReadableEvents) } - if err == unix.EPIPE { + if linuxerr.Equals(linuxerr.EPIPE, err) { // If we are returning EPIPE send SIGPIPE to the task. if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil { sendSig(linux.SIGPIPE) @@ -156,6 +157,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume // // mu must be held by the caller. waitFor returns with mu held, but it will // drop mu before blocking for any reader/writers. 
+// +checklocks:mu func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { // Ideally this function would simply use a condition variable. However, the // wait needs to be interruptible via 'sleeper', so we must sychronize via a diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 95b948edb..077d5fd7f 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -17,6 +17,7 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -79,7 +80,7 @@ func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *v // Allocate implements vfs.FileDescriptionImpl.Allocate. func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error { - return syserror.ESPIPE + return linuxerr.ESPIPE } // Open opens the pipe represented by vp. @@ -90,7 +91,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s readable := vfs.MayReadFileWithOpenFlags(statusFlags) writable := vfs.MayWriteFileWithOpenFlags(statusFlags) if !readable && !writable { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fd, err := vp.newFD(mnt, vfsd, statusFlags, locks) @@ -131,7 +132,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { fd.DecRef(ctx) - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { @@ -224,7 +225,7 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { // Allocate implements vfs.FileDescriptionImpl.Allocate. 
func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.ESPIPE + return linuxerr.ESPIPE } // EventRegister implements waiter.Waitable.EventRegister. @@ -415,7 +416,7 @@ func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { // Preconditions: count > 0. func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { if dst.pipe == src.pipe { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } lockTwoPipes(dst.pipe, src.pipe) diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index d801a3d83..319754a42 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -18,8 +18,8 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // IntervalTimer represents a POSIX interval timer as described by @@ -175,7 +175,7 @@ func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux. break } if t.tg.nextTimerID == end { - return 0, syserror.EAGAIN + return 0, linuxerr.EAGAIN } } @@ -214,16 +214,16 @@ func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux. 
target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] t.tg.pidns.owner.mu.RUnlock() if !ok || target.tg != t.tg { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } it.target = target default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if sigev.Notify != linux.SIGEV_NONE { it.signo = linux.Signal(sigev.Signo) if !it.signo.IsValid() { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } it.timer = ktime.NewTimer(c, it) @@ -238,7 +238,7 @@ func (t *Task) IntervalTimerDelete(id linux.TimerID) error { defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return syserror.EINVAL + return linuxerr.EINVAL } delete(t.tg.timers, id) it.DestroyTimer() @@ -251,7 +251,7 @@ func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return linux.Itimerspec{}, syserror.EINVAL + return linux.Itimerspec{}, linuxerr.EINVAL } newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) @@ -269,7 +269,7 @@ func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return linux.Itimerspec{}, syserror.EINVAL + return linux.Itimerspec{}, linuxerr.EINVAL } tm, s := it.timer.Get() @@ -285,7 +285,7 @@ func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // By timer_create(2) invariant, either it.target == nil (in which case // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 20563f02a..079294f81 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" 
"gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -481,7 +482,7 @@ func (t *Task) ptraceTraceme() error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if t.hasTracer() { - return syserror.EPERM + return linuxerr.EPERM } if t.parent == nil { // In Linux, only init can not have a parent, and init is assumed never @@ -497,7 +498,7 @@ func (t *Task) ptraceTraceme() error { return nil } if !t.parent.canTraceLocked(t, true) { - return syserror.EPERM + return linuxerr.EPERM } if t.parent.exitState != TaskExitNone { // Fail silently, as if we were successfully attached but then @@ -513,21 +514,21 @@ func (t *Task) ptraceTraceme() error { // ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { - return syserror.EPERM + return linuxerr.EPERM } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if !t.canTraceLocked(target, true) { - return syserror.EPERM + return linuxerr.EPERM } if target.hasTracer() { - return syserror.EPERM + return linuxerr.EPERM } // Attaching to zombies and dead tasks is not permitted; the exit // notification logic relies on this. Linux allows attaching to PF_EXITING // tasks, though. if target.exitState >= TaskExitZombie { - return syserror.EPERM + return linuxerr.EPERM } if seize { if err := target.ptraceSetOptionsLocked(opts); err != nil { @@ -651,6 +652,7 @@ func (t *Task) forgetTracerLocked() { // Preconditions: // * The signal mutex must be locked. // * The caller must be running on the task goroutine. +// +checklocks:t.tg.signalHandlers.mu func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false @@ -766,14 +768,14 @@ const ( // ptraceClone is called at the end of a clone or fork syscall to check if t // should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK // stop. 
child is the new task. -func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, args *linux.CloneArgs) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() event := false - if !opts.Untraced { + if args.Flags&linux.CLONE_UNTRACED == 0 { switch kind { case ptraceCloneKindClone: if t.ptraceOpts.TraceClone { @@ -808,7 +810,7 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => // include/linux/ptrace.h:ptrace_init_task(). - if event || opts.InheritTracer { + if event || args.Flags&linux.CLONE_PTRACE != 0 { tracer := t.Tracer() if tracer != nil { child.ptraceTracer.Store(tracer) @@ -910,7 +912,7 @@ func (t *Task) ptraceExit() { return } t.tg.signalHandlers.mu.Lock() - status := t.exitStatus.Status() + status := t.exitStatus t.tg.signalHandlers.mu.Unlock() t.Debugf("Entering PTRACE_EVENT_EXIT stop") t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) @@ -938,7 +940,7 @@ func (t *Task) ptraceKill(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { - return syserror.ESRCH + return linuxerr.ESRCH } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() @@ -962,7 +964,7 @@ func (t *Task) ptraceInterrupt(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { - return syserror.ESRCH + return linuxerr.ESRCH } if !target.ptraceSeized { return syserror.EIO @@ -994,7 +996,7 @@ func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { linux.PTRACE_O_TRACEVFORK | linux.PTRACE_O_TRACEVFORKDONE) if opts&^valid != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } t.ptraceOpts = ptraceOptions{ ExitKill: 
opts&linux.PTRACE_O_EXITKILL != 0, @@ -1020,7 +1022,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // specified by pid. target := t.tg.pidns.TaskWithID(pid) if target == nil { - return syserror.ESRCH + return linuxerr.ESRCH } // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already @@ -1045,7 +1047,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() if target.Tracer() != t { t.tg.pidns.owner.mu.RUnlock() - return syserror.ESRCH + return linuxerr.ESRCH } if !target.ptraceFreeze() { t.tg.pidns.owner.mu.RUnlock() @@ -1053,7 +1055,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - // ptrace(2) - return syserror.ESRCH + return linuxerr.ESRCH } t.tg.pidns.owner.mu.RUnlock() // Even if the target has a ptrace-stop active, the tracee's task goroutine @@ -1221,7 +1223,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { - return syserror.EINVAL + return linuxerr.EINVAL } _, err := target.ptraceSiginfo.CopyOut(t, data) return err @@ -1234,14 +1236,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { - return syserror.EINVAL + return linuxerr.EINVAL } target.ptraceSiginfo = &info return nil case linux.PTRACE_GETSIGMASK: if addr != linux.SignalSetSize { - return syserror.EINVAL + return linuxerr.EINVAL } mask := target.SignalMask() _, err := mask.CopyOut(t, data) @@ -1249,7 +1251,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { case linux.PTRACE_SETSIGMASK: if addr != linux.SignalSetSize { - return 
syserror.EINVAL + return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, data); err != nil { diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 5ae05b5c3..63422e155 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kernel diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 46dd84cbc..27514d67b 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kernel diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 4bc5bca44..de352f4f2 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -18,9 +18,9 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -59,23 +59,23 @@ func (t *Task) RSeqAvailable() bool { func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr != 0 { if t.rseqAddr != addr { - return syserror.EINVAL + return linuxerr.EINVAL } if t.rseqSignature != signature { - return syserror.EINVAL + return linuxerr.EINVAL } - return syserror.EBUSY + return linuxerr.EBUSY } // rseq must be aligned and correctly sized. 
if addr&(linux.AlignOfRSeq-1) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if length != linux.SizeOfRSeq { - return syserror.EINVAL + return linuxerr.EINVAL } if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { - return syserror.EFAULT + return linuxerr.EFAULT } t.rseqAddr = addr @@ -92,7 +92,7 @@ func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) - return syserror.EFAULT + return linuxerr.EFAULT } return nil @@ -103,16 +103,16 @@ func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { // Preconditions: The caller must be running on the task goroutine. func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if t.rseqAddr != addr { - return syserror.EINVAL + return linuxerr.EINVAL } if length != linux.SizeOfRSeq { - return syserror.EINVAL + return linuxerr.EINVAL } if t.rseqSignature != signature { - return syserror.EPERM + return linuxerr.EPERM } if err := t.rseqClearCPU(); err != nil { @@ -152,10 +152,10 @@ func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { return nil } if r.CriticalSection.Start >= r.CriticalSection.End { - return syserror.EINVAL + return linuxerr.EINVAL } if r.CriticalSection.Contains(r.Restart) { - return syserror.EINVAL + return linuxerr.EINVAL } // TODO(jamieliu): check that r.CriticalSection and r.Restart are in // the application address range, for consistency with Linux. @@ -187,7 +187,7 @@ func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { // unfortunate, but unlikely in a correct program. 
if err := t.rseqUpdateCPU(); err != nil { t.oldRSeqCPUAddr = 0 - return syserror.EINVAL // yes, EINVAL, not err or EFAULT + return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT } return nil } diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 65e5427c1..2ae08ed12 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -25,9 +25,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", - "//pkg/log", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/time", "//pkg/sync", "//pkg/syserror", @@ -40,10 +41,11 @@ go_test( srcs = ["semaphore_test.go"], library = ":semaphore", deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/sentry/contexttest", - "//pkg/sentry/kernel/auth", - "//pkg/syserror", + "//pkg/abi/linux", # keep + "//pkg/context", # keep + "//pkg/sentry/contexttest", # keep + "//pkg/sentry/kernel/auth", # keep + "//pkg/sentry/kernel/ipc", # keep + "//pkg/syserror", # keep ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 47bb66b42..8610d3fc1 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -20,9 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -46,15 +47,15 @@ const ( // // +stateify savable type Registry struct { - // userNS owning the ipc name this registry belongs to. Immutable. - userNS *auth.UserNamespace // mu protects all fields below. 
- mu sync.Mutex `state:"nosave"` - semaphores map[int32]*Set - lastIDUsed int32 + mu sync.Mutex `state:"nosave"` + + // reg defines basic fields and operations needed for all SysV registries. + reg *ipc.Registry + // indexes maintains a mapping between a set's index in virtual array and // its identifier. - indexes map[int32]int32 + indexes map[int32]ipc.ID } // Set represents a set of semaphores that can be operated atomically. @@ -64,19 +65,11 @@ type Set struct { // registry owning this sem set. Immutable. registry *Registry - // Id is a handle that identifies the set. - ID int32 - - // key is an user provided key that can be shared between processes. - key int32 + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` - // creator is the user that created the set. Immutable. - creator fs.FileOwner + obj *ipc.Object - // mu protects all fields below. - mu sync.Mutex `state:"nosave"` - owner fs.FileOwner - perms fs.FilePermissions opTime ktime.Time changeTime ktime.Time @@ -114,9 +107,8 @@ type waiter struct { // NewRegistry creates a new semaphore set registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ - userNS: userNS, - semaphores: make(map[int32]*Set), - indexes: make(map[int32]int32), + reg: ipc.NewRegistry(userNS), + indexes: make(map[int32]ipc.ID), } } @@ -125,52 +117,40 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { // a new set is always created. If create is false, it fails if a set cannot // be found. If exclusive is true, it fails if a set with the same key already // exists. 
-func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { +func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { if nsems < 0 || nsems > semsMax { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() if !private { - // Look up an existing semaphore. - if set := r.findByKey(key); set != nil { - set.mu.Lock() - defer set.mu.Unlock() - - // Check that caller can access semaphore set. - creds := auth.CredentialsFromContext(ctx) - if !set.checkPerms(creds, fs.PermsFromMode(mode)) { - return nil, syserror.EACCES - } + set, err := r.reg.Find(ctx, key, mode, create, exclusive) + if err != nil { + return nil, err + } - // Validate parameters. + // Validate semaphore-specific parameters. + if set != nil { + set := set.(*Set) if nsems > int32(set.Size()) { - return nil, syserror.EINVAL - } - if create && exclusive { - return nil, syserror.EEXIST + return nil, linuxerr.EINVAL } return set, nil } - - if !create { - // Semaphore not found and should not be created. - return nil, syserror.ENOENT - } } // Zero is only valid if an existing set is found. if nsems == 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // Apply system limits. // - // Map semaphores and map indexes in a registry are of the same size, - // check map semaphores only here for the system limit. - if len(r.semaphores) >= setsMax { + // Map reg.objects and map indexes in a registry are of the same size, + // check map reg.objects only here for the system limit. + if r.reg.ObjectCount() >= setsMax { return nil, syserror.ENOSPC } if r.totalSems() > int(semsTotalMax-nsems) { @@ -178,9 +158,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu } // Finally create a new set. 
- owner := fs.FileOwnerFromContext(ctx) - perms := fs.FilePermsFromMode(mode) - return r.newSet(ctx, key, owner, owner, perms, nsems) + return r.newSetLocked(ctx, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), nsems) } // IPCInfo returns information about system-wide semaphore limits and parameters. @@ -207,7 +185,7 @@ func (r *Registry) SemInfo() *linux.SemInfo { defer r.mu.Unlock() info := r.IPCInfo() - info.SemUsz = uint32(len(r.semaphores)) + info.SemUsz = uint32(r.reg.ObjectCount()) info.SemAem = uint32(r.totalSems()) return info @@ -230,77 +208,59 @@ func (r *Registry) HighestIndex() int32 { return highestIndex } -// RemoveID removes set with give 'id' from the registry and marks the set as +// Remove removes set with give 'id' from the registry and marks the set as // dead. All waiters will be awakened and fail. -func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { +func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error { r.mu.Lock() defer r.mu.Unlock() - set := r.semaphores[id] - if set == nil { - return syserror.EINVAL - } index, found := r.findIndexByID(id) if !found { - // Inconsistent state. - panic(fmt.Sprintf("unable to find an index for ID: %d", id)) + return linuxerr.EINVAL } + delete(r.indexes, index) - set.mu.Lock() - defer set.mu.Unlock() - - // "The effective user ID of the calling process must match the creator or - // owner of the semaphore set, or the caller must be privileged." - if !set.checkCredentials(creds) && !set.checkCapability(creds) { - return syserror.EACCES - } + r.reg.Remove(id, creds) - delete(r.semaphores, set.ID) - delete(r.indexes, index) - set.destroy() return nil } -func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { +// newSetLocked creates a new Set using given fields. An error is returned if there +// are no more available identifiers. +// +// Precondition: r.mu must be held. 
+func (r *Registry) newSetLocked(ctx context.Context, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { set := &Set{ registry: r, - key: key, - owner: owner, - creator: owner, - perms: perms, + obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, perms), changeTime: ktime.NowFromContext(ctx), sems: make([]sem, nsems), } - // Find the next available ID. - for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { - // Handle wrap around. - if id < 0 { - id = 0 - continue - } - if r.semaphores[id] == nil { - index, found := r.findFirstAvailableIndex() - if !found { - panic("unable to find an available index") - } - r.indexes[index] = id - r.lastIDUsed = id - r.semaphores[id] = set - set.ID = id - return set, nil - } + err := r.reg.Register(set) + if err != nil { + return nil, err + } + + index, found := r.findFirstAvailableIndex() + if !found { + // See linux, ipc/sem.c:newary(). + return nil, linuxerr.ENOSPC } + r.indexes[index] = set.obj.ID - log.Warningf("Semaphore map is full, they must be leaking") - return nil, syserror.ENOMEM + return set, nil } // FindByID looks up a set given an ID. -func (r *Registry) FindByID(id int32) *Set { +func (r *Registry) FindByID(id ipc.ID) *Set { r.mu.Lock() defer r.mu.Unlock() - return r.semaphores[id] + mech := r.reg.FindByID(id) + if mech == nil { + return nil + } + return mech.(*Set) } // FindByIndex looks up a set given an index. 
@@ -312,19 +272,10 @@ func (r *Registry) FindByIndex(index int32) *Set { if !present { return nil } - return r.semaphores[id] + return r.reg.FindByID(id).(*Set) } -func (r *Registry) findByKey(key int32) *Set { - for _, v := range r.semaphores { - if v.key == key { - return v - } - } - return nil -} - -func (r *Registry) findIndexByID(id int32) (int32, bool) { +func (r *Registry) findIndexByID(id ipc.ID) (int32, bool) { for k, v := range r.indexes { if v == id { return k, true @@ -344,12 +295,36 @@ func (r *Registry) findFirstAvailableIndex() (int32, bool) { func (r *Registry) totalSems() int { totalSems := 0 - for _, v := range r.semaphores { - totalSems += v.Size() - } + r.reg.ForAllObjects( + func(o ipc.Mechanism) { + totalSems += o.(*Set).Size() + }, + ) return totalSems } +// ID returns semaphore's ID. +func (s *Set) ID() ipc.ID { + return s.obj.ID +} + +// Object implements ipc.Mechanism.Object. +func (s *Set) Object() *ipc.Object { + return s.obj +} + +// Lock implements ipc.Mechanism.Lock. +func (s *Set) Lock() { + s.mu.Lock() +} + +// Unlock implements ipc.mechanism.Unlock. +// +// +checklocksignore +func (s *Set) Unlock() { + s.mu.Unlock() +} + func (s *Set) findSem(num int32) *sem { if num < 0 || int(num) >= s.Size() { return nil @@ -369,12 +344,12 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File // "The effective UID of the calling process must match the owner or creator // of the semaphore set, or the caller must be privileged." 
- if !s.checkCredentials(creds) && !s.checkCapability(creds) { - return syserror.EACCES + if !s.obj.CheckOwnership(creds) { + return linuxerr.EACCES } - s.owner = owner - s.perms = perms + s.obj.Owner = owner + s.obj.Perms = perms s.changeTime = ktime.NowFromContext(ctx) return nil } @@ -394,18 +369,18 @@ func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.Sem s.mu.Lock() defer s.mu.Unlock() - if !s.checkPerms(creds, permMask) { - return nil, syserror.EACCES + if !s.obj.CheckPermissions(creds, permMask) { + return nil, linuxerr.EACCES } return &linux.SemidDS{ SemPerm: linux.IPCPerm{ - Key: uint32(s.key), - UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), - GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), - CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), - CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), - Mode: uint16(s.perms.LinuxMode()), + Key: uint32(s.obj.Key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Creator.GID)), + Mode: uint16(s.obj.Perms.LinuxMode()), Seq: 0, // IPC sequence not supported. }, SemOTime: s.opTime.TimeT(), @@ -417,20 +392,20 @@ func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.Sem // SetVal overrides a semaphore value, waking up waiters as needed. func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { - return syserror.ERANGE + return linuxerr.ERANGE } s.mu.Lock() defer s.mu.Unlock() // "The calling process must have alter permission on the semaphore set." 
- if !s.checkPerms(creds, fs.PermMask{Write: true}) { - return syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Write: true}) { + return linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return syserror.ERANGE + return linuxerr.ERANGE } // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. @@ -452,7 +427,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for _, val := range vals { if val > valueMax { - return syserror.ERANGE + return linuxerr.ERANGE } } @@ -460,8 +435,8 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti defer s.mu.Unlock() // "The calling process must have alter permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Write: true}) { - return syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Write: true}) { + return linuxerr.EACCES } for i, val := range vals { @@ -482,13 +457,13 @@ func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return sem.value, nil } @@ -499,8 +474,8 @@ func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return nil, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return nil, linuxerr.EACCES } vals := make([]uint16, s.Size()) @@ -516,13 +491,13 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." 
- if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return sem.pid, nil } @@ -532,13 +507,13 @@ func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *wait defer s.mu.Unlock() // The calling process must have read permission on the semaphore set. - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } var cnt uint16 for w := sem.waiters.Front(); w != nil; w = w.Next() { @@ -581,15 +556,15 @@ func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Cr readOnly := true for _, op := range ops { if s.findSem(int32(op.SemNum)) == nil { - return nil, 0, syserror.EFBIG + return nil, 0, linuxerr.EFBIG } if op.SemOp != 0 { readOnly = false } } - if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { - return nil, 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, linuxerr.EACCES } ch, num, err := s.executeOps(ctx, ops, pid) @@ -624,7 +599,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch if op.SemOp < 0 { // Handle 'wait' operation. if -op.SemOp > valueMax { - return nil, 0, syserror.ERANGE + return nil, 0, linuxerr.ERANGE } if -op.SemOp > tmpVals[op.SemNum] { // Not enough resources, must wait. @@ -639,7 +614,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } else { // op.SemOp > 0: Handle 'signal' operation. 
if tmpVals[op.SemNum] > valueMax-op.SemOp { - return nil, 0, syserror.ERANGE + return nil, 0, linuxerr.ERANGE } } @@ -674,38 +649,10 @@ func (s *Set) AbortWait(num int32, ch chan struct{}) { // Waiter may not be found in case it raced with wakeWaiters(). } -func (s *Set) checkCredentials(creds *auth.Credentials) bool { - return s.owner.UID == creds.EffectiveKUID || - s.owner.GID == creds.EffectiveKGID || - s.creator.UID == creds.EffectiveKUID || - s.creator.GID == creds.EffectiveKGID -} - -func (s *Set) checkCapability(creds *auth.Credentials) bool { - return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() -} - -func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { - // Are we owner, or in group, or other? - p := s.perms.Other - if s.owner.UID == creds.EffectiveKUID { - p = s.perms.User - } else if creds.InGroup(s.owner.GID) { - p = s.perms.Group - } - - // Are permissions satisfied without capability checks? - if p.SupersetOf(reqPerms) { - return true - } - - return s.checkCapability(creds) -} - -// destroy destroys the set. +// Destroy implements ipc.Mechanism.Destroy. // // Preconditions: Caller must hold 's.mu'. -func (s *Set) destroy() { +func (s *Set) Destroy() { // Notify all waiters. They will fail on the next attempt to execute // operations and return error. 
s.dead = true diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index e47acefdf..2e4ab8121 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/syserror" ) @@ -55,7 +56,7 @@ func signalled(ch chan struct{}) bool { func TestBasic(t *testing.T) { ctx := contexttest.Context(t) - set := &Set{ID: 123, sems: make([]sem, 1)} + set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)} ops := []linux.Sembuf{ {SemOp: 1}, } @@ -76,7 +77,7 @@ func TestBasic(t *testing.T) { func TestWaitForZero(t *testing.T) { ctx := contexttest.Context(t) - set := &Set{ID: 123, sems: make([]sem, 1)} + set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)} ops := []linux.Sembuf{ {SemOp: 0}, } @@ -115,7 +116,7 @@ func TestWaitForZero(t *testing.T) { func TestNoWait(t *testing.T) { ctx := contexttest.Context(t) - set := &Set{ID: 123, sems: make([]sem, 1)} + set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)} ops := []linux.Sembuf{ {SemOp: 1}, } @@ -138,11 +139,12 @@ func TestUnregister(t *testing.T) { ctx := contexttest.Context(t) r := NewRegistry(auth.NewRootUserNamespace()) set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { t.Fatalf("FindOrCreate() failed, err: %v", err) } - if got := r.FindByID(set.ID); got.ID != set.ID { - t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + if got := r.FindByID(set.obj.ID); got.obj.ID != set.obj.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.obj.ID, got, set) } ops := []linux.Sembuf{ @@ -155,14 +157,14 @@ func TestUnregister(t *testing.T) { } creds := auth.CredentialsFromContext(ctx) - if err := r.RemoveID(set.ID, creds); err != 
nil { - t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + if err := r.Remove(set.obj.ID, creds); err != nil { + t.Fatalf("Remove(%d) failed, err: %v", set.obj.ID, err) } if !set.dead { t.Fatalf("set is not dead: %+v", set) } - if got := r.FindByID(set.ID); got != nil { - t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got) + if got := r.FindByID(set.obj.ID); got != nil { + t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.obj.ID, got) } for i, ch := range chs { if !signalled(ch) { diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 973d708a3..f9f872522 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,7 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // SessionID is the public identifier. @@ -120,8 +120,9 @@ func (pg *ProcessGroup) Originator() *ThreadGroup { // IsOrphan returns true if this process group is an orphan. func (pg *ProcessGroup) IsOrphan() bool { - pg.originator.TaskSet().mu.RLock() - defer pg.originator.TaskSet().mu.RUnlock() + ts := pg.originator.TaskSet() + ts.mu.RLock() + defer ts.mu.RUnlock() return pg.ancestors == 0 } @@ -277,14 +278,14 @@ func (tg *ThreadGroup) createSession() error { continue } if s.leader == tg { - return syserror.EPERM + return linuxerr.EPERM } if s.id == SessionID(id) { - return syserror.EPERM + return linuxerr.EPERM } for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { if pg.id == ProcessGroupID(id) { - return syserror.EPERM + return linuxerr.EPERM } } } @@ -371,7 +372,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // Check whether a process still exists or not. if id == 0 { - return syserror.ESRCH + return linuxerr.ESRCH } // Per above, check for a Session leader or existing group. 
@@ -380,11 +381,11 @@ func (tg *ThreadGroup) CreateProcessGroup() error { continue } if s.leader == tg { - return syserror.EPERM + return linuxerr.EPERM } for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { if pg.id == ProcessGroupID(id) { - return syserror.EPERM + return linuxerr.EPERM } } } @@ -442,17 +443,17 @@ func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID // Lookup the ProcessGroup. pg := pidns.processGroups[pgid] if pg == nil { - return syserror.EPERM + return linuxerr.EPERM } // Disallow the join if an execve has performed, per POSIX. if checkExec && tg.execed { - return syserror.EACCES + return linuxerr.EACCES } // See if it's in the same session as ours. if pg.session != tg.processGroup.session { - return syserror.EPERM + return linuxerr.EPERM } // Join the group; adjust children. diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 1c3c0794f..4e8deac4c 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -28,6 +28,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/refs", @@ -35,6 +36,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index a73f1bdca..2abf467d7 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -38,10 +38,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ 
-50,12 +52,6 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// Key represents a shm segment key. Analogous to a file name. -type Key int32 - -// ID represents the opaque handle for a shm segment. Analogous to an fd. -type ID int32 - // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. @@ -68,50 +64,51 @@ type Registry struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` - // shms maps segment ids to segments. + // reg defines basic fields and operations needed for all SysV registries. // - // shms holds all referenced segments, which are removed on the last + // Withing reg, there are two maps, Objects and KeysToIDs. + // + // reg.objects holds all referenced segments, which are removed on the last // DecRef. Thus, it cannot itself hold a reference on the Shm. // // Since removal only occurs after the last (unlocked) DecRef, there // exists a short window during which a Shm still exists in Shm, but is // unreferenced. Users must use TryIncRef to determine if the Shm is // still valid. - shms map[ID]*Shm - - // keysToShms maps segment keys to segments. // - // Shms in keysToShms are guaranteed to be referenced, as they are + // keysToIDs maps segment keys to IDs. + // + // Shms in keysToIDs are guaranteed to be referenced, as they are // removed by disassociateKey before the last DecRef. - keysToShms map[Key]*Shm + reg *ipc.Registry // Sum of the sizes of all existing segments rounded up to page size, in // units of page size. totalPages uint64 - - // ID assigned to the last created segment. Used to quickly find the next - // unused ID. - lastIDUsed ID } // NewRegistry creates a new shm registry. 
func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ - userNS: userNS, - shms: make(map[ID]*Shm), - keysToShms: make(map[Key]*Shm), + userNS: userNS, + reg: ipc.NewRegistry(userNS), } } // FindByID looks up a segment given an ID. // // FindByID returns a reference on Shm. -func (r *Registry) FindByID(id ID) *Shm { +func (r *Registry) FindByID(id ipc.ID) *Shm { r.mu.Lock() defer r.mu.Unlock() - s := r.shms[id] + mech := r.reg.FindByID(id) + if mech == nil { + return nil + } + s := mech.(*Shm) + // Take a reference on s. If TryIncRef fails, s has reached the last - // DecRef, but hasn't quite been removed from r.shms yet. + // DecRef, but hasn't quite been removed from r.reg.objects yet. if s != nil && s.TryIncRef() { return s } @@ -128,9 +125,9 @@ func (r *Registry) dissociateKey(s *Shm) { defer r.mu.Unlock() s.mu.Lock() defer s.mu.Unlock() - if s.key != linux.IPC_PRIVATE { - delete(r.keysToShms, s.key) - s.key = linux.IPC_PRIVATE + if s.obj.Key != linux.IPC_PRIVATE { + r.reg.DissociateKey(s.obj.Key) + s.obj.Key = linux.IPC_PRIVATE } } @@ -138,69 +135,49 @@ func (r *Registry) dissociateKey(s *Shm) { // analogous to open(2). // // FindOrCreate returns a reference on Shm. -func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { +func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) // // Note that 'private' always implies the creation of a new segment // whether IPC_CREAT is specified or not. 
- return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() - if len(r.shms) >= linux.SHMMNI { + if r.reg.ObjectCount() >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) return nil, syserror.ENOSPC } if !private { - // Look up an existing segment. - if shm := r.keysToShms[key]; shm != nil { - shm.mu.Lock() - defer shm.mu.Unlock() - - // Check that caller can access the segment. - if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { - // "The user does not have permission to access the shared - // memory segment, and does not have the CAP_IPC_OWNER - // capability in the user namespace that governs its IPC - // namespace." - man shmget(2) - return nil, syserror.EACCES - } + shm, err := r.reg.Find(ctx, key, mode, create, exclusive) + if err != nil { + return nil, err + } + // Validate shm-specific parameters. + if shm != nil { + shm := shm.(*Shm) if size > shm.size { // "A segment for the given key exists, but size is greater than // the size of that segment." - man shmget(2) - return nil, syserror.EINVAL - } - - if create && exclusive { - // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a - // shared memory segment already exists for key." - // - man shmget(2) - return nil, syserror.EEXIST + return nil, linuxerr.EINVAL } - shm.IncRef() return shm, nil } - - if !create { - // "No segment exists for the given key, and IPC_CREAT was not - // specified." - man shmget(2) - return nil, syserror.ENOENT - } } var sizeAligned uint64 if val, ok := hostarch.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { @@ -211,9 +188,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui } // Need to create a new segment. 
- creator := fs.FileOwnerFromContext(ctx) - perms := fs.FilePermsFromMode(mode) - s, err := r.newShm(ctx, pid, key, creator, perms, size) + s, err := r.newShmLocked(ctx, pid, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), size) if err != nil { return nil, err } @@ -223,10 +198,10 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui return s, nil } -// newShm creates a new segment in the registry. +// newShmLocked creates a new segment in the registry. // // Precondition: Caller must hold r.mu. -func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { +func (r *Registry) newShmLocked(ctx context.Context, pid int32, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) if mfp == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) @@ -241,40 +216,21 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi shm := &Shm{ mfp: mfp, registry: r, - creator: creator, size: size, effectiveSize: effectiveSize, + obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, perms), fr: fr, - key: key, - perms: perms, - owner: creator, creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } shm.InitRefs() - // Find the next available ID. - for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { - // Handle wrap around. 
- if id < 0 { - id = 0 - continue - } - if r.shms[id] == nil { - r.lastIDUsed = id - - shm.ID = id - r.shms[id] = shm - r.keysToShms[key] = shm - - r.totalPages += effectiveSize / hostarch.PageSize - - return shm, nil - } + if err := r.reg.Register(shm); err != nil { + return nil, err } + r.totalPages += effectiveSize / hostarch.PageSize - log.Warningf("Shm ids exhuasted, they may be leaking") - return nil, syserror.ENOSPC + return shm, nil } // IPCInfo reports global parameters for sysv shared memory segments on this @@ -296,7 +252,7 @@ func (r *Registry) ShmInfo() *linux.ShmInfo { defer r.mu.Unlock() return &linux.ShmInfo{ - UsedIDs: int32(r.lastIDUsed), + UsedIDs: int32(r.reg.LastIDUsed()), ShmTot: r.totalPages, ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. ShmSwp: 0, // No reclaim at the moment. @@ -313,11 +269,11 @@ func (r *Registry) remove(s *Shm) { s.mu.Lock() defer s.mu.Unlock() - if s.key != linux.IPC_PRIVATE { + if s.obj.Key != linux.IPC_PRIVATE { panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) } - delete(r.shms, s.ID) + r.reg.DissociateID(s.obj.ID) r.totalPages -= s.effectiveSize / hostarch.PageSize } @@ -329,13 +285,16 @@ func (r *Registry) Release(ctx context.Context) { // the IPC namespace containing it has no more references. toRelease := make([]*Shm, 0) r.mu.Lock() - for _, s := range r.keysToShms { - s.mu.Lock() - if !s.pendingDestruction { - toRelease = append(toRelease, s) - } - s.mu.Unlock() - } + r.reg.ForAllObjects( + func(o ipc.Mechanism) { + s := o.(*Shm) + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + }, + ) r.mu.Unlock() for _, s := range toRelease { @@ -373,12 +332,6 @@ type Shm struct { // registry points to the shm registry containing this segment. Immutable. registry *Registry - // ID is the kernel identifier for this segment. Immutable. 
- ID ID - - // creator is the user that created the segment. Immutable. - creator fs.FileOwner - // size is the requested size of the segment at creation, in // bytes. Immutable. size uint64 @@ -396,14 +349,8 @@ type Shm struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` - // key is the public identifier for this segment. - key Key - - // perms is the access permissions for the segment. - perms fs.FilePermissions + obj *ipc.Object - // owner of this segment. - owner fs.FileOwner // attachTime is updated on every successful shmat. attachTime ktime.Time // detachTime is updated on every successful shmdt. @@ -425,17 +372,44 @@ type Shm struct { pendingDestruction bool } +// ID returns object's ID. +func (s *Shm) ID() ipc.ID { + return s.obj.ID +} + +// Object implements ipc.Mechanism.Object. +func (s *Shm) Object() *ipc.Object { + return s.obj +} + +// Destroy implements ipc.Mechanism.Destroy. No work is performed on shm.Destroy +// because a different removal mechanism is used in shm. See Shm.MarkDestroyed. +func (s *Shm) Destroy() { +} + +// Lock implements ipc.Mechanism.Lock. +func (s *Shm) Lock() { + s.mu.Lock() +} + +// Unlock implements ipc.mechanism.Unlock. +// +// +checklocksignore +func (s *Shm) Unlock() { + s.mu.Unlock() +} + // Precondition: Caller must hold s.mu. func (s *Shm) debugLocked() string { return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", - s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) + s.obj.ID, s.obj.Key, s.size, s.ReadRefs(), s.pendingDestruction) } // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { s.mu.Lock() defer s.mu.Unlock() - return fmt.Sprintf("SYSV%08d", s.key) + return fmt.Sprintf("SYSV%08d", s.obj.Key) } // DeviceID implements memmap.MappingIdentity.DeviceID. @@ -447,7 +421,7 @@ func (s *Shm) DeviceID() uint64 { func (s *Shm) InodeID() uint64 { // "shmid gets reported as "inode#" in /proc/pid/maps. 
proc-ps tools use // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() - return uint64(s.ID) + return uint64(s.obj.ID) } // DecRef drops a reference on s. @@ -511,7 +485,7 @@ func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { - err = &memmap.BusError{syserror.EFAULT} + err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ @@ -550,7 +524,8 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta return memmap.MMapOpts{}, syserror.EIDRM } - if !s.checkPermissions(ctx, fs.PermMask{ + creds := auth.CredentialsFromContext(ctx) + if !s.obj.CheckPermissions(creds, fs.PermMask{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, @@ -558,7 +533,7 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta // "The calling process does not have the required permissions for the // requested attach type, and does not have the CAP_IPC_OWNER capability // in the user namespace that governs its IPC namespace." - man shmat(2) - return memmap.MMapOpts{}, syserror.EACCES + return memmap.MMapOpts{}, linuxerr.EACCES } return memmap.MMapOpts{ Length: s.size, @@ -590,19 +565,19 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { // "The caller must have read permission on the shared memory segment." 
// - man shmctl(2) - if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + creds := auth.CredentialsFromContext(ctx) + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow // read access for shmid, and the calling process does not have the // CAP_IPC_OWNER capability in the user namespace that governs its IPC // namespace." - man shmctl(2) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } var mode uint16 if s.pendingDestruction { mode |= linux.SHM_DEST } - creds := auth.CredentialsFromContext(ctx) // Use the reference count as a rudimentary count of the number of // attaches. We exclude: @@ -619,12 +594,12 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { ds := &linux.ShmidDS{ ShmPerm: linux.IPCPerm{ - Key: uint32(s.key), - UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), - GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), - CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), - CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), - Mode: mode | uint16(s.perms.LinuxMode()), + Key: uint32(s.obj.Key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Creator.GID)), + Mode: mode | uint16(s.obj.Perms.LinuxMode()), Seq: 0, // IPC sequences not supported. 
}, ShmSegsz: s.size, @@ -644,24 +619,24 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { s.mu.Lock() defer s.mu.Unlock() - if !s.checkOwnership(ctx) { - return syserror.EPERM + creds := auth.CredentialsFromContext(ctx) + if !s.obj.CheckOwnership(creds) { + return linuxerr.EPERM } - creds := auth.CredentialsFromContext(ctx) uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) if !uid.Ok() || !gid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // User may only modify the lower 9 bits of the mode. All the other bits are // always 0 for the underlying inode. mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) - s.perms = fs.FilePermsFromMode(mode) + s.obj.Perms = fs.FilePermsFromMode(mode) - s.owner.UID = uid - s.owner.GID = gid + s.obj.Owner.UID = uid + s.obj.Owner.GID = gid s.changeTime = ktime.NowFromContext(ctx) return nil @@ -690,40 +665,3 @@ func (s *Shm) MarkDestroyed(ctx context.Context) { s.DecRef(ctx) return } - -// checkOwnership verifies whether a segment may be accessed by ctx as an -// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. -// -// Precondition: Caller must hold s.mu. -func (s *Shm) checkOwnership(ctx context.Context) bool { - creds := auth.CredentialsFromContext(ctx) - if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { - return true - } - - // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux - // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented - // for use to "override IPC ownership checks". - return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) -} - -// checkPermissions verifies whether a segment is accessible by ctx for access -// described by req. See ipc/util.c:ipcperms() in Linux. -// -// Precondition: Caller must hold s.mu. 
-func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { - creds := auth.CredentialsFromContext(ctx) - - p := s.perms.Other - if s.owner.UID == creds.EffectiveKUID { - p = s.perms.User - } else if creds.InGroup(s.owner.GID) { - p = s.perms.Group - } - if p.SupersetOf(req) { - return true - } - - // Tasks with CAP_IPC_OWNER may bypass permission checks. - return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) -} diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 76d472292..1110ecca5 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index f58ec4194..47958e2d4 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -18,6 +18,7 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -64,7 +65,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { t := kernel.TaskFromContext(ctx) if t == nil { // No task context? Not valid. - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // name matches fs/signalfd.c:signalfd4. 
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]") diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2e3b4488a..59eeb253d 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -32,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -232,7 +232,7 @@ type Task struct { // exitStatus is the task's exit status. // // exitStatus is protected by the signal mutex. - exitStatus ExitStatus + exitStatus linux.WaitStatus // syscallRestartBlock represents a custom restart function to run in // restart_syscall(2) to resume an interrupted syscall. @@ -846,7 +846,7 @@ func (t *Task) OOMScoreAdj() int32 { // value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { if adj > 1000 || adj < -1000 { - return syserror.EINVAL + return linuxerr.EINVAL } atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index e574997f7..dd364ae50 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -18,10 +18,10 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // Getitimer implements getitimer(2). 
@@ -44,7 +44,7 @@ func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { s, _ = t.tg.itimerProfSetting.At(tm) t.tg.signalHandlers.mu.Unlock() default: - return linux.ItimerVal{}, syserror.EINVAL + return linux.ItimerVal{}, linuxerr.EINVAL } val, iv := ktime.SpecFromSetting(tm, s) return linux.ItimerVal{ @@ -105,7 +105,7 @@ func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, err return linux.ItimerVal{}, err } default: - return linux.ItimerVal{}, syserror.EINVAL + return linux.ItimerVal{}, linuxerr.EINVAL } oldval, oldiv := ktime.SpecFromSetting(tm, olds) return linux.ItimerVal{ diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 07533d982..b2520eecf 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -163,7 +163,7 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { region.End() t.SleepFinish(true) // We've timed out. - return syserror.ETIMEDOUT + return linuxerr.ETIMEDOUT } } diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go index 7c138e80f..828b90014 100644 --- a/pkg/sentry/kernel/task_cgroup.go +++ b/pkg/sentry/kernel/task_cgroup.go @@ -20,15 +20,13 @@ import ( "sort" "strings" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/syserror" ) // EnterInitialCgroups moves t into an initial set of cgroups. // // Precondition: t isn't in any cgroups yet, t.cgs is empty. -// -// +checklocksignore parent.mu is conditionally acquired. func (t *Task) EnterInitialCgroups(parent *Task) { var inherit map[Cgroup]struct{} if parent != nil { @@ -67,7 +65,7 @@ func (t *Task) EnterCgroup(c Cgroup) error { // // TODO(b/183137098): Implement cgroup migration. 
log.Warningf("Cgroup migration is not implemented") - return syserror.EBUSY + return linuxerr.EBUSY } } } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 405771f3f..da4b77ca2 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -20,147 +20,46 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// SharingOptions controls what resources are shared by a new task created by -// Task.Clone, or an existing task affected by Task.Unshare. -type SharingOptions struct { - // If NewAddressSpace is true, the task should have an independent virtual - // address space. - NewAddressSpace bool - - // If NewSignalHandlers is true, the task should use an independent set of - // signal handlers. - NewSignalHandlers bool - - // If NewThreadGroup is true, the task should be the leader of its own - // thread group. TerminationSignal is the signal that the thread group - // will send to its parent when it exits. If NewThreadGroup is false, - // TerminationSignal is ignored. - NewThreadGroup bool - TerminationSignal linux.Signal - - // If NewPIDNamespace is true: - // - // - In the context of Task.Clone, the new task should be the init task - // (TID 1) in a new PID namespace. - // - // - In the context of Task.Unshare, the task should create a new PID - // namespace, and all subsequent clones of the task should be members of - // the new PID namespace. - NewPIDNamespace bool - - // If NewUserNamespace is true, the task should have an independent user - // namespace. - NewUserNamespace bool - - // If NewNetworkNamespace is true, the task should have an independent - // network namespace. 
- NewNetworkNamespace bool - - // If NewFiles is true, the task should use an independent file descriptor - // table. - NewFiles bool - - // If NewFSContext is true, the task should have an independent FSContext. - NewFSContext bool - - // If NewUTSNamespace is true, the task should have an independent UTS - // namespace. - NewUTSNamespace bool - - // If NewIPCNamespace is true, the task should have an independent IPC - // namespace. - NewIPCNamespace bool -} - -// CloneOptions controls the behavior of Task.Clone. -type CloneOptions struct { - // SharingOptions defines the set of resources that the new task will share - // with its parent. - SharingOptions - - // Stack is the initial stack pointer of the new task. If Stack is 0, the - // new task will start with the same stack pointer as its parent. - Stack hostarch.Addr - - // If SetTLS is true, set the new task's TLS (thread-local storage) - // descriptor to TLS. If SetTLS is false, TLS is ignored. - SetTLS bool - TLS hostarch.Addr - - // If ChildClearTID is true, when the child exits, 0 is written to the - // address ChildTID in the child's memory, and if the write is successful a - // futex wake on the same address is performed. - // - // If ChildSetTID is true, the child's thread ID (in the child's PID - // namespace) is written to address ChildTID in the child's memory. (As in - // Linux, failed writes are silently ignored.) - ChildClearTID bool - ChildSetTID bool - ChildTID hostarch.Addr - - // If ParentSetTID is true, the child's thread ID (in the parent's PID - // namespace) is written to address ParentTID in the parent's memory. (As - // in Linux, failed writes are silently ignored.) - // - // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID - // causes the child's thread ID to be written to ptid in both the parent - // and child's memory, but this is a documentation error fixed by - // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). 
- ParentSetTID bool - ParentTID hostarch.Addr - - // If Vfork is true, place the parent in vforkStop until the cloned task - // releases its TaskImage. - Vfork bool - - // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for - // this clone(), and do not ptrace-attach the caller's tracer to the new - // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). - Untraced bool - - // If InheritTracer is true, ptrace-attach the caller's tracer to the new - // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported - // for it. If both Untraced and InheritTracer are true, no event will be - // reported, but tracer inheritance will still occur. - InheritTracer bool -} - // Clone implements the clone(2) syscall and returns the thread ID of the new // task in t's PID namespace. Clone may return both a non-zero thread ID and a // non-nil error. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. -func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { +func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { // Since signal actions may refer to application signal handlers by virtual // address, any set of signal handlers must refer to the same address // space. - if !opts.NewSignalHandlers && opts.NewAddressSpace { - return 0, nil, syserror.EINVAL + if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND { + return 0, nil, linuxerr.EINVAL } // In order for the behavior of thread-group-directed signals to be sane, // all tasks in a thread group must share signal handlers. - if !opts.NewThreadGroup && opts.NewSignalHandlers { - return 0, nil, syserror.EINVAL + if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD { + return 0, nil, linuxerr.EINVAL } // All tasks in a thread group must be in the same PID namespace. 
- if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { - return 0, nil, syserror.EINVAL + if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) { + return 0, nil, linuxerr.EINVAL } // The two different ways of specifying a new PID namespace are // incompatible. - if opts.NewPIDNamespace && t.childPIDNamespace != nil { - return 0, nil, syserror.EINVAL + if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil { + return 0, nil, linuxerr.EINVAL } // Thread groups and FS contexts cannot span user namespaces. - if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { - return 0, nil, syserror.EINVAL + if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 { + return 0, nil, linuxerr.EINVAL + } + // args.ExitSignal must be a valid signal. + if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() { + return 0, nil, linuxerr.EINVAL } // Pull task registers and FPU state, a cloned task will inherit the @@ -174,7 +73,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // user_namespaces(7) creds := t.Credentials() userns := creds.UserNamespace - if opts.NewUserNamespace { + if args.Flags&linux.CLONE_NEWUSER != 0 { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and // the caller is in a chroot environment (i.e., the caller's root @@ -182,28 +81,26 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // in which it resides)." - clone(2). Neither chroot(2) nor // user_namespaces(7) document this. 
if t.IsChrooted() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } userns, err = creds.NewChildUserNamespace() if err != nil { return 0, nil, err } } - if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { - return 0, nil, syserror.EPERM + if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { + return 0, nil, linuxerr.EPERM } utsns := t.UTSNamespace() - if opts.NewUTSNamespace { + if args.Flags&linux.CLONE_NEWUTS != 0 { // Note that this must happen after NewUserNamespace so we get // the new userns if there is one. utsns = t.UTSNamespace().Clone(userns) } ipcns := t.IPCNamespace() - if opts.NewIPCNamespace { - // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC - // namespace" + if args.Flags&linux.CLONE_NEWIPC != 0 { ipcns = NewIPCNamespace(userns) } else { ipcns.IncRef() @@ -214,7 +111,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { defer cu.Clean() netns := t.NetworkNamespace() - if opts.NewNetworkNamespace { + if args.Flags&linux.CLONE_NEWNET != 0 { netns = inet.NewNamespace(netns) } @@ -227,7 +124,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) } - image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) + image, err := t.image.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0) if err != nil { return 0, nil, err } @@ -236,17 +133,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) // clone() returns 0 in the child. 
image.Arch.SetReturn(0) - if opts.Stack != 0 { - image.Arch.SetStack(uintptr(opts.Stack)) + if args.Stack != 0 { + image.Arch.SetStack(uintptr(args.Stack)) } - if opts.SetTLS { - if !image.Arch.SetTLS(uintptr(opts.TLS)) { - return 0, nil, syserror.EPERM + if args.Flags&linux.CLONE_SETTLS != 0 { + if !image.Arch.SetTLS(uintptr(args.TLS)) { + return 0, nil, linuxerr.EPERM } } var fsContext *FSContext - if opts.NewFSContext { + if args.Flags&linux.CLONE_FS == 0 { fsContext = t.fsContext.Fork() } else { fsContext = t.fsContext @@ -254,7 +151,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } var fdTable *FDTable - if opts.NewFiles { + if args.Flags&linux.CLONE_FILES == 0 { fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable @@ -264,22 +161,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { pidns := t.tg.pidns if t.childPIDNamespace != nil { pidns = t.childPIDNamespace - } else if opts.NewPIDNamespace { + } else if args.Flags&linux.CLONE_NEWPID != 0 { pidns = pidns.NewChild(userns) } tg := t.tg rseqAddr := hostarch.Addr(0) rseqSignature := uint32(0) - if opts.NewThreadGroup { + if args.Flags&linux.CLONE_THREAD == 0 { if tg.mounts != nil { tg.mounts.IncRef() } sh := t.tg.signalHandlers - if opts.NewSignalHandlers { + if args.Flags&linux.CLONE_SIGHAND == 0 { sh = sh.Fork() } - tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy()) tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature @@ -304,7 +201,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), } - if opts.NewThreadGroup { + if args.Flags&linux.CLONE_THREAD == 0 { cfg.Parent = t } else { cfg.InheritParent = t @@ -322,7 +219,7 @@ func (t *Task) Clone(opts *CloneOptions) 
(ThreadID, *SyscallControl, error) { // // However kernel/fork.c:copy_process() adds a limitation to this: // "sigaltstack should be cleared when sharing the same VM". - if opts.NewAddressSpace || opts.Vfork { + if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 { nt.SetSignalStack(t.SignalStack()) } @@ -347,35 +244,35 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) nt.syscallFilters.Store(copiedFilters) } - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { nt.vforkParent = t } - if opts.ChildClearTID { - nt.SetClearTID(opts.ChildTID) + if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 { + nt.SetClearTID(hostarch.Addr(args.ChildTID)) } - if opts.ChildSetTID { + if args.Flags&linux.CLONE_CHILD_SETTID != 0 { ctid := nt.ThreadID() - ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) + ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID)) } ntid := t.tg.pidns.IDOfTask(nt) - if opts.ParentSetTID { - ntid.CopyOut(t, opts.ParentTID) + if args.Flags&linux.CLONE_PARENT_SETTID != 0 { + ntid.CopyOut(t, hostarch.Addr(args.ParentTID)) } kind := ptraceCloneKindClone - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { kind = ptraceCloneKindVfork - } else if opts.TerminationSignal == linux.SIGCHLD { + } else if linux.Signal(args.ExitSignal) == linux.SIGCHLD { kind = ptraceCloneKindFork } - if t.ptraceClone(kind, nt, opts) { - if opts.Vfork { + if t.ptraceClone(kind, nt, args) { + if args.Flags&linux.CLONE_VFORK != 0 { return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil } return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil } - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { t.maybeBeginVforkStop(nt) return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, 
nil } @@ -446,39 +343,47 @@ func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { } // Unshare changes the set of resources t shares with other tasks, as specified -// by opts. +// by flags. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) Unshare(opts *SharingOptions) error { - // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and - // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if - // t is the only task using its MM, which due to clone(2)'s rules imply - // that it is also the only task using its signal handlers / in its thread - // group, and cause EINVAL to be returned otherwise. +func (t *Task) Unshare(flags int32) error { + // "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if + // the caller is single threaded (i.e., it is not sharing its address space + // with another process or thread). In this case, these flags have no + // effect. (Note also that specifying CLONE_THREAD automatically implies + // CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.) + // If the process is multithreaded, then the use of these flags results in + // an error." - unshare(2). This is incorrect (cf. + // kernel/fork.c:ksys_unshare()): + // + // - CLONE_THREAD does not imply CLONE_VM. + // + // - CLONE_SIGHAND implies CLONE_THREAD. + // + // - Only CLONE_VM requires that the caller is not sharing its address + // space with another thread. CLONE_SIGHAND requires that the caller is not + // sharing its signal handlers, and CLONE_THREAD requires that the caller + // is the only thread in its thread group. // // Since we don't count the number of tasks using each address space or set - // of signal handlers, we reject NewSignalHandlers and NewAddressSpace - // altogether, and interpret NewThreadGroup as requiring that t be the only - // member of its thread group. 
This seems to be logically coherent, in the - // sense that clone(2) allows a task to share signal handlers and address - // spaces with tasks in other thread groups. - if opts.NewAddressSpace || opts.NewSignalHandlers { - return syserror.EINVAL + // of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether. + if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 { + return linuxerr.EINVAL } creds := t.Credentials() - if opts.NewThreadGroup { + if flags&linux.CLONE_THREAD != 0 { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { t.tg.signalHandlers.mu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } t.tg.signalHandlers.mu.Unlock() // This isn't racy because we're the only living task, and therefore // the only task capable of creating new ones, in our thread group. } - if opts.NewUserNamespace { + if flags&linux.CLONE_NEWUSER != 0 { if t.IsChrooted() { - return syserror.EPERM + return linuxerr.EPERM } newUserNS, err := creds.NewChildUserNamespace() if err != nil { @@ -492,34 +397,34 @@ func (t *Task) Unshare(opts *SharingOptions) error { creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) - if opts.NewPIDNamespace { + if flags&linux.CLONE_NEWPID != 0 { if !haveCapSysAdmin { - return syserror.EPERM + return linuxerr.EPERM } t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) } t.mu.Lock() // Can't defer unlock: DecRefs must occur without holding t.mu. - if opts.NewNetworkNamespace { + if flags&linux.CLONE_NEWNET != 0 { if !haveCapSysAdmin { t.mu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } t.netns = inet.NewNamespace(t.netns) } - if opts.NewUTSNamespace { + if flags&linux.CLONE_NEWUTS != 0 { if !haveCapSysAdmin { t.mu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } // Note that this must happen after NewUserNamespace, so the // new user namespace is used if there is one. 
t.utsns = t.utsns.Clone(creds.UserNamespace) } - if opts.NewIPCNamespace { + if flags&linux.CLONE_NEWIPC != 0 { if !haveCapSysAdmin { t.mu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" @@ -527,12 +432,12 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable - if opts.NewFiles { + if flags&linux.CLONE_FILES != 0 { oldFDTable = t.fdTable t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext - if opts.NewFSContext { + if flags&linux.CLONE_FS != 0 { oldFSContext = t.fsContext t.fsContext = oldFSContext.Fork() } diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index d115b8783..fbfcc19e5 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -28,66 +28,14 @@ import ( "errors" "fmt" "strconv" - "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) -// An ExitStatus is a value communicated from an exiting task or thread group -// to the party that reaps it. -// -// +stateify savable -type ExitStatus struct { - // Code is the numeric value passed to the call to exit or exit_group that - // caused the exit. If the exit was not caused by such a call, Code is 0. - Code int - - // Signo is the signal that caused the exit. If the exit was not caused by - // a signal, Signo is 0. 
- Signo int -} - -func (es ExitStatus) String() string { - var b strings.Builder - if code := es.Code; code != 0 { - if b.Len() != 0 { - b.WriteByte(' ') - } - _, _ = fmt.Fprintf(&b, "Code=%d", code) - } - if signal := es.Signo; signal != 0 { - if b.Len() != 0 { - b.WriteByte(' ') - } - _, _ = fmt.Fprintf(&b, "Signal=%d", signal) - } - return b.String() -} - -// Signaled returns true if the ExitStatus indicates that the exiting task or -// thread group was killed by a signal. -func (es ExitStatus) Signaled() bool { - return es.Signo != 0 -} - -// Status returns the numeric representation of the ExitStatus returned by e.g. -// the wait4() system call. -func (es ExitStatus) Status() uint32 { - return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) -} - -// ShellExitCode returns the numeric exit code that Bash would return for an -// exit status of es. -func (es ExitStatus) ShellExitCode() int { - if es.Signaled() { - return 128 + es.Signo - } - return es.Code -} - // TaskExitState represents a step in the task exit path. // // "Exiting" and "exited" are often ambiguous; prefer to name specific states. @@ -163,13 +111,13 @@ func (t *Task) killedLocked() bool { return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 } -// PrepareExit indicates an exit with status es. +// PrepareExit indicates an exit with the given status. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) PrepareExit(es ExitStatus) { +func (t *Task) PrepareExit(ws linux.WaitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - t.exitStatus = es + t.exitStatus = ws } // PrepareGroupExit indicates a group exit with status es to t's thread group. @@ -180,7 +128,7 @@ func (t *Task) PrepareExit(es ExitStatus) { // ptrace.) // // Preconditions: The caller must be running on the task goroutine. 
-func (t *Task) PrepareGroupExit(es ExitStatus) { +func (t *Task) PrepareGroupExit(ws linux.WaitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.tg.exiting || t.tg.execing != nil { @@ -198,8 +146,8 @@ func (t *Task) PrepareGroupExit(es ExitStatus) { return } t.tg.exiting = true - t.tg.exitStatus = es - t.exitStatus = es + t.tg.exitStatus = ws + t.exitStatus = ws for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { if sibling != t { sibling.killLocked() @@ -207,11 +155,11 @@ func (t *Task) PrepareGroupExit(es ExitStatus) { } } -// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill requests that all tasks in ts exit as if group exiting with status ws. // Kill does not wait for tasks to exit. // // Kill has no analogue in Linux; it's provided for save/restore only. -func (ts *TaskSet) Kill(es ExitStatus) { +func (ts *TaskSet) Kill(ws linux.WaitStatus) { ts.mu.Lock() defer ts.mu.Unlock() ts.Root.exiting = true @@ -219,7 +167,7 @@ func (ts *TaskSet) Kill(es ExitStatus) { t.tg.signalHandlers.mu.Lock() if !t.tg.exiting { t.tg.exiting = true - t.tg.exitStatus = es + t.tg.exitStatus = ws } t.killLocked() t.tg.signalHandlers.mu.Unlock() @@ -730,10 +678,10 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.S info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { info.Code = linux.CLD_KILLED - info.SetStatus(int32(t.exitStatus.Signo)) + info.SetStatus(int32(t.exitStatus.TerminationSignal())) } else { info.Code = linux.CLD_EXITED - info.SetStatus(int32(t.exitStatus.Code)) + info.SetStatus(int32(t.exitStatus.ExitStatus())) } // TODO(b/72102453): Set utime, stime. return info @@ -741,7 +689,7 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.S // ExitStatus returns t's exit status, which is only guaranteed to be // meaningful if t.ExitState() != TaskExitNone. 
-func (t *Task) ExitStatus() ExitStatus { +func (t *Task) ExitStatus() linux.WaitStatus { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() @@ -751,7 +699,7 @@ func (t *Task) ExitStatus() ExitStatus { // ExitStatus returns the exit status that would be returned by a consuming // wait*() on tg. -func (tg *ThreadGroup) ExitStatus() ExitStatus { +func (tg *ThreadGroup) ExitStatus() linux.WaitStatus { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() @@ -762,7 +710,9 @@ func (tg *ThreadGroup) ExitStatus() ExitStatus { return tg.leader.exitStatus } -// TerminationSignal returns the thread group's termination signal. +// TerminationSignal returns the thread group's termination signal, which is +// the signal that will be sent to its leader's parent when all threads have +// exited. func (tg *ThreadGroup) TerminationSignal() linux.Signal { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() @@ -888,8 +838,8 @@ type WaitResult struct { // Event is exactly one of the events defined above. Event waiter.EventMask - // Status is the numeric status associated with the event. - Status uint32 + // Status is the wait status associated with the event. + Status linux.WaitStatus } // Wait waits for an event from a thread group that is a child of t's thread @@ -942,7 +892,7 @@ func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { if anyWaitableTasks { return nil, ErrNoWaitableEvent } - return nil, syserror.ECHILD + return nil, linuxerr.ECHILD } // Preconditions: The TaskSet mutex must be locked for writing. 
@@ -1042,7 +992,7 @@ func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtrace } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() - status := target.exitStatus.Status() + status := target.exitStatus if !opts.ConsumeEvent { return &WaitResult{ Task: target, @@ -1056,7 +1006,7 @@ func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtrace // differ from that reported by a consuming wait; the latter will return // the group exit code if one is available. if target.tg.exiting { - status = target.tg.exitStatus.Status() + status = target.tg.exitStatus } // t may be (in the thread group of) target's parent, tracer, or both. We // don't need to check for !exitTracerAcked because tracees are detached @@ -1122,12 +1072,11 @@ func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) target.tg.groupStopWaitable = false } return &WaitResult{ - Task: target, - TID: pid, - UID: uid, - Event: EventChildGroupStop, - // There is no name for these status constants. 
- Status: (uint32(sig)&0xff)<<8 | 0x7f, + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + Status: linux.WaitStatusStopped(uint32(sig)), } } @@ -1148,7 +1097,7 @@ func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) * TID: pid, UID: uid, Event: EventGroupContinue, - Status: 0xffff, + Status: linux.WaitStatusContinued(), } } @@ -1176,7 +1125,7 @@ func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *Wai TID: pid, UID: uid, Event: EventTraceeStop, - Status: uint32(code)<<8 | 0x7f, + Status: linux.WaitStatusStopped(uint32(code)), } } diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 0325967e4..a9067b682 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -16,9 +16,9 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" ) // Credentials returns t's credentials. @@ -47,7 +47,7 @@ func (t *Task) HasCapability(cp linux.Capability) bool { func (t *Task) SetUID(uid auth.UID) error { // setuid considers -1 to be invalid. if !uid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } t.mu.Lock() @@ -56,7 +56,7 @@ func (t *Task) SetUID(uid auth.UID) error { creds := t.Credentials() kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // "setuid() sets the effective user ID of the calling process. If the // effective UID of the caller is root (more precisely: if the caller has @@ -70,7 +70,7 @@ func (t *Task) SetUID(uid auth.UID) error { // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." 
if kuid != creds.RealKUID && kuid != creds.SavedKUID { - return syserror.EPERM + return linuxerr.EPERM } t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil @@ -87,26 +87,26 @@ func (t *Task) SetREUID(r, e auth.UID) error { if r.Ok() { newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } newE := creds.EffectiveKUID if e.Ok() { newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { - return syserror.EPERM + return linuxerr.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." if newR != creds.RealKUID && newR != creds.EffectiveKUID { - return syserror.EPERM + return linuxerr.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user @@ -223,7 +223,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // SetGID implements the semantics of setgid(2). 
func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } t.mu.Lock() @@ -232,14 +232,14 @@ func (t *Task) SetGID(gid auth.GID) error { creds := t.Credentials() kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } if kgid != creds.RealKGID && kgid != creds.SavedKGID { - return syserror.EPERM + return linuxerr.EPERM } t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil @@ -255,22 +255,22 @@ func (t *Task) SetREGID(r, e auth.GID) error { if r.Ok() { newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } newE := creds.EffectiveKGID if e.Ok() { newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETGID) { if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { - return syserror.EPERM + return linuxerr.EPERM } if newR != creds.RealKGID && newR != creds.EffectiveKGID { - return syserror.EPERM + return linuxerr.EPERM } } newS := creds.SavedKGID @@ -343,13 +343,13 @@ func (t *Task) SetExtraGIDs(gids []auth.GID) error { defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETGID) { - return syserror.EPERM + return linuxerr.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } kgids[i] = kgid } @@ -367,25 +367,25 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili // "Permitted: This is a limiting superset for the effective capabilities // that the thread may assume." 
- capabilities(7) if effective & ^permitted != 0 { - return syserror.EPERM + return linuxerr.EPERM } creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { - return syserror.EPERM + return linuxerr.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." if permitted & ^creds.PermittedCaps != 0 { - return syserror.EPERM + return linuxerr.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { - return syserror.EPERM + return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.PermittedCaps = permitted @@ -402,7 +402,7 @@ func (t *Task) DropBoundingCapability(cp linux.Capability) error { defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETPCAP) { - return syserror.EPERM + return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.BoundingCaps &^= auth.CapabilitySetOf(cp) @@ -422,7 +422,7 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { - return syserror.EPERM + return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. 
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 72b9a0384..8de08151a 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -235,7 +235,7 @@ func (t *Task) traceExitEvent() { if !trace.IsEnabled() { return } - trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) + trace.Logf(t.traceContext, traceCategory, "exit status: %s", t.exitStatus) } // traceExecEvent is called when a task calls exec. diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 068f25af1..054ff212f 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -377,7 +377,7 @@ func (app *runApp) execute(t *Task) taskRunState { default: // What happened? Can't continue. t.Warningf("Unexpected SwitchToApp error: %v", err) - t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)}) + t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1)))) return (*runExit)(nil) } } diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index f142feab4..9d9fa76a6 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -23,12 +23,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // TaskGoroutineState is a coarse representation of the current execution @@ -601,7 +601,7 @@ func (t *Task) SetCPUMask(mask sched.CPUSet) error { // Ensure that at least 1 CPU is still allowed. 
if mask.NumCPUs() == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if t.k.useHostCores { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 8ca61ed48..7065ac79c 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -22,6 +22,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -156,7 +157,8 @@ func (t *Task) PendingSignals() linux.SignalSet { // deliverSignal delivers the given signal and returns the following run state. func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRunState { - sigact := computeAction(linux.Signal(info.Signo), act) + sig := linux.Signal(info.Signo) + sigact := computeAction(sig, act) if t.haveSyscallReturn { if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { @@ -197,14 +199,14 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu } // Attach an fault address if appropriate. - switch linux.Signal(info.Signo) { + switch sig { case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: ucs.FaultAddr = info.Addr() } eventchannel.Emit(ucs) - t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + t.PrepareGroupExit(linux.WaitStatusTerminationSignal(sig)) return (*runExit)(nil) case SignalActionStop: @@ -224,12 +226,12 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu // Send a forced SIGSEGV. If the signal that couldn't be delivered // was a SIGSEGV, force the handler to SIG_DFL. 
- t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.forceSignal(linux.SIGSEGV, sig == linux.SIGSEGV /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) } default: - panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(sig, act))) } return (*runInterrupt)(nil) } @@ -338,7 +340,7 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux. } if timeout == 0 { - return nil, syserror.EAGAIN + return nil, linuxerr.EAGAIN } // Unblock signals we're waiting for. Remember the original signal mask so @@ -359,8 +361,8 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux. if info := t.dequeueSignalLocked(mask); info != nil { return info, nil } - if err == syserror.ETIMEDOUT { - return nil, syserror.EAGAIN + if err == linuxerr.ETIMEDOUT { + return nil, linuxerr.EAGAIN } return nil, err } @@ -369,9 +371,9 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux. // // The following errors may be returned: // -// syserror.ESRCH - The task has exited. -// syserror.EINVAL - The signal is not valid. -// syserror.EAGAIN - THe signal is realtime, and cannot be queued. +// linuxerr.ESRCH - The task has exited. +// linuxerr.EINVAL - The signal is not valid. +// linuxerr.EAGAIN - THe signal is realtime, and cannot be queued. 
// func (t *Task) SendSignal(info *linux.SignalInfo) error { t.tg.pidns.owner.mu.RLock() @@ -406,14 +408,14 @@ func (t *Task) sendSignalLocked(info *linux.SignalInfo, group bool) error { func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *IntervalTimer) error { if t.exitState == TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } sig := linux.Signal(info.Signo) if sig == 0 { return nil } if !sig.IsValid() { - return syserror.EINVAL + return linuxerr.EINVAL } // Signal side effects apply even if the signal is ultimately discarded. @@ -450,7 +452,7 @@ func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer * } if !q.enqueue(info, timer) { if sig.IsRealtime() { - return syserror.EAGAIN + return linuxerr.EAGAIN } t.Debugf("Discarding duplicate signal %d", sig) if timer != nil { @@ -505,7 +507,7 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { // ignores tg.execing. if !tg.exiting { tg.exiting = true - tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + tg.exitStatus = linux.WaitStatusTerminationSignal(linux.SIGKILL) } for t := tg.tasks.Front(); t != nil; t = t.Next() { t.killLocked() @@ -684,7 +686,7 @@ func (t *Task) SetSignalStack(alt linux.SignalStack) bool { // to *actptr (if actptr is not nil) and returns the old signal action. 
func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (linux.SigAction, error) { if !sig.IsValid() { - return linux.SigAction{}, syserror.EINVAL + return linux.SigAction{}, linuxerr.EINVAL } tg.pidns.owner.mu.RLock() @@ -695,7 +697,7 @@ func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) ( oldact := sh.actions[sig] if actptr != nil { if sig == linux.SIGKILL || sig == linux.SIGSTOP { - return oldact, syserror.EINVAL + return oldact, linuxerr.EINVAL } act := *actptr diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 41fd2d471..0565059c1 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -299,7 +300,7 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) { // Did we do a full cycle? if tid == ns.last { // No tid available. 
- return 0, syserror.EAGAIN + return 0, linuxerr.EAGAIN } } } diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 1874f74e5..0586c9def 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -22,6 +22,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" @@ -160,7 +161,7 @@ func (t *Task) doSyscall() taskRunState { // ok case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("Syscall %d: killed by seccomp", sysno) - t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) case linux.SECCOMP_RET_TRACE: t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) @@ -310,7 +311,7 @@ func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState { return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("vsyscall %d: killed by seccomp", sysno) - t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) default: panic(fmt.Sprintf("Unknown seccomp result %d", r)) @@ -337,7 +338,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip // causes do_exit(SIGSYS), and changing sp is ignored. 
if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr { - t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) } if sysno == ^uintptr(0) { @@ -380,6 +381,8 @@ func ExtractErrno(err error, sysno int) int { return 0 case unix.Errno: return int(err) + case *errors.Error: + return int(err.Errno()) case syserror.SyscallRestartErrno: return int(err) case *memmap.BusError: diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index fc6d9438a..8e2c36598 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" @@ -132,7 +133,7 @@ func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) erro case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { - return syserror.EFAULT + return linuxerr.EFAULT } b := t.CopyScratchBuffer(itemLen) @@ -190,7 +191,7 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { - return hostarch.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, linuxerr.EFAULT } b := t.CopyScratchBuffer(itemLen) @@ -202,11 +203,11 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8])) length := hostarch.ByteOrder.Uint64(b[8:16]) if length > math.MaxInt64 { - return hostarch.AddrRangeSeq{}, syserror.EINVAL + return hostarch.AddrRangeSeq{}, linuxerr.EINVAL } ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { - return hostarch.AddrRangeSeq{}, syserror.EFAULT + return 
hostarch.AddrRangeSeq{}, linuxerr.EFAULT } if numIovecs == 1 { @@ -252,7 +253,7 @@ func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOO } ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { - return usermem.IOSequence{}, syserror.EFAULT + return usermem.IOSequence{}, linuxerr.EFAULT } return usermem.IOSequence{ IO: t.MemoryManager(), @@ -270,7 +271,7 @@ func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOO // Preconditions: Same as Task.CopyInIovecs. func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } ars, err := t.CopyInIovecs(addr, iovcnt) if err != nil { @@ -312,7 +313,7 @@ func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { tmm := cc.t.MemoryManager() cc.t.mu.Unlock() if !tmm.IncUsers() { - return nil, syserror.EFAULT + return nil, linuxerr.EFAULT } return tmm, nil } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 4566e4c7c..2eda15303 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,13 +19,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // A ThreadGroup is a logical grouping of tasks that has widespread @@ -143,7 +143,7 @@ type ThreadGroup struct { // // While exiting is false, exitStatus is protected by the signal mutex. // When exiting becomes true, exitStatus becomes immutable. 
- exitStatus ExitStatus + exitStatus linux.WaitStatus // terminationSignal is the signal that this thread group's leader will // send to its parent when it exits. @@ -357,7 +357,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) // "The calling process must be a session leader and not have a // controlling terminal already." - tty_ioctl(4) if tg.processGroup.session.leader != tg || tg.tty != nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(tg.leader) @@ -371,7 +371,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { // Stealing requires CAP_SYS_ADMIN in the root user namespace. if !hasAdmin || !steal { - return syserror.EPERM + return linuxerr.EPERM } // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. for othertg := range tg.pidns.owner.Root.tgids { @@ -391,7 +391,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) } if !isReadable && !hasAdmin { - return syserror.EPERM + return linuxerr.EPERM } // Set the controlling terminal and foreground process group. @@ -419,7 +419,7 @@ func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { if tg.tty == nil || tg.tty != tty { tg.signalHandlers.mu.Unlock() - return syserror.ENOTTY + return linuxerr.ENOTTY } // "If the process was session leader, then send SIGHUP and SIGCONT to @@ -473,7 +473,7 @@ func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) { // "When fd does not refer to the controlling terminal of the calling // process, -1 is returned" - tcgetpgrp(3) if tg.tty != tty { - return -1, syserror.ENOTTY + return -1, linuxerr.ENOTTY } return int32(tg.processGroup.session.foreground.id), nil @@ -496,24 +496,24 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) // tty must be the controlling terminal. 
if tg.tty != tty { - return -1, syserror.ENOTTY + return -1, linuxerr.ENOTTY } // pgid must be positive. if pgid < 0 { - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } // pg must not be empty. Empty process groups are removed from their // pid namespaces. pg, ok := tg.pidns.processGroups[pgid] if !ok { - return -1, syserror.ESRCH + return -1, linuxerr.ESRCH } // pg must be part of this process's session. if tg.processGroup.session != pg.session { - return -1, syserror.EPERM + return -1, linuxerr.EPERM } tg.processGroup.session.foreground.id = pgid diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 2817aa3ba..e293d9a0f 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -13,8 +13,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sync", - "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 26aa34aa6..191b92811 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -22,8 +22,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -322,7 +322,7 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett // interpreted as a time relative to now. func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { if value < 0 { - return Setting{}, syserror.EINVAL + return Setting{}, linuxerr.EINVAL } if value == 0 { return Setting{Period: interval}, nil @@ -338,7 +338,7 @@ func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (S // interpreted as an absolute time. 
func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { if value.Before(ZeroTime) { - return Setting{}, syserror.EINVAL + return Setting{}, linuxerr.EINVAL } if value.IsZero() { return Setting{Period: interval}, nil diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index dfc3c0719..b6039505a 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -17,12 +17,12 @@ package kernel import ( "testing" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // mockClocks is a sentrytime.Clocks that simply returns the times in the @@ -45,7 +45,7 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { case sentrytime.Realtime: return c.realtime, nil default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 4c7666e33..577374fa4 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -477,7 +477,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in // the open path would return a different // error. 
ctx.Infof("PT_INTERP path is empty: %v", path) - return loadedELF{}, syserror.EACCES + return loadedELF{}, linuxerr.EACCES } } } @@ -518,13 +518,13 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in start, ok = start.AddLength(uint64(offset)) if !ok { ctx.Infof(fmt.Sprintf("Start %#x + offset %#x overflows?", start, offset)) - return loadedELF{}, syserror.EINVAL + return loadedELF{}, linuxerr.EINVAL } end, ok = end.AddLength(uint64(offset)) if !ok { ctx.Infof(fmt.Sprintf("End %#x + offset %#x overflows?", end, offset)) - return loadedELF{}, syserror.EINVAL + return loadedELF{}, linuxerr.EINVAL } info.entry, ok = info.entry.AddLength(uint64(offset)) @@ -624,18 +624,18 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.Fil if err != nil { if linuxerr.Equals(linuxerr.ENOEXEC, err) { // Bad interpreter. - err = syserror.ELIBBAD + err = linuxerr.ELIBBAD } return loadedELF{}, err } if info.os != initial.os { ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os) - return loadedELF{}, syserror.ELIBBAD + return loadedELF{}, linuxerr.ELIBBAD } if info.arch != initial.arch { ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch) - return loadedELF{}, syserror.ELIBBAD + return loadedELF{}, linuxerr.ELIBBAD } // The interpreter is not given a load offset, as its location does not diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 8240173ae..86d0c54cd 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -113,7 +114,7 @@ func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string } if t != linux.ModeRegular { 
ctx.Infof("%q is not a regular file: %v", filename, t) - return syserror.EACCES + return linuxerr.EACCES } return nil } @@ -207,7 +208,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context args.File = nil } - return loadedELF{}, nil, nil, nil, syserror.ELOOP + return loadedELF{}, nil, nil, nil, linuxerr.ELOOP } // Load loads args.File into a MemoryManager. If args.File is nil, the path diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index fd54261fd..054ef1723 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" @@ -58,7 +59,7 @@ type byteFullReader struct { func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset >= int64(len(b.data)) { return 0, io.EOF diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 346866d3c..b7f765cd7 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -17,12 +17,12 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -158,11 +158,11 @@ func (ctx *AIOContext) Prepare() error { defer ctx.mu.Unlock() if ctx.dead { // Context died after the caller looked it up. - return syserror.EINVAL + return linuxerr.EINVAL } if ctx.outstanding >= ctx.maxOutstanding { // Context is busy. 
- return syserror.EAGAIN + return linuxerr.EAGAIN } ctx.outstanding++ return nil @@ -297,7 +297,7 @@ func (m *aioMappable) InodeID() uint64 { // Msync implements memmap.MappingIdentity.Msync. func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { // Linux: aio_ring_fops.fsync == NULL - return syserror.EINVAL + return linuxerr.EINVAL } // AddMapping implements memmap.Mappable.AddMapping. @@ -305,7 +305,7 @@ func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar ho // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { - return syserror.EFAULT + return linuxerr.EFAULT } return nil } @@ -319,13 +319,13 @@ func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, s // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { - return syserror.EFAULT + return linuxerr.EFAULT } // Require that the mapping correspond to a live AIOContext. Compare // Linux's fs/aio.c:aio_ring_mremap(). mm, ok := ms.(*MemoryManager) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } am := &mm.aioManager am.mu.Lock() @@ -333,12 +333,12 @@ func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, s oldID := uint64(srcAR.Start) aioCtx, ok := am.contexts[oldID] if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } aioCtx.mu.Lock() defer aioCtx.mu.Unlock() if aioCtx.dead { - return syserror.EINVAL + return linuxerr.EINVAL } // Use the new ID for the AIOContext. 
am.contexts[uint64(dstAR.Start)] = aioCtx @@ -350,7 +350,7 @@ func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, s func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { - err = &memmap.BusError{syserror.EFAULT} + err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ @@ -399,7 +399,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint id := uint64(addr) if !mm.aioManager.newAIOContext(events, id) { mm.MUnmap(ctx, addr, aioRingBufferSize) - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return id, nil } diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 16f318ab3..5fcfeb473 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -16,10 +16,10 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -97,14 +97,14 @@ func translateIOError(ctx context.Context, err error) error { if logIOErrors { ctx.Debugf("MM I/O error: %v", err) } - return syserror.EFAULT + return linuxerr.EFAULT } // CopyOut implements usermem.IO.CopyOut. 
func (mm *MemoryManager) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) { ar, ok := mm.CheckIORange(addr, int64(len(src))) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if len(src) == 0 { @@ -147,7 +147,7 @@ func (mm *MemoryManager) asCopyOut(ctx context.Context, addr hostarch.Addr, src func (mm *MemoryManager) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) { ar, ok := mm.CheckIORange(addr, int64(len(dst))) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if len(dst) == 0 { @@ -190,7 +190,7 @@ func (mm *MemoryManager) asCopyIn(ctx context.Context, addr hostarch.Addr, dst [ func (mm *MemoryManager) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { ar, ok := mm.CheckIORange(addr, toZero) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if toZero == 0 { @@ -231,7 +231,7 @@ func (mm *MemoryManager) asZeroOut(ctx context.Context, addr hostarch.Addr, toZe // CopyOutFrom implements usermem.IO.CopyOutFrom. func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { if !mm.checkIOVec(ars) { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if ars.NumBytes() == 0 { @@ -276,7 +276,7 @@ func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRange // CopyInTo implements usermem.IO.CopyInTo. 
func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { if !mm.checkIOVec(ars) { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if ars.NumBytes() == 0 { @@ -314,7 +314,7 @@ func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. @@ -339,7 +339,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } im := ims.Head() var err error @@ -357,7 +357,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. @@ -382,7 +382,7 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. 
- return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } im := ims.Head() var err error @@ -400,7 +400,7 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. @@ -425,7 +425,7 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opt _, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } im := ims.Head() var err error diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 5583f62b2..9f4cc238f 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -18,12 +18,12 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // existingPMAsLocked checks that pmas exist for all addresses in ar, and @@ -116,7 +116,7 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar var alignerr error if !ok { end = ar.End.RoundDown() - alignerr = syserror.EFAULT + alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} @@ -162,7 +162,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.Addr var alignerr error if !ok { end = ar.End.RoundDown() - alignerr = syserror.EFAULT + alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index 
3130be80c..94d5112a1 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -16,16 +16,16 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" - "gvisor.dev/gvisor/pkg/syserror" ) // DetachShm unmaps a sysv shared memory segment. func (mm *MemoryManager) DetachShm(ctx context.Context, addr hostarch.Addr) error { if addr != addr.RoundDown() { // "... shmaddr is not aligned on a page boundary." - man shmdt(2) - return syserror.EINVAL + return linuxerr.EINVAL } var detached *shm.Shm @@ -48,7 +48,7 @@ func (mm *MemoryManager) DetachShm(ctx context.Context, addr hostarch.Addr) erro if detached == nil { // There is no shared memory segment attached at addr. - return syserror.EINVAL + return linuxerr.EINVAL } // Remove all vmas that could have been created by the same attach. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index e748b7ff8..69c6e77a7 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -16,11 +16,11 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with @@ -94,7 +94,7 @@ func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, hostar func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { - err = &memmap.BusError{syserror.EFAULT} + err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ @@ 
-144,11 +144,11 @@ func (m *SpecialMappable) Length() uint64 { // leak (b/143656263). Delete this function along with VFS1. func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } alignedLen, ok := hostarch.Addr(length).RoundUp() if !ok { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) if err != nil { diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index f46f85eb1..256eb4afb 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -37,7 +37,7 @@ import ( func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr, at hostarch.AccessType, sp hostarch.Addr) error { ar, ok := addr.RoundDown().ToRange(hostarch.PageSize) if !ok { - return syserror.EFAULT + return linuxerr.EFAULT } // Don't bother trying existingPMAsLocked; in most cases, if we did have @@ -75,7 +75,7 @@ func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr // MMap establishes a memory mapping. func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) { if opts.Length == 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } length, ok := hostarch.Addr(opts.Length).RoundUp() if !ok { @@ -86,7 +86,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostar if opts.Mappable != nil { // Offset must be aligned. if hostarch.Addr(opts.Offset).RoundDown() != hostarch.Addr(opts.Offset) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Offset + length must not overflow. if end := opts.Offset + opts.Length; end < opts.Offset { @@ -100,19 +100,19 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostar // MAP_FIXED requires addr to be page-aligned; non-fixed mappings // don't. 
if opts.Fixed { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } opts.Addr = opts.Addr.RoundDown() } if !opts.MaxPerms.SupersetOf(opts.Perms) { - return 0, syserror.EACCES + return 0, linuxerr.EACCES } if opts.Unmap && !opts.Fixed { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if opts.GrowsDown && opts.Mappable != nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get the new vma. @@ -204,6 +204,7 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar h // * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. +// +checklocksrelease:mm.mappingMu func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) { // See populateVMA above for commentary. if !vseg.ValuePtr().effectivePerms.Any() { @@ -282,18 +283,18 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, erro // MUnmap implements the semantics of Linux's munmap(2). func (mm *MemoryManager) MUnmap(ctx context.Context, addr hostarch.Addr, length uint64) error { if addr != addr.RoundDown() { - return syserror.EINVAL + return linuxerr.EINVAL } if length == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } la, ok := hostarch.Addr(length).RoundUp() if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } ar, ok := addr.ToRange(uint64(la)) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -332,7 +333,7 @@ const ( func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (hostarch.Addr, error) { // "Note that old_address has to be page aligned." 
- mremap(2) if oldAddr.RoundDown() != oldAddr { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a @@ -341,13 +342,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS oldSize = uint64(oldSizeAddr) newSizeAddr, ok := hostarch.Addr(newSize).RoundUp() if !ok || newSizeAddr == 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } newSize = uint64(newSizeAddr) oldEnd, ok := oldAddr.AddLength(oldSize) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } mm.mappingMu.Lock() @@ -356,7 +357,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // All cases require that a vma exists at oldAddr. vseg := mm.vmas.FindSegment(oldAddr) if !vseg.Ok() { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Behavior matrix: @@ -380,7 +381,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { - return 0, syserror.EAGAIN + return 0, linuxerr.EAGAIN } } } @@ -403,7 +404,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // "Grow" the existing vma by creating a new mergeable one. 
vma := vseg.ValuePtr() @@ -451,15 +452,15 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS case MRemapMustMove: newAddr := opts.NewAddr if newAddr.RoundDown() != newAddr { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var ok bool newAR, ok = newAddr.ToRange(newSize) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if (hostarch.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that the new region is valid. @@ -493,7 +494,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Check against RLIMIT_AS. @@ -505,7 +506,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS if vma := vseg.ValuePtr(); vma.mappable != nil { // Check that offset+length does not overflow. if vma.off+uint64(newAR.Length()) < vma.off { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Inform the Mappable, if any, of the new mapping. if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { @@ -591,7 +592,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // MProtect implements the semantics of Linux's mprotect(2). 
func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms hostarch.AccessType, growsDown bool) error { if addr.RoundDown() != addr { - return syserror.EINVAL + return linuxerr.EINVAL } if length == 0 { return nil @@ -619,7 +620,7 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h } if growsDown { if !vseg.ValuePtr().growsDown { - return syserror.EINVAL + return linuxerr.EINVAL } if ar.End <= vseg.Start() { return syserror.ENOMEM @@ -645,7 +646,7 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h // Check for permission validity before splitting vmas, for consistency // with Linux. if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { - return syserror.EACCES + return linuxerr.EACCES } vseg = mm.vmas.Isolate(vseg, ar) @@ -712,7 +713,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch. if addr < mm.brk.Start { addr = mm.brk.End mm.mappingMu.Unlock() - return addr, syserror.EINVAL + return addr, linuxerr.EINVAL } // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is @@ -731,7 +732,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch. 
if !ok { addr = mm.brk.End mm.mappingMu.Unlock() - return addr, syserror.EFAULT + return addr, linuxerr.EFAULT } switch { @@ -781,7 +782,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u la, _ := hostarch.Addr(length + addr.PageOffset()).RoundUp() ar, ok := addr.RoundDown().ToRange(uint64(la)) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -793,7 +794,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { mm.mappingMu.Unlock() @@ -860,7 +861,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u return syserror.ENOMEM } if linuxerr.Equals(linuxerr.ENOMEM, err) { - return syserror.EAGAIN + return linuxerr.EAGAIN } return err } @@ -899,7 +900,7 @@ type MLockAllOpts struct { // depending on opts. 
func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { if !opts.Current && !opts.Future { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -912,7 +913,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } if uint64(mm.vmas.Span()) > mlockLimit { mm.mappingMu.Unlock() @@ -971,7 +972,7 @@ func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint6 defer mm.mappingMu.RUnlock() vseg := mm.vmas.FindSegment(addr) if !vseg.Ok() { - return 0, 0, syserror.EFAULT + return 0, 0, linuxerr.EFAULT } vma := vseg.ValuePtr() return vma.numaPolicy, vma.numaNodemask, nil @@ -980,13 +981,13 @@ func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint6 // SetNumaPolicy implements the semantics of Linux's mbind(). func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error { if !addr.IsPageAligned() { - return syserror.EINVAL + return linuxerr.EINVAL } // Linux allows this to overflow. la, _ := hostarch.Addr(length).RoundUp() ar, ok := addr.ToRange(uint64(la)) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } if ar.Length() == 0 { return nil @@ -1004,7 +1005,7 @@ func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy if !vseg.Ok() || lastEnd < vseg.Start() { // "EFAULT: ... there was an unmapped hole in the specified memory // range specified [sic] by addr and len." 
- mbind(2) - return syserror.EFAULT + return linuxerr.EFAULT } vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() @@ -1022,7 +1023,7 @@ func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork bool) error { ar, ok := addr.ToRange(length) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -1048,7 +1049,7 @@ func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { ar, ok := addr.ToRange(length) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.RLock() @@ -1064,7 +1065,7 @@ func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { - return syserror.EINVAL + return linuxerr.EINVAL } vsegAR := vseg.Range().Intersect(ar) // pseg should already correspond to either this vma or a later one, @@ -1115,7 +1116,7 @@ type MSyncOpts struct { // MSync implements the semantics of Linux's msync(). func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length uint64, opts MSyncOpts) error { if addr != addr.RoundDown() { - return syserror.EINVAL + return linuxerr.EINVAL } if length == 0 { return nil @@ -1151,7 +1152,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u vma := vseg.ValuePtr() if opts.Invalidate && vma.mlockMode != memmap.MLockNone { mm.mappingMu.RUnlock() - return syserror.EBUSY + return linuxerr.EBUSY } // It's only possible to have dirtied the Mappable through a shared // mapping. 
Don't check if the mapping is writable, because mprotect @@ -1192,7 +1193,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr hostarch.Addr) (futex.Key, error) { ar, ok := addr.ToRange(4) // sizeof(int32). if !ok { - return futex.Key{}, syserror.EFAULT + return futex.Key{}, linuxerr.EFAULT } mm.mappingMu.RLock() diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 0d019e41d..5f8ab7ca3 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -66,14 +67,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { - return vmaIterator{}, hostarch.AddrRange{}, syserror.EPERM + return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EPERM } newLockedAS := mm.lockedAS + opts.Length if opts.Unmap { newLockedAS -= mm.mlockedBytesRangeLocked(ar) } if newLockedAS > mlockLimit { - return vmaIterator{}, hostarch.AddrRange{}, syserror.EAGAIN + return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EAGAIN } } } @@ -288,7 +289,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRang vma := vseg.ValuePtr() if addr < vseg.Start() { // TODO(jamieliu): Implement vma.growsDown here. 
- return vbegin, vgap, syserror.EFAULT + return vbegin, vgap, linuxerr.EFAULT } perms := vma.effectivePerms @@ -296,7 +297,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRang perms = vma.maxPerms } if !perms.SupersetOf(at) { - return vbegin, vgap, syserror.EPERM + return vbegin, vgap, linuxerr.EPERM } addr = vseg.End() @@ -308,7 +309,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRang } // Ran out of vmas before ar.End. - return vbegin, vgap, syserror.EFAULT + return vbegin, vgap, linuxerr.EFAULT } // getVecVMAsLocked ensures that vmas exist for all addresses in ars, and diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 57d73d770..d351869ef 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -85,6 +85,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/memutil", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index d1a883da4..0c8542485 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -31,6 +31,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" @@ -674,7 +675,7 @@ func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (s panic(fmt.Sprintf("invalid range: %v", fr)) } if at.Execute { - return safemem.BlockSeq{}, syserror.EACCES + return safemem.BlockSeq{}, linuxerr.EACCES } chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) @@ -944,7 +945,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // NOTE(b/165896008): mincore (which is passed as checkCommitted) // by f.UpdateUsage() might take a really long time. So unlock f.mu // while checkCommitted runs. 
- f.mu.Unlock() + f.mu.Unlock() // +checklocksforce err := checkCommitted(s, buf) f.mu.Lock() if err != nil { diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index d761bbdee..0567c8d32 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm @@ -74,8 +75,27 @@ func (c *vCPU) KernelSyscall() { // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. We only worry about saving on exit. ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment. + // N.B. Since KernelSyscall is called when the kernel makes a syscall, + // FS_BASE is already set for correct execution of this function. + // + // Refresher on syscall/exception handling: + // 1. When the sentry is in guest mode and makes a syscall, it goes to + // sysenter(), which saves the register state (including RIP of SYSCALL + // instruction) to vCPU.registers. + // 2. It then calls KernelSyscall, which rewinds the IP and executes + // HLT. + // 3. HLT does a VM-exit to bluepillHandler, which returns from the + // signal handler using vCPU.registers, directly to the SYSCALL + // instruction. + // 4. Later, when we want to re-use the vCPU (perhaps on a different + // host thread), we set the new thread's registers in vCPU.registers + // (as opposed to setting the KVM registers with KVM_SET_REGS). + // 5. KVM_RUN thus enters the guest with the old register state, + // immediately following the HLT instruction, returning here. + // 6. We then restore FS_BASE and the full registers from vCPU.register + // to return from sysenter() back to the desired bluepill point from + // the host. 
+ ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // KernelException handles kernel exceptions. @@ -93,8 +113,8 @@ func (c *vCPU) KernelException(vector ring0.Vector) { } // See above. ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment. + // See above. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // bluepillArchExit is called during bluepillEnter. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 953024600..c2a1dca11 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -37,7 +37,15 @@ TEXT ·bluepill(SB),NOSPLIT,$0 begin: MOVQ vcpu+0(FP), AX LEAQ VCPU_CPU(AX), BX + + // The gorountine stack will be changed in guest which renders + // the frame pointer outdated and misleads perf tools. + // Disconnect the frame-chain with the zeroed frame pointer + // when it is saved in the frame in bluepillHandler(). + MOVQ BP, CX + MOVQ $0, BP BYTE CLI; + MOVQ CX, BP check_vcpu: MOVQ ENTRY_CPU_SELF(GS), CX CMPQ BX, CX diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 198bafdea..4ba1d6f9c 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index 9e5c52923..acb0cb05f 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index f105fdbd0..ee7dba828 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 28a613a54..8fd8287b3 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -101,7 +101,7 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi // Store the physical address in the slot. This is used to // avoid calls to handleBluepillFault in the future (see // machine.mapPhysical). - atomic.StoreUintptr(&m.usedSlots[slot], physical) + atomic.StoreUintptr(&m.usedSlots[slot], physicalStart) // Successfully added region; we can increment nextSlot and // allow another set to proceed here. atomic.StoreUint32(&m.nextSlot, slot+1) diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 6f87236ad..0f0c1e73b 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.12 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. 
package kvm @@ -28,7 +30,7 @@ import ( ) //go:linkname throw runtime.throw -func throw(string) +func throw(s string) // vCPUPtr returns a CPU for the given address. // @@ -85,6 +87,13 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { // signal stack. It should only execute raw system calls and functions that are // explicitly marked go:nosplit. // +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, +// but that is tedious given all the runtime internals. That said, using +// gsignal inside a signal handler is not _required_, provided we avoid stack +// splits and allocations. Note that calling any splittable function here will +// be flaky; if the signal stack is below the G stack then we will trigger a +// split and crash. If above, we won't trigger a split. +// // +checkescape:all // //go:nosplit diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index b9ed4a706..a5189d9e2 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index b1cab89a0..c3fbbdc75 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 // +build amd64 package kvm @@ -28,7 +29,7 @@ import ( ) func TestSegments(t *testing.T) { - applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleSegments(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) for { var si linux.SignalInfo @@ -55,7 +56,7 @@ func TestSegments(t *testing.T) { func stmxcsr(addr *uint32) func TestMXCSR(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo switchOpts := ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 0c43d72f4..7fdb6ac64 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index b73340f0e..159808433 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go index 0e3d84d95..b53e354da 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_test.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go index f07a9f34d..54d579a2b 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index fe570aff9..3a30286e2 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -120,13 +120,13 @@ func TestKernelFloatingPoint(t *testing.T) { }) } -func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { +func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { // Initialize registers & page tables. var ( regs arch.Registers pt *pagetables.PageTables ) - testutil.SetTestTarget(®s, target) + testutil.SetTestTarget(®s, targetFn) kvmTest(t, func(k *KVM) { // Create new page tables. 
@@ -157,7 +157,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func } func TestApplicationSyscall(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -171,7 +171,7 @@ func TestApplicationSyscall(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -187,7 +187,7 @@ func TestApplicationSyscall(t *testing.T) { } func TestApplicationFault(t *testing.T) { - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -202,7 +202,7 @@ func TestApplicationFault(t *testing.T) { } return false }) - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. 
var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -219,7 +219,7 @@ func TestApplicationFault(t *testing.T) { } func TestRegistersSyscall(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleRegsSyscall(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si linux.SignalInfo @@ -242,7 +242,7 @@ func TestRegistersSyscall(t *testing.T) { } func TestRegistersFault(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleRegsFault(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si linux.SignalInfo @@ -266,7 +266,7 @@ func TestRegistersFault(t *testing.T) { } func TestBounce(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -281,7 +281,7 @@ func TestBounce(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -300,7 +300,7 @@ func TestBounce(t *testing.T) { } func TestBounceStress(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, 
testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { randomSleep := func() { // O(hundreds of microseconds) is appropriate to ensure // different overlaps and different schedules. @@ -336,7 +336,7 @@ func TestBounceStress(t *testing.T) { func TestInvalidate(t *testing.T) { var data uintptr // Used below. - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. for { var si linux.SignalInfo @@ -377,7 +377,7 @@ func IsFault(err error, si *linux.SignalInfo) bool { } func TestEmptyAddressSpace(t *testing.T) { - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -391,7 +391,7 @@ func TestEmptyAddressSpace(t *testing.T) { } return false }) - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -467,7 +467,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { i int // Iteration includes machine.Get() / machine.Put(). a int // Count for ErrContextInterrupt. 
) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -489,7 +489,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { func BenchmarkKernelSyscall(b *testing.B) { // Note that the target passed here is irrelevant, we never execute SwitchToUser. - applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfGetpid(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { // iteration does not include machine.Get() / machine.Put(). for i := 0; i < b.N; i++ { testutil.Getpid() @@ -504,7 +504,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { i int a int ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 1b5d5f66e..e7092a756 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -70,7 +70,7 @@ type machine struct { // tscControl checks whether cpu supports TSC scaling tscControl bool - // usedSlots is the set of used physical addresses (sorted). + // usedSlots is the set of used physical addresses (not sorted). usedSlots []uintptr // nextID is the next vCPU ID. @@ -296,13 +296,20 @@ func newMachine(vm int) (*machine, error) { return m, nil } -// hasSlot returns true iff the given address is mapped. +// hasSlot returns true if the given address is mapped. 
// // This must be done via a linear scan. // //go:nosplit func (m *machine) hasSlot(physical uintptr) bool { - for i := 0; i < len(m.usedSlots); i++ { + slotLen := int(atomic.LoadUint32(&m.nextSlot)) + // When slots are being updated, nextSlot is ^uint32(0). As this situation + // is less likely happen, we just set the slotLen to m.maxSlots, and scan + // the whole usedSlots array. + if slotLen == int(^uint32(0)) { + slotLen = m.maxSlots + } + for i := 0; i < slotLen; i++ { if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { return true } diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 7a10fd812..a96634381 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm @@ -136,7 +137,7 @@ func (c *vCPU) initArchState() error { } // Set the entrypoint for the kernel. - kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) + kernelUserRegs.RIP = uint64(ring0.AddrOfStart()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 83bcc7406..de798bb2c 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index edaccf9bc..7937a8481 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index f6aa519b1..1a4a9ce7d 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 49e1c7136..cc3a1253b 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.12 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package kvm diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 5c1efa0fd..d8c273796 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -23,23 +23,41 @@ import ( // Getpid executes a trivial system call. func Getpid() -// Touch touches the value in the first register. -func Touch() +// AddrOfGetpid returns the address of Getpid. 
+// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfGetpid() uintptr + +// AddrOfTouch returns the address of a function that touches the value in the +// first register. +func AddrOfTouch() uintptr +func touch() -// SyscallLoop executes a syscall and loops. -func SyscallLoop() +// AddrOfSyscallLoop returns the address of a function that executes a syscall +// and loops. +func AddrOfSyscallLoop() uintptr +func syscallLoop() -// SpinLoop spins on the CPU. -func SpinLoop() +// AddrOfSpinLoop returns the address of a function that spins on the CPU. +func AddrOfSpinLoop() uintptr +func spinLoop() -// HaltLoop immediately halts and loops. -func HaltLoop() +// AddrOfHaltLoop returns the address of a function that immediately halts and +// loops. +func AddrOfHaltLoop() uintptr +func haltLoop() -// TwiddleRegsFault twiddles registers then faults. -func TwiddleRegsFault() +// AddrOfTwiddleRegsFault returns the address of a function that twiddles +// registers then faults. +func AddrOfTwiddleRegsFault() uintptr +func twiddleRegsFault() -// TwiddleRegsSyscall twiddles registers then executes a syscall. -func TwiddleRegsSyscall() +// AddrOfTwiddleRegsSyscall returns the address of a function that twiddles +// registers then executes a syscall. +func AddrOfTwiddleRegsSyscall() uintptr +func twiddleRegsSyscall() // FloatingPointWorks is a floating point test. // diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 8048eedec..98c52b2f5 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build amd64 // +build amd64 package testutil @@ -22,12 +23,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) -// TwiddleSegments reads segments into known registers. -func TwiddleSegments() +// AddrOfTwiddleSegments return the address of a function that reads segments +// into known registers. +func AddrOfTwiddleSegments() uintptr +func twiddleSegments() // SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *arch.Registers, fn func()) { - regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) +func SetTestTarget(regs *arch.Registers, fn uintptr) { + regs.Rip = uint64(fn) } // SetTouchTarget sets rax appropriately. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index 491ec0c2a..65e7c05ea 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -25,27 +25,46 @@ TEXT ·Getpid(SB),NOSPLIT,$0 SYSCALL RET -TEXT ·Touch(SB),NOSPLIT,$0 +// func AddrOfGetpid() uintptr +TEXT ·AddrOfGetpid(SB), $0-8 + MOVQ $·Getpid(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·touch(SB),NOSPLIT,$0 start: MOVQ 0(AX), BX // deref AX MOVQ $39, AX // getpid SYSCALL JMP start -TEXT ·HaltLoop(SB),NOSPLIT,$0 -start: - HLT - JMP start +// func AddrOfTouch() uintptr +TEXT ·AddrOfTouch(SB), $0-8 + MOVQ $·touch(SB), AX + MOVQ AX, ret+0(FP) + RET -TEXT ·SyscallLoop(SB),NOSPLIT,$0 +TEXT ·syscallLoop(SB),NOSPLIT,$0 start: SYSCALL JMP start -TEXT ·SpinLoop(SB),NOSPLIT,$0 +// func AddrOfSyscallLoop() uintptr +TEXT ·AddrOfSyscallLoop(SB), $0-8 + MOVQ $·syscallLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·spinLoop(SB),NOSPLIT,$0 start: JMP start +// func AddrOfSpinLoop() uintptr +TEXT ·AddrOfSpinLoop(SB), $0-8 + MOVQ $·spinLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS MOVQ $1, AX @@ -75,20 +94,32 @@ TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NOTQ DI; \ NOTQ SP; -TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 +TEXT 
·twiddleRegsSyscall(SB),NOSPLIT,$0 TWIDDLE_REGS() SYSCALL RET // never reached -TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 +// func AddrOfTwiddleRegsSyscall() uintptr +TEXT ·AddrOfTwiddleRegsSyscall(SB), $0-8 + MOVQ $·twiddleRegsSyscall(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·twiddleRegsFault(SB),NOSPLIT,$0 TWIDDLE_REGS() JMP AX // must fault RET // never reached +// func AddrOfTwiddleRegsFault() uintptr +TEXT ·AddrOfTwiddleRegsFault(SB), $0-8 + MOVQ $·twiddleRegsFault(SB), AX + MOVQ AX, ret+0(FP) + RET + #define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; #define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; -TEXT ·TwiddleSegments(SB),NOSPLIT,$0 +TEXT ·twiddleSegments(SB),NOSPLIT,$0 MOVQ $0x0, AX READ_GS() MOVQ AX, BX @@ -96,3 +127,9 @@ TEXT ·TwiddleSegments(SB),NOSPLIT,$0 READ_FS() SYSCALL RET // never reached + +// func AddrOfTwiddleSegments() uintptr +TEXT ·AddrOfTwiddleSegments(SB), $0-8 + MOVQ $·twiddleSegments(SB), AX + MOVQ AX, ret+0(FP) + RET diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index c5235ca9d..6d0ba8252 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package testutil diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go index 4f7fe993a..07eda0ef3 100644 --- a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 90b1ead56..13a55b784 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ptrace @@ -176,6 +177,7 @@ func patchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) { // // This is safe to call in an afterFork context. // +//go:norace //go:nosplit func enableCpuidFault() { unix.RawSyscall6(unix.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index e4257e3bf..8181db659 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 4f0260432..129ca52e2 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package ptrace @@ -120,6 +121,17 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro return nil, err } + return forkStub(flags, instrs) +} + +// In the child, this function must not acquire any locks, because they might +// have been locked at the time of the fork. This means no rescheduling, no +// malloc calls, and no new stack segments. 
For the same reason compiler does +// not race instrument it. +// +// +//go:norace +func forkStub(flags uintptr, instrs []linux.BPFInstruction) (*thread, error) { // Declare all variables up front in order to ensure that there's no // need for allocations between beforeFork & afterFork. var ( @@ -181,7 +193,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // Set an aggressive BPF filter for the stub and all it's children. See // the description of the BPF program built above. - if errno := seccomp.SetFilter(instrs); errno != 0 { + if errno := seccomp.SetFilterInChild(instrs); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 9c342c59b..f1e84059d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux && (amd64 || arm64) // +build linux // +build amd64 arm64 @@ -26,6 +27,7 @@ import ( // unmaskAllSignals unmasks all signals on the current thread. // +//go:norace //go:nosplit func unmaskAllSignals() unix.Errno { var set linux.SignalSet diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 38b7b1a5e..304722200 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.12 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. 
Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package ptrace diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 2029e7cf4..b2fc84181 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/abi/linux", "//pkg/bits", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal", "//pkg/marshal/primitive", @@ -25,7 +26,6 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 235b9c306..00a5e729a 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) const maxInt = int(^uint(0) >> 1) @@ -70,7 +70,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { file := t.GetFile(fd) if file == nil { files.Release(t) - return nil, syserror.EBADF + return nil, linuxerr.EBADF } files = append(files, file) } @@ -169,7 +169,7 @@ func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SC return nil, err } if kernel.ThreadID(cred.PID) != t.ThreadGroup().ID() && !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.PIDNamespace().UserNamespace()) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } return &scmCredentials{t, kuid, kgid}, nil } @@ -473,17 +473,17 @@ func Parse(t 
*kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) for i := 0; i < len(buf); { if i+linux.SizeOfControlMessageHeader > len(buf) { - return cmsgs, syserror.EINVAL + return cmsgs, linuxerr.EINVAL } var h linux.ControlMessageHeader h.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageHeader]) if h.Length < uint64(linux.SizeOfControlMessageHeader) { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } if h.Length > uint64(len(buf)-i) { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } i += linux.SizeOfControlMessageHeader @@ -497,7 +497,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) numRights := rightsSize / linux.SizeOfControlMessageRight if len(fds)+numRights > linux.SCM_MAX_FD { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { @@ -508,7 +508,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } var creds linux.ControlMessageCredentials @@ -522,7 +522,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.SO_TIMESTAMP: if length < linux.SizeOfTimeval { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } var ts linux.Timeval ts.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval]) @@ -532,13 +532,13 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) default: // Unknown message type. 
- return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IP: switch h.Type { case linux.IP_TOS: if length < linux.SizeOfControlMessageTOS { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTOS = true var tos primitive.Uint8 @@ -548,7 +548,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IP_PKTINFO: if length < linux.SizeOfControlMessageIPPacketInfo { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasIPPacketInfo = true @@ -561,7 +561,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IP_RECVORIGDSTADDR: var addr linux.SockAddrInet if length < addr.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()]) cmsgs.IP.OriginalDstAddress = &addr @@ -570,7 +570,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IP_RECVERR: var errCmsg linux.SockErrCMsgIPv4 if length < errCmsg.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()]) @@ -578,13 +578,13 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) i += bits.AlignUp(length, width) default: - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IPV6: switch h.Type { case linux.IPV6_TCLASS: if length < linux.SizeOfControlMessageTClass { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTClass = true var tclass primitive.Uint32 @@ -595,7 +595,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf 
[]byte, width uint) case linux.IPV6_RECVORIGDSTADDR: var addr linux.SockAddrInet6 if length < addr.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()]) cmsgs.IP.OriginalDstAddress = &addr @@ -604,7 +604,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IPV6_RECVERR: var errCmsg linux.SockErrCMsgIPv6 if length < errCmsg.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()]) @@ -612,10 +612,10 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) i += bits.AlignUp(length, width) default: - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } default: - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } } diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go index 37d02948f..0a989cbeb 100644 --- a/pkg/sentry/socket/control/control_vfs2.go +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -17,10 +17,10 @@ package control import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // SCMRightsVFS2 represents a SCM_RIGHTS socket control message. 
@@ -51,7 +51,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { file := t.GetFileVFS2(fd) if file == nil { files.Release(t) - return nil, syserror.EBADF + return nil, linuxerr.EBADF } files = append(files, file) } diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index d3be2d825..587f479eb 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -20,12 +20,12 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -67,9 +67,25 @@ func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArgument AddressSpaceActive: true, }) return 0, err - + case unix.SIOCGIFFLAGS, unix.SIOCGIFCONF: + cc := &usermem.IOCopyContext{ + Ctx: ctx, + IO: io, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + var ifr linux.IFReq + if _, err := ifr.CopyIn(cc, args[2].Pointer()); err != nil { + return 0, err + } + if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&ifr))); errno != 0 { + return 0, translateIOSyscallError(errno) + } + _, err := ifr.CopyOut(cc, args[2].Pointer()) + return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 5d55cc64d..cd6e34ecc 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -18,6 +18,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" 
"gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -105,7 +105,7 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PRead implements vfs.FileDescriptionImpl.PRead. func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. @@ -113,7 +113,7 @@ func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) @@ -124,7 +124,7 @@ func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // PWrite implements vfs.FileDescriptionImpl. func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. @@ -132,7 +132,7 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go index 8a783712e..2397e04e7 100644 --- a/pkg/sentry/socket/hostinet/sockopt_impl.go +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package hostinet import ( diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index cbb1e905d..7a4e78a5f 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -29,11 +29,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" @@ -320,12 +320,12 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. func (s *Stack) AddInterfaceAddr(int32, inet.InterfaceAddr) error { - return syserror.EACCES + return linuxerr.EACCES } // RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. func (s *Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error { - return syserror.EACCES + return linuxerr.EACCES } // SupportsIPv6 implements inet.Stack.SupportsIPv6. @@ -340,7 +340,7 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. 
func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { - return syserror.EACCES + return linuxerr.EACCES } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. @@ -350,7 +350,7 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { - return syserror.EACCES + return linuxerr.EACCES } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. @@ -360,7 +360,7 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. func (s *Stack) SetTCPSACKEnabled(bool) error { - return syserror.EACCES + return linuxerr.EACCES } // TCPRecovery implements inet.Stack.TCPRecovery. @@ -370,7 +370,7 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { // SetTCPRecovery implements inet.Stack.SetTCPRecovery. func (s *Stack) SetTCPRecovery(inet.TCPLossRecovery) error { - return syserror.EACCES + return linuxerr.EACCES } // getLine reads one line from proc file, with specified prefix. @@ -483,7 +483,7 @@ func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} // SetForwarding implements inet.Stack.SetForwarding. func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { - return syserror.EACCES + return linuxerr.EACCES } // PortRange implements inet.Stack.PortRange. @@ -494,5 +494,5 @@ func (*Stack) PortRange() (uint16, uint16) { // SetPortRange implements inet.Stack.SetPortRange. 
func (*Stack) SetPortRange(start uint16, end uint16) error { - return syserror.EACCES + return linuxerr.EACCES } diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go index d8bd86292..af31cbc5b 100644 --- a/pkg/sentry/socket/netfilter/ipv4.go +++ b/pkg/sentry/socket/netfilter/ipv4.go @@ -81,6 +81,8 @@ func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTG copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask) copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + copy(entry.Entry.IP.InputInterface[:], rule.Filter.InputInterface) + copy(entry.Entry.IP.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP } diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go index c68230847..6cefe0b9c 100644 --- a/pkg/sentry/socket/netfilter/ipv6.go +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -81,6 +81,8 @@ func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6T copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask) copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + copy(entry.Entry.IPv6.InputInterface[:], rule.Filter.InputInterface) + copy(entry.Entry.IPv6.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index d53f23a9a..5c3ae26f8 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -214,7 +214,7 @@ func (s *socketOpsCommon) ConnectedPasscred() bool { // Ioctl implements fs.FileOperations.Ioctl. 
func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // ExtractSockAddr extracts the SockAddrNetlink from b. diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index 842036764..4d3cdea62 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -17,6 +17,7 @@ package netlink import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -108,12 +108,12 @@ func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { // Ioctl implements vfs.FileDescriptionImpl. func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // PRead implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. @@ -121,7 +121,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -134,7 +134,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // PWrite implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. @@ -142,7 +142,7 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 11f75628c..9b844b0c0 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -49,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" @@ -273,6 +274,7 @@ var Metrics = tcpip.Stats{ Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), + SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), }, UDP: tcpip.UDPStats{ PacketsReceived: 
mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), @@ -1682,6 +1684,26 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int return nil } +func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { + // packetOverheadFactor is used to multiply the value provided by the user on + // a setsockopt(2) for setting the send/receive buffer sizes sockets. + const packetOverheadFactor = 2 + + if !ignoreMax && newSz > max { + newSz = max + } + + if newSz < math.MaxInt32/packetOverheadFactor { + newSz *= packetOverheadFactor + if newSz < min { + newSz = min + } + } else { + newSz = math.MaxInt32 + } + return newSz +} + // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { @@ -1691,7 +1713,9 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := hostarch.ByteOrder.Uint32(optVal) - ep.SocketOptions().SetSendBufferSize(int64(v), true /* notify */) + min, max := ep.SocketOptions().SendBufferLimits() + clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) + ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) return nil case linux.SO_RCVBUF: @@ -1700,7 +1724,24 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := hostarch.ByteOrder.Uint32(optVal) - ep.SocketOptions().SetReceiveBufferSize(int64(v), true /* notify */) + min, max := ep.SocketOptions().ReceiveBufferLimits() + clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) + ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) + return nil + + case linux.SO_RCVBUFFORCE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) { + return syserr.ErrNotPermitted + } + + v 
:= hostarch.ByteOrder.Uint32(optVal) + min, max := ep.SocketOptions().ReceiveBufferLimits() + clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) + ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) return nil case linux.SO_REUSEADDR: @@ -3016,7 +3057,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc unimpl.EmitUnimplementedEvent(ctx) } - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // interfaceIoctl implements interface requests. diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 30f3ad153..edc160b1b 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -17,6 +17,7 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -104,7 +105,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -125,7 +126,7 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } r := src.Reader(ctx) diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index eef5e6519..0fd0ad32c 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -18,10 +18,10 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -110,24 +110,24 @@ func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) { switch addr.Family { case linux.AF_INET: if len(addr.Addr) != header.IPv4AddressSize { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv4AddressSize*8 { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } protocol = ipv4.ProtocolNumber address = tcpip.Address(addr.Addr) case linux.AF_INET6: if len(addr.Addr) != header.IPv6AddressSize { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv6AddressSize*8 { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } protocol = ipv6.ProtocolNumber address = tcpip.Address(addr.Addr) default: - return protocolAddress, syserror.ENOTSUP + return protocolAddress, linuxerr.ENOTSUP } protocolAddress = tcpip.ProtocolAddress{ diff --git a/pkg/sentry/socket/netstack/tun.go b/pkg/sentry/socket/netstack/tun.go index c7ed52702..e67fe9700 100644 --- a/pkg/sentry/socket/netstack/tun.go +++ b/pkg/sentry/socket/netstack/tun.go @@ -16,7 +16,7 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" 
"gvisor.dev/gvisor/pkg/tcpip/link/tun" ) @@ -41,7 +41,7 @@ func LinuxToTUNFlags(flags uint16) (tun.Flags, error) { // when there is no sk_filter. See __tun_chr_ioctl() in // net/drivers/tun.c. if flags&^uint16(linux.IFF_TUN|linux.IFF_TAP|linux.IFF_NO_PI|linux.IFF_ONE_QUEUE) != 0 { - return tun.Flags{}, syserror.EINVAL + return tun.Flags{}, linuxerr.EINVAL } return tun.Flags{ TUN: flags&linux.IFF_TUN != 0, diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index f5da3c509..658e90bb9 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -509,7 +509,6 @@ func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { linux.SO_ATTACH_REUSEPORT_EBPF, linux.SO_CNX_ADVICE, linux.SO_DETACH_FILTER, - linux.SO_RCVBUFFORCE, linux.SO_SNDBUFFORCE: t.Kernel().EmitUnimplementedEvent(t) diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 08a00a12f..8c5075a1c 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -30,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -254,7 +253,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PRead implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. @@ -262,7 +261,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -283,7 +282,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // PWrite implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. @@ -291,7 +290,7 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } t := kernel.TaskFromContext(ctx) diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index cefd20b9b..c42297c80 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package state import ( diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index 6ce1bb592..317c3c31c 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package strace diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go index ce5594301..65f27c810 100644 --- a/pkg/sentry/strace/linux64_arm64.go +++ b/pkg/sentry/strace/linux64_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package strace diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index af7088847..757ff2a40 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -133,6 +133,9 @@ func dump(t *kernel.Task, addr hostarch.Addr, size uint, maximumBlobSize uint) s } func path(t *kernel.Task, addr hostarch.Addr) string { + if addr == 0 { + return "<null>" + } path, err := t.CopyInString(addr, linux.PATH_MAX) if err != nil { return fmt.Sprintf("%#x (error decoding path: %s)", addr, err) @@ -816,10 +819,10 @@ func convertToSyscallFlag(sinks SinkType) uint32 { return ret } -// Enable enables the syscalls in whitelist in all syscall tables. +// Enable enables the syscalls in allowlist in all syscall tables. // // Preconditions: Initialize has been called. -func Enable(whitelist []string, sinks SinkType) error { +func Enable(allowlist []string, sinks SinkType) error { flags := convertToSyscallFlag(sinks) for _, table := range kernel.SyscallTables() { // Is this known? @@ -829,7 +832,7 @@ func Enable(whitelist []string, sinks SinkType) error { } // Convert to a set of system calls numbers. - wl, err := sys.ConvertToSysnoMap(whitelist) + wl, err := sys.ConvertToSysnoMap(allowlist) if err != nil { return err } diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 02debfc7e..a69ed0746 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -46,21 +45,21 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syserror.EBADF + return linuxerr.EBADF } defer epollfile.DecRef(t) // Get the target file id. 
file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syserror.EBADF + return linuxerr.EBADF } // Try to add the entry. @@ -72,21 +71,21 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syserror.EBADF + return linuxerr.EBADF } defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syserror.EBADF + return linuxerr.EBADF } // Try to update the entry. @@ -98,21 +97,21 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syserror.EBADF + return linuxerr.EBADF } defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syserror.EBADF + return linuxerr.EBADF } // Try to remove the entry. @@ -124,14 +123,14 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeoutInNanos int64) ([]linux // Get epoll from the file descriptor. epollfile := t.GetFile(fd) if epollfile == nil { - return nil, syserror.EBADF + return nil, linuxerr.EBADF } defer epollfile.DecRef(t) // Extract the epollPoll operations. 
e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return nil, syserror.EBADF + return nil, linuxerr.EBADF } // Try to read events and return right away if we got them or if the diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index a2f612f45..ccccce6a9 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -25,6 +25,7 @@ go_library( "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", + "sys_msgqueue.go", "sys_pipe.go", "sys_poll.go", "sys_prctl.go", @@ -84,6 +85,7 @@ go_library( "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/eventfd", "//pkg/sentry/kernel/fasync", + "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 165922332..76389fbe3 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -113,7 +113,7 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return true, syserror.EFBIG + return true, linuxerr.EFBIG case linuxerr.Equals(linuxerr.EINTR, translatedErr): // The syscall was interrupted. 
Return nil if it completed // partially, otherwise return the error code that the syscall diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 090c5ffcb..6f44d767b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -18,6 +18,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -120,10 +121,10 @@ var AMD64 = &kernel.SyscallTable{ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 66: syscalls.Supported("semctl", Semctl), 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 68: syscalls.Supported("msgget", Msgget), + 69: syscalls.ErrorWithEvent("msgsnd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.PartiallySupported("msgctl", Msgctl, "Only supports IPC_RMID option.", []string{"gvisor.dev/issue/135"}), 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), @@ -187,7 +188,7 @@ var AMD64 = &kernel.SyscallTable{ 132: 
syscalls.Supported("utime", Utime), 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 135: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), @@ -200,15 +201,15 @@ var AMD64 = &kernel.SyscallTable{ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", linuxerr.EPERM, "", nil), 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. 
The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 154: syscalls.Error("modify_ldt", linuxerr.EPERM, "", nil), + 155: syscalls.Error("pivot_root", linuxerr.EPERM, "", nil), + 156: syscalls.Error("sysctl", linuxerr.EPERM, "Deprecated. Use /proc/sys instead.", nil), 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), @@ -300,9 +301,9 @@ var AMD64 = &kernel.SyscallTable{ 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 248: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", linuxerr.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", linuxerr.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. 
Hard links are treated as different watch targets in gofer fs.", nil), @@ -350,17 +351,17 @@ var AMD64 = &kernel.SyscallTable{ 295: syscalls.Supported("preadv", Preadv), 296: syscalls.Supported("pwritev", Pwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 298: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 303: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 308: syscalls.ErrorWithEvent("setns", linuxerr.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 309: syscalls.Supported("getcpu", Getcpu), 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 311: 
syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), @@ -470,7 +471,7 @@ var ARM64 = &kernel.SyscallTable{ 38: syscalls.Supported("renameat", Renameat), 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 41: syscalls.Error("pivot_root", linuxerr.EPERM, "", nil), 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), @@ -521,7 +522,7 @@ var ARM64 = &kernel.SyscallTable{ 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), 90: syscalls.Supported("capget", Capget), 91: syscalls.Supported("capset", Capset), - 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 92: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), 93: syscalls.Supported("exit", Exit), 94: syscalls.Supported("exit_group", ExitGroup), 95: syscalls.Supported("waitid", Waitid), @@ -556,7 +557,7 @@ var ARM64 = &kernel.SyscallTable{ 124: syscalls.Supported("sched_yield", SchedYield), 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 127: syscalls.ErrorWithEvent("sched_rr_get_interval", linuxerr.EPERM, "", nil), 128: syscalls.Supported("restart_syscall", RestartSyscall), 129: syscalls.Supported("kill", Kill), 130: syscalls.Supported("tkill", 
Tkill), @@ -615,10 +616,10 @@ var ARM64 = &kernel.SyscallTable{ 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 186: syscalls.Supported("msgget", Msgget), + 187: syscalls.PartiallySupported("msgctl", Msgctl, "Only supports IPC_RMID option.", []string{"gvisor.dev/issue/135"}), + 188: syscalls.ErrorWithEvent("msgrcv", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 189: syscalls.ErrorWithEvent("msgsnd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 190: syscalls.Supported("semget", Semget), 191: syscalls.Supported("semctl", Semctl), 192: syscalls.Supported("semtimedop", Semtimedop), @@ -646,9 +647,9 @@ var ARM64 = &kernel.SyscallTable{ 214: syscalls.Supported("brk", Brk), 215: syscalls.Supported("munmap", Munmap), 216: syscalls.Supported("mremap", Mremap), - 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 217: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), + 218: syscalls.Error("request_key", linuxerr.EACCES, 
"Not available to user.", nil), + 219: syscalls.Error("keyctl", linuxerr.EACCES, "Not available to user.", nil), 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), 221: syscalls.Supported("execve", Execve), 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), @@ -670,18 +671,18 @@ var ARM64 = &kernel.SyscallTable{ 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 241: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 242: syscalls.Supported("accept4", Accept4), 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), 260: syscalls.Supported("wait4", Wait4), 261: syscalls.Supported("prlimit64", Prlimit64), 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 264: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 265: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 267: syscalls.PartiallySupported("syncfs", 
Syncfs, "Depends on backing file system.", nil), - 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 268: syscalls.ErrorWithEvent("setns", linuxerr.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index e8c2d8f9e..9dea78085 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -29,7 +30,7 @@ import ( // syscalls are moved into this package, then they can be unexported. 
func CopyInSigSet(t *kernel.Task, sigSetAddr hostarch.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } b := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index c338a4cc9..4ce3430e2 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -43,7 +43,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, err } if idIn != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) @@ -67,7 +67,7 @@ func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ctx := t.MemoryManager().DestroyAIOContext(t, id) if ctx == nil { // Does not exist. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Drain completed requests amd wait for pending requests until there are no @@ -98,12 +98,12 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Sanity check arguments. if minEvents < 0 || minEvents > events { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Setup the timeout. @@ -115,7 +115,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } if !d.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration()) haveDeadline = true @@ -172,7 +172,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl done := ctx.WaitChannel() if done == nil { // Context has been destroyed. 
- return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { return nil, err @@ -185,7 +185,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, @@ -207,7 +207,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) default: // Not a supported command. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } } @@ -270,7 +270,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host file := t.GetFile(cb.FD) if file == nil { // File not found. - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) @@ -280,14 +280,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. - return syserror.EBADF + return linuxerr.EBADF } defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { // Not an event FD. - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -300,14 +300,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } } // Prepare the request. 
ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } if err := ctx.Prepare(); err != nil { return err @@ -336,7 +336,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc addr := args[2].Pointer() if nrEvents < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index d3b85e11b..1e714503c 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -16,22 +16,22 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) { if tid < 0 { - err = syserror.EINVAL + err = linuxerr.EINVAL return } if tid > 0 { t = t.PIDNamespace().TaskWithID(tid) } if t == nil { - err = syserror.ESRCH + err = linuxerr.ESRCH return } creds := t.Credentials() @@ -97,7 +97,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } if dataAddr != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil } @@ -115,7 +115,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } var data linux.CapUserData if _, err := data.CopyIn(t, dataAddr); err != nil { @@ -128,7 +128,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case 
linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } var data [2]linux.CapUserData if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { @@ -144,6 +144,6 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go index dd43cf18d..2b2dbd9f9 100644 --- a/pkg/sentry/syscalls/linux/sys_clone_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go index cf68a8949..877c86e6a 100644 --- a/pkg/sentry/syscalls/linux/sys_clone_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 69cbc98d0..daa151bb4 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -31,7 +32,7 @@ import ( func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags & ^linux.EPOLL_CLOEXEC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } closeOnExec := flags&linux.EPOLL_CLOEXEC != 0 @@ -48,7 +49,7 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S size := args[0].Int() if size <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } fd, err := syscalls.CreateEpoll(t, false) @@ -101,7 +102,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 3b4f879e4..7ba9a755e 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" - "gvisor.dev/gvisor/pkg/syserror" ) // Eventfd2 implements linux syscall eventfd2(2). 
@@ -30,7 +30,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 6109a2d8c..3528d325f 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -80,12 +80,12 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro // Need to extract the given FD. f = t.GetFile(dirFD) if f == nil { - return syserror.EBADF + return linuxerr.EBADF } rel = f.Dirent if !fs.IsDir(rel.Inode.StableAttr) { f.DecRef(t) - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -153,7 +153,7 @@ func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uin } if fs.IsSymlink(d.Inode.StableAttr) && !resolve { - return syserror.ELOOP + return linuxerr.ELOOP } fileFlags := linuxToFlags(flags) @@ -167,11 +167,11 @@ func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uin } else { // If O_DIRECTORY is set, but the file is not a directory, then fail. if fileFlags.Directory { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // If it's a directory, then make sure. if dirPath { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -220,7 +220,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Do we have the appropriate permissions on the parent? 
@@ -261,7 +261,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod // Instead of emulating this seemingly useless behaviour, we'll // indicate that the filesystem doesn't support the creation of // sockets. - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP case linux.ModeCharacterDevice: fallthrough @@ -271,12 +271,12 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod // // When we start supporting block and character devices, we'll // need to check for CAP_MKNOD here. - return syserror.EPERM + return linuxerr.EPERM default: // "EINVAL - mode requested creation of something other than a // regular file, device special file, FIFO or socket." - mknod(2) - return syserror.EINVAL + return linuxerr.EINVAL } }) } @@ -326,7 +326,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode ) for { if !fs.IsDir(parent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Start by looking up the dirent at 'name'. @@ -340,7 +340,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode // O_EXCL flag was passed, then we can immediately // return EEXIST. if flags&linux.O_EXCL != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } // If we have a non-symlink, then we can proceed. @@ -351,7 +351,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode // If O_NOFOLLOW was passed, then don't try to resolve // anything. if flags&linux.O_NOFOLLOW != 0 { - return syserror.ELOOP + return linuxerr.ELOOP } // Try to resolve the symlink directly to a Dirent. @@ -528,7 +528,7 @@ func accessAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode uint) error // Sanity check the mode. 
if mode&^(rOK|wOK|xOK) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -596,7 +596,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -685,7 +685,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Note this is >= because we need a terminator. if uint(len(s)) >= size { - return 0, nil, syserror.ERANGE + return 0, nil, linuxerr.ERANGE } // Copy out the path name for the node. @@ -704,7 +704,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } path, _, err := copyInPath(t, addr, false /* allowEmpty */) @@ -715,7 +715,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Is it a directory? if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Does it have execute permissions? @@ -740,7 +740,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Is it a directory? if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Does it have execute permissions? @@ -759,13 +759,13 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Is it a directory? 
if !fs.IsDir(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ENOTDIR + return 0, nil, linuxerr.ENOTDIR } // Does it have execute permissions? @@ -790,7 +790,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // (and other reference-holding operations complete). file, _ := t.FDTable().Remove(t, fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -804,13 +804,13 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { - return 0, nil, syserror.EMFILE + return 0, nil, linuxerr.EMFILE } return uintptr(newFD), nil, nil } @@ -825,7 +825,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldfd == newfd { oldFile := t.GetFile(oldfd) if oldFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer oldFile.DecRef(t) @@ -844,12 +844,12 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC flags := args[2].Uint() if oldfd == newfd { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } oldFile := t.GetFile(oldfd) if oldFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer oldFile.DecRef(t) @@ -906,7 +906,7 @@ func fSetOwn(t *kernel.Task, fd int, file *fs.File, who int32) error { if who < 0 { // Check for overflow before flipping the sign. 
if who-1 > who { - return syserror.EINVAL + return linuxerr.EINVAL } pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) a.SetOwnerProcessGroup(t, pg) @@ -924,7 +924,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file, flags := t.FDTable().Get(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -957,7 +957,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Normally pipe and socket types lack lock operations. We diverge and use a heavy // hammer by only allowing locks on files and directories. if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Copy in the lock request. @@ -977,7 +977,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case 2: sw = fs.SeekEnd default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Compute the lock offset. @@ -996,7 +996,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } off = uattr.Size default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Compute the lock range. @@ -1010,12 +1010,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch flock.Type { case linux.F_RDLCK: if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, nil) { - return 0, nil, syserror.EAGAIN + return 0, nil, linuxerr.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. 
@@ -1026,12 +1026,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, nil case linux.F_WRLCK: if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, nil) { - return 0, nil, syserror.EAGAIN + return 0, nil, linuxerr.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. @@ -1044,7 +1044,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng) return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } case linux.F_GETOWN: return uintptr(fGetOwn(t, file)), nil, nil @@ -1067,47 +1067,47 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID)) if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } a.SetOwnerTask(t, task) return 0, nil, nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) if tg == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } a.SetOwnerThreadGroup(t, tg) return 0, nil, nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) if pg == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } a.SetOwnerProcessGroup(t, pg) return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file.Dirent.Inode) return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.Flags().Write { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } err := tmpfs.AddSeals(file.Dirent.Inode, 
args[2].Uint()) return 0, nil, err case linux.F_GETPIPE_SZ: sz, ok := file.FileOperations.(fs.FifoSizer) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } size, err := sz.FifoSize(t, file) return uintptr(size), nil, err case linux.F_SETPIPE_SZ: sz, ok := file.FileOperations.(fs.FifoSizer) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := sz.SetFifoSize(int64(args[2].Int())) return uintptr(n), nil, err @@ -1119,7 +1119,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -1132,18 +1132,18 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Note: offset is allowed to be negative. if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } switch advice { @@ -1154,7 +1154,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.POSIX_FADV_DONTNEED: case linux.POSIX_FADV_NOREUSE: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Sure, whatever. @@ -1173,7 +1173,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Does this directory exist already? 
@@ -1183,7 +1183,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod case err == nil: // The directory existed. defer f.DecRef(t) - return syserror.EEXIST + return linuxerr.EEXIST case linuxerr.Equals(linuxerr.EACCES, err): // Permission denied while walking to the directory. return err @@ -1225,21 +1225,21 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr) error { // Special case: removing the root always returns EBUSY. if path == "/" { - return syserror.EBUSY + return linuxerr.EBUSY } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Linux returns different ernos when the path ends in single // dot vs. double dots. switch name { case ".": - return syserror.EINVAL + return linuxerr.EINVAL case "..": - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } if err := d.MayDelete(t, root, name); err != nil { @@ -1278,7 +1278,7 @@ func symlinkAt(t *kernel.Task, dirFD int32, newAddr hostarch.Addr, oldAddr hosta return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Make sure we have write permissions on the parent directory. @@ -1330,10 +1330,10 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // If we are not the owner, then the file must be regular and have // Read+Write permissions. 
if !fs.IsRegular(target.StableAttr) { - return syserror.EPERM + return linuxerr.EPERM } if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { - return syserror.EPERM + return linuxerr.EPERM } return nil @@ -1358,7 +1358,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 if allowEmpty && oldPath == "" { target := t.GetFile(oldDirFD) if target == nil { - return syserror.EBADF + return linuxerr.EBADF } defer target.DecRef(t) if err := mayLinkAt(t, target.Dirent.Inode); err != nil { @@ -1368,7 +1368,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 // Resolve the target directory. return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { if !fs.IsDir(newParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Make sure we have write permissions on the parent directory. @@ -1389,7 +1389,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 // Next resolve newDirFD and newAddr to the parent dirent and name. return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { if !fs.IsDir(newParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Make sure we have write permissions on the parent directory. @@ -1432,7 +1432,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Sanity check flags. 
if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW @@ -1466,7 +1466,7 @@ func readlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, bufAddr hostarc s, err := d.Inode.Readlink(t) if linuxerr.Equals(linuxerr.ENOLINK, err) { - return syserror.EINVAL + return linuxerr.EINVAL } if err != nil { return err @@ -1520,7 +1520,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr) error { return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if err := d.MayDelete(t, root, name); err != nil { @@ -1558,7 +1558,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc length := args[1].Int64() if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) @@ -1566,7 +1566,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if dirPath { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { @@ -1574,7 +1574,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -1584,7 +1584,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // In contrast to open(O_TRUNC), truncate(2) is only valid for file // types. 
if !fs.IsFile(d.Inode.StableAttr) { - return syserror.EINVAL + return linuxerr.EINVAL } // Reject truncation if the access permissions do not allow truncation. @@ -1611,25 +1611,25 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. if !file.Flags().Write { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // In contrast to open(O_TRUNC), truncate(2) is only valid for file // types. Note that this is different from truncate(2) above, where a // directory returns EISDIR. if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { @@ -1637,7 +1637,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil { @@ -1683,7 +1683,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { kuid := c.UserNamespace.MapToKUID(uid) // Valid UID must be supplied if UID is to be changed. if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // "Only a privileged process (CAP_CHOWN) may change the owner @@ -1693,7 +1693,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { // explicitly not changing its UID. isNoop := uattr.Owner.UID == kuid if !(hasCap || (isOwner && isNoop)) { - return syserror.EPERM + return linuxerr.EPERM } // The setuid and setgid bits are cleared during a chown. 
@@ -1707,7 +1707,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { kgid := c.UserNamespace.MapToKGID(gid) // Valid GID must be supplied if GID is to be changed. if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // "The owner of a file may change the group of the file to any @@ -1716,7 +1716,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { isNoop := uattr.Owner.GID == kgid isMemberGroup := c.InGroup(kgid) if !(hasCap || (isOwner && (isNoop || isMemberGroup))) { - return syserror.EPERM + return linuxerr.EPERM } // The setuid and setgid bits are cleared during a chown. @@ -1738,7 +1738,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { if clearPrivilege && uattr.Perms.HasSetUIDOrGID() && !fs.IsDir(d.Inode.StableAttr) { uattr.Perms.DropSetUIDAndMaybeGID() if !d.Inode.SetPermissions(t, d, uattr.Perms) { - return syserror.EPERM + return linuxerr.EPERM } } @@ -1755,7 +1755,7 @@ func chownAt(t *kernel.Task, fd int32, addr hostarch.Addr, resolve, allowEmpty b // Annoying. What's wrong with fchown? 
file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) @@ -1793,7 +1793,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -1809,7 +1809,7 @@ func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := args[4].Int() if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid) @@ -1818,12 +1818,12 @@ func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { // Must own file to change mode. if !d.Inode.CheckOwnership(t) { - return syserror.EPERM + return linuxerr.EPERM } p := fs.FilePermsFromMode(mode) if !d.Inode.SetPermissions(t, d, p) { - return syserror.EPERM + return linuxerr.EPERM } // File attribute changed, generate notification. @@ -1858,7 +1858,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -1889,7 +1889,7 @@ func utimes(t *kernel.Task, dirFD int32, addr hostarch.Addr, ts fs.TimeSpec, res if !d.Inode.CheckOwnership(t) { // Trying to set a specific time? Must be owner. if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) { - return syserror.EPERM + return linuxerr.EPERM } // Trying to set to current system time? Must have write access. @@ -1914,11 +1914,11 @@ func utimes(t *kernel.Task, dirFD int32, addr hostarch.Addr, ts fs.TimeSpec, res if addr == 0 && dirFD != linux.AT_FDCWD { if !resolve { // Linux returns EINVAL in this case. 
See utimes.c. - return syserror.EINVAL + return linuxerr.EINVAL } f := t.GetFile(dirFD) if f == nil { - return syserror.EBADF + return linuxerr.EBADF } defer f.DecRef(t) @@ -1997,7 +1997,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // If both are UTIME_OMIT, this is a noop. @@ -2032,7 +2032,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if times[0].Usec >= 1e6 || times[0].Usec < 0 || times[1].Usec >= 1e6 || times[1].Usec < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ts = fs.TimeSpec{ @@ -2059,26 +2059,26 @@ func renameAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD in return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error { if !fs.IsDir(oldParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Rename rejects paths that end in ".", "..", or empty (i.e. // the root) with EBUSY. switch oldName { case "", ".", "..": - return syserror.EBUSY + return linuxerr.EBUSY } return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { if !fs.IsDir(newParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Rename rejects paths that end in ".", "..", or empty // (i.e. the root) with EBUSY. 
switch newName { case "", ".", "..": - return syserror.EBUSY + return linuxerr.EBUSY } return fs.Rename(t, root, oldParent, oldName, newParent, newName) @@ -2113,39 +2113,39 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if offset < 0 || length <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if mode != 0 { t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOTSUP + return 0, nil, linuxerr.ENOTSUP } if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if fs.IsPipe(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if fs.IsDir(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.EISDIR } if !fs.IsRegular(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ENODEV + return 0, nil, linuxerr.ENODEV } size := offset + length if size < 0 { - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil { @@ -2166,7 +2166,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -2184,7 +2184,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. 
if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, nil) { - return 0, nil, syserror.EWOULDBLOCK + return 0, nil, linuxerr.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. @@ -2196,7 +2196,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, nil) { - return 0, nil, syserror.EWOULDBLOCK + return 0, nil, linuxerr.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. @@ -2208,7 +2208,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng) default: // flock(2): EINVAL operation is invalid. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil @@ -2227,7 +2227,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if flags&^memfdAllFlags != 0 { // Unknown bits in flags. 
- return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 @@ -2238,7 +2238,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } if len(name) > memfdMaxNameLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } name = memfdPrefix + name diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index eeea1613b..717cec04d 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -18,6 +18,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -159,7 +160,7 @@ func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { return err } if !locked { - return syserror.EWOULDBLOCK + return linuxerr.EWOULDBLOCK } return nil } @@ -210,7 +211,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // WAIT_BITSET uses an absolute timeout which is either // CLOCK_MONOTONIC or CLOCK_REALTIME. 
if mask == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) return n, nil, err @@ -224,7 +225,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FUTEX_WAKE_BITSET: if mask == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if val <= 0 { // The Linux kernel wakes one waiter even if val is @@ -295,7 +296,7 @@ func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel length := args[1].SizeT() if length != uint(linux.SizeOfRobustListHead) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.SetRobustList(head) return 0, nil, nil @@ -310,13 +311,13 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel sizeAddr := args[2].Pointer() if tid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ot := t if tid != 0 { if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index bbba71d8f..917717e31 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -19,6 +19,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -38,7 +39,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc minSize := int(smallestDirent(t.Arch())) if size < minSize { // size is smaller than smallest possible dirent. 
- return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := getdents(t, fd, addr, size, (*dirent).Serialize) @@ -54,7 +55,7 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy minSize := int(smallestDirent64(t.Arch())) if size < minSize { // size is smaller than smallest possible dirent. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := getdents(t, fd, addr, size, (*dirent).Serialize64) @@ -66,7 +67,7 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy func getdents(t *kernel.Task, fd int32, addr hostarch.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { dir := t.GetFile(fd) if dir == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer dir.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index a29d307e5..50fcadb58 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -15,10 +15,10 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -142,7 +142,7 @@ func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := int(args[0].Int()) if size < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } kgids := t.Credentials().ExtraKGIDs // "If size is zero, list is not modified, but the total number of @@ -151,7 +151,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return uintptr(len(kgids)), nil, nil } if size < len(kgids) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } gids := make([]auth.GID, len(kgids)) for i, kgid := range kgids { 
@@ -167,7 +167,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := args[0].Int() if size < 0 || size > maxNGroups { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if size == 0 { return 0, nil, t.SetExtraGIDs(nil) diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index cf47bb9dd..b7ad1922e 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -30,7 +30,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := int(args[0].Int()) if flags&^allFlags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } dirent := fs.NewDirent(t, anon.NewInode(t), "inotify") @@ -65,14 +65,14 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { file := t.GetFile(fd) if file == nil { // Invalid fd. - return nil, nil, syserror.EBADF + return nil, nil, linuxerr.EBADF } ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. file.DecRef(t) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } return ino, file, nil @@ -91,7 +91,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." 
// -- inotify_add_watch(2) if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ino, file, err := fdToInotify(t, fd) @@ -108,7 +108,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error { // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Copy out to the return frame. diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 0046347cb..bf71a9af3 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -15,6 +15,7 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -31,7 +32,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -44,7 +45,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case 2: sw = fs.SeekEnd default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } offset, serr := file.Seek(t, sw, offset) diff --git a/pkg/sentry/syscalls/linux/sys_membarrier.go b/pkg/sentry/syscalls/linux/sys_membarrier.go index 63ee5d435..6ceedc086 100644 --- a/pkg/sentry/syscalls/linux/sys_membarrier.go +++ b/pkg/sentry/syscalls/linux/sys_membarrier.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // 
Membarrier implements syscall membarrier(2). @@ -29,7 +29,7 @@ func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy switch cmd { case linux.MEMBARRIER_CMD_QUERY: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var supportedCommands uintptr if t.Kernel().Platform.HaveGlobalMemoryBarrier() { @@ -46,58 +46,58 @@ func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return supportedCommands, nil, nil case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier() case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // no-op return 0, nil, nil case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierPrivate() return 0, nil, nil case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.MemoryManager().IsMembarrierRSeqEnabled() { - return 0, nil, syserror.EPERM + return 0, nil, 
linuxerr.EPERM } // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have // the ability to preempt specific CPUs. return 0, nil, t.Kernel().Platform.PreemptAllCPUs() case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierRSeq() return 0, nil, nil default: // Probably a command we don't implement. t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go index 6d27f4292..6e7bcb868 100644 --- a/pkg/sentry/syscalls/linux/sys_mempolicy.go +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -18,10 +18,10 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,7 +43,7 @@ func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, // maxnode-1, not maxnode, as the number of bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if bits == 0 { return 0, nil @@ -58,12 +58,12 @@ func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, // Check that only allowed bits in the first unsigned long in the nodemask // are set. if val&^allowedNodemask != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that all remaining bits in the nodemask are 0. 
for i := 8; i < len(buf); i++ { if buf[i] != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } return val, nil @@ -74,7 +74,7 @@ func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uin // bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 - return syserror.EINVAL + return linuxerr.EINVAL } if bits == 0 { return nil @@ -89,7 +89,7 @@ func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uin if bits > 64 { remAddr, ok := addr.AddLength(8) if !ok { - return syserror.EFAULT + return linuxerr.EFAULT } remUint64 := (bits - 1) / 64 if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ @@ -110,7 +110,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := args[4].Uint() if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } nodeFlag := flags&linux.MPOL_F_NODE != 0 addrFlag := flags&linux.MPOL_F_ADDR != 0 @@ -119,7 +119,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // "EINVAL: The value specified by maxnode is less than the number of node // IDs supported by the system." - get_mempolicy(2) if nodemask != 0 && maxnode < maxNodes { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is @@ -130,7 +130,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either // MPOL_F_ADDR or MPOL_F_NODE." if nodeFlag || addrFlag { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { return 0, nil, err @@ -184,7 +184,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
// mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will // just (usually) fail to find a VMA at address 0 and return EFAULT. if addr != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If flags is specified as 0, then information about the calling thread's @@ -198,7 +198,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. policy, nodemaskVal := t.NumaPolicy() if nodeFlag { if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } policy = linux.MPOL_DEFAULT // maxNodes == 1 } @@ -240,12 +240,12 @@ func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall flags := args[5].Uint() if flags&^linux.MPOL_MF_VALID != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) @@ -264,11 +264,11 @@ func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nod mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS) if flags == linux.MPOL_MODE_FLAGS { // Can't specify both mode flags simultaneously. - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } if mode < 0 || mode >= linux.MPOL_MAX { // Must specify a valid mode. - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } var nodemaskVal uint64 @@ -285,22 +285,22 @@ func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nod // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; // Linux allows a nodemask to be specified, as long as it is empty. 
if nodemaskVal != 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: // These require a non-empty nodemask. if nodemaskVal == 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } case linux.MPOL_PREFERRED: // This permits an empty nodemask, as long as no flags are set. if nodemaskVal == 0 && flags != 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } case linux.MPOL_LOCAL: // This requires an empty nodemask and no flags set ... if nodemaskVal != 0 || flags != 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } // ... and is implemented as MPOL_PREFERRED. mode = linux.MPOL_PREFERRED diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 70da0707d..cee621791 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -18,13 +18,13 @@ import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Brk implements linux syscall brk(2). @@ -51,7 +51,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Require exactly one of MAP_PRIVATE and MAP_SHARED. if private == shared { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } opts := memmap.MMapOpts{ @@ -84,14 +84,14 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Convert the passed FD to a file reference. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. 
if !flags.Read { - return 0, nil, syserror.EACCES + return 0, nil, linuxerr.EACCES } // MAP_SHARED requires that the FD be writable for PROT_WRITE. if shared && !flags.Write { @@ -132,7 +132,7 @@ func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal newAddr := args[4].Pointer() if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } mayMove := flags&linux.MREMAP_MAYMOVE != 0 fixed := flags&linux.MREMAP_FIXED != 0 @@ -147,7 +147,7 @@ func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case !mayMove && fixed: // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be // specified." - mremap(2) - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ @@ -178,7 +178,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // "The Linux implementation requires that the address addr be // page-aligned, and allows length to be zero." - madvise(2) if addr.RoundDown() != addr { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if length == 0 { return 0, nil, nil @@ -186,7 +186,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Not explicitly stated: length need not be page-aligned. lenAddr, ok := hostarch.Addr(length).RoundUp() if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } length = uint64(lenAddr) @@ -214,10 +214,10 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, syserror.ENOSYS case linux.MADV_HWPOISON: // Only privileged processes are allowed to poison pages. - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM default: // If adv is not a valid value tell the caller. 
- return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -228,7 +228,7 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca vec := args[2].Pointer() if addr != addr.RoundDown() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "The length argument need not be a multiple of the page size, but since // residency information is returned for whole pages, length is effectively @@ -265,11 +265,11 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // semantics that are (currently) equivalent to specifying MS_ASYNC." - // msync(2) if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } sync := flags&linux.MS_SYNC != 0 if sync && flags&linux.MS_ASYNC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ Sync: sync, @@ -295,7 +295,7 @@ func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal flags := args[2].Int() if flags&^(linux.MLOCK_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager @@ -318,7 +318,7 @@ func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := args[0].Int() if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 864d2138c..6d26f89b9 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -16,12 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" 
"gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Mount implements Linux syscall mount(2). @@ -67,7 +66,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Must have CAP_SYS_ADMIN in the mount namespace's associated user // namespace. if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | @@ -83,15 +82,15 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // unknown or unsupported flags are passed. Since we don't implement // everything, we fail explicitly on flags that are unimplemented. if flags&(unsupportedOps|unsupportedFlags) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } rsys, ok := fs.FindFilesystem(fsType) if !ok { - return 0, nil, syserror.ENODEV + return 0, nil, linuxerr.ENODEV } if !rsys.AllowUserMount() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } var superFlags fs.MountSourceFlags @@ -107,7 +106,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -130,7 +129,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE if flags&unsupported != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, _, err := copyInPath(t, addr, false /* allowEmpty */) @@ -143,7 +142,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // // Currently, this is always the init task's user namespace. 
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW diff --git a/pkg/sentry/syscalls/linux/sys_msgqueue.go b/pkg/sentry/syscalls/linux/sys_msgqueue.go new file mode 100644 index 000000000..3476e218d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_msgqueue.go @@ -0,0 +1,57 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" +) + +// Msgget implements msgget(2). +func Msgget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := ipc.Key(args[0].Int()) + flag := args[1].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + r := t.IPCNamespace().MsgqueueRegistry() + queue, err := r.FindOrCreate(t, key, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(queue.ID()), nil, nil +} + +// Msgctl implements msgctl(2). 
+func Msgctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := ipc.ID(args[0].Int()) + cmd := args[1].Int() + + creds := auth.CredentialsFromContext(t) + + switch cmd { + case linux.IPC_RMID: + return 0, nil, t.IPCNamespace().MsgqueueRegistry().Remove(id, creds) + default: + return 0, nil, linuxerr.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index d95034347..5925c2263 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -16,13 +16,13 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -30,7 +30,7 @@ import ( // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr hostarch.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize) diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 024632475..a80c84fcd 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -158,7 +158,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // CopyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. 
func CopyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } pfd := make([]linux.PollFD, nfds) @@ -218,7 +218,7 @@ func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialB func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Calculate the size of the fd sets (one bit per fd). @@ -265,7 +265,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // OK. Linux is racy in the same way. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } file.DecRef(t) @@ -486,7 +486,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } timeout = time.Duration(timeval.ToNsecCapped()) } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 30c15af4a..a16b6b4d6 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" ) // Prctl implements linux syscall prctl(2). 
@@ -39,7 +38,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_PDEATHSIG: sig := linux.Signal(args[1].Int()) if sig != 0 && !sig.IsValid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.SetParentDeathSignal(sig) return 0, nil, nil @@ -70,7 +69,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall d = mm.UserDumpable default: // N.B. Userspace may not pass SUID_DUMP_ROOT. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.MemoryManager().SetDumpability(d) return 0, nil, nil @@ -91,7 +90,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else if val == 1 { t.SetKeepCaps(true) } else { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil @@ -119,7 +118,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_MM: if !t.HasCapability(linux.CAP_SYS_RESOURCE) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } switch args[1].Int() { @@ -128,13 +127,13 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Set the underlying executable. @@ -156,12 +155,12 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } case linux.PR_SET_NO_NEW_PRIVS: if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // PR_SET_NO_NEW_PRIVS is assumed to always be set. 
// See kernel.Task.updateCredsForExecLocked. @@ -169,7 +168,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_GET_NO_NEW_PRIVS: if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 1, nil, nil @@ -185,7 +184,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall default: tracer := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if tracer == nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.SetYAMAException(tracer) return 0, nil, nil @@ -194,7 +193,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_SECCOMP: if args[1].Int() != linux.SECCOMP_MODE_FILTER { // Unsupported mode. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) @@ -205,7 +204,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_READ: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var rv uintptr if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { @@ -216,7 +215,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_DROP: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, t.DropBoundingCapability(cp) @@ -241,7 +240,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index ae545f80f..f86e87bc7 
100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -18,14 +18,13 @@ import ( "io" "math" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) const ( @@ -47,7 +46,7 @@ func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Flags are checked for validity but otherwise ignored. See above. if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if length > math.MaxInt32 { @@ -55,7 +54,7 @@ func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } ar, ok := addr.ToRange(uint64(length)) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } // "If the urandom source has been initialized, reads of up to 256 bytes diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 0f9329fe8..b54a3a11f 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -47,19 +47,19 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. 
@@ -83,29 +83,29 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is valid. if int(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Return EINVAL; if the underlying file type does not support readahead, // then Linux will return EINVAL to indicate as much. In the future, we // may extend this function to actually support readahead hints. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Pread64 implements linux syscall pread64(2). @@ -117,29 +117,29 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is reading at an offset supported? if !file.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. 
@@ -163,13 +163,13 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the destination of the read. @@ -194,23 +194,23 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is reading at an offset supported? if !file.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the destination of the read. @@ -243,30 +243,30 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is reading at an offset supported? if offset > -1 && !file.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check flags field. // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is // accepted as a valid flag argument for preadv2. 
if flags&^linux.RWF_VALID != 0 { - return 0, nil, syserror.EOPNOTSUPP + return 0, nil, linuxerr.EOPNOTSUPP } // Read the iovecs that specify the destination of the read. diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index e64246d57..a12e1c915 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -105,7 +106,7 @@ func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) } if _, ok := setableLimits[resource]; !ok { - return limits.Limit{}, syserror.EPERM + return limits.Limit{}, linuxerr.EPERM } // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE @@ -129,7 +130,7 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) @@ -150,7 +151,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. 
- return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) @@ -158,7 +159,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } if _, err := rlim.CopyIn(t, addr); err != nil { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } _, err = prlimit64(t, resource, rlim.toLimit()) return 0, nil, err @@ -170,7 +171,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys resource, ok := limits.FromLinuxResource[int(args[1].Int())] if !ok { // Return err; unknown limit. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } newRlimAddr := args[2].Pointer() oldRlimAddr := args[3].Pointer() @@ -179,18 +180,18 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if newRlimAddr != 0 { var nrl rlimit64 if err := nrl.copyIn(t, newRlimAddr); err != nil { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } newLim = nrl.toLimit() } if tid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ot := t if tid > 0 { if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -207,7 +208,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys cred.RealKGID != tcred.RealKGID || cred.RealKGID != tcred.EffectiveKGID || cred.RealKGID != tcred.SavedKGID { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } } @@ -218,7 +219,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if oldRlimAddr != 0 { if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } } diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go index 90db10ea6..5fe196647 100644 --- a/pkg/sentry/syscalls/linux/sys_rseq.go +++ 
b/pkg/sentry/syscalls/linux/sys_rseq.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -43,6 +44,6 @@ func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return 0, nil, t.ClearRSeq(addr, length, signature) default: // Unknown flag. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index ac5c98a54..a689abcc9 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) func getrusage(t *kernel.Task, which int32) linux.Rusage { @@ -76,7 +76,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys addr := args[1].Pointer() if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ru := getrusage(t, which) diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index bfcf44b6f..59c7a4b22 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -38,13 +38,13 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel pid := args[0].Int() 
param := args[1].Pointer() if param == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } r := SchedParam{schedPriority: onlyPriority} if _, err := r.CopyOut(t, param); err != nil { @@ -58,10 +58,10 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() if pid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } return onlyScheduler, nil, nil } @@ -72,20 +72,20 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke policy := args[1].Int() param := args[2].Pointer() if pid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if policy != onlyScheduler { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } var r SchedParam if _, err := r.CopyIn(t, param); err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if r.schedPriority != onlyPriority { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index e16d6ff3f..b0dc84b8d 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -17,10 +17,10 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + 
"gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. @@ -44,7 +44,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { // We only support SECCOMP_SET_MODE_FILTER at the moment. if mode != linux.SECCOMP_SET_MODE_FILTER { // Unsupported mode. - return syserror.EINVAL + return linuxerr.EINVAL } tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 @@ -52,7 +52,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { // Unsupported flag. - return syserror.EINVAL + return linuxerr.EINVAL } var fprog userSockFprog @@ -66,7 +66,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { compiledFilter, err := bpf.Compile(filter) if err != nil { t.Debugf("Invalid seccomp-bpf filter: %v", err) - return syserror.EINVAL + return linuxerr.EINVAL } return t.AppendSyscallFilter(compiledFilter, tsync) diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index cb320c536..f61cc466c 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -26,14 +26,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) const opsMax = 500 // SEMOPM // Semget handles: semget(key_t key, int nsems, int semflg) func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - key := args[0].Int() + key := ipc.Key(args[0].Int()) nsems := args[1].Int() flag := args[2].Int() @@ -47,7 +47,7 @@ func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, 
*kernel.Syscal if err != nil { return 0, nil, err } - return uintptr(set.ID), nil, nil + return uintptr(set.ID()), nil, nil } // Semtimedop handles: semop(int semid, struct sembuf *sops, size_t nsops, const struct timespec *timeout) @@ -57,15 +57,15 @@ func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return Semop(t, args) } - id := args[0].Int() + id := ipc.ID(args[0].Int()) sembufAddr := args[1].Pointer() nsops := args[2].SizeT() timespecAddr := args[3].Pointer() if nsops <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if nsops > opsMax { - return 0, nil, syserror.E2BIG + return 0, nil, linuxerr.E2BIG } ops := make([]linux.Sembuf, nsops) @@ -78,12 +78,12 @@ func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } if timeout.Sec < 0 || timeout.Nsec < 0 || timeout.Nsec >= 1e9 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := semTimedOp(t, id, ops, true, timeout.ToDuration()); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { - return 0, nil, syserror.EAGAIN + return 0, nil, linuxerr.EAGAIN } return 0, nil, err } @@ -92,15 +92,15 @@ func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Semop handles: semop(int semid, struct sembuf *sops, size_t nsops) func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := args[0].Int() + id := ipc.ID(args[0].Int()) sembufAddr := args[1].Pointer() nsops := args[2].SizeT() if nsops <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if nsops > opsMax { - return 0, nil, syserror.E2BIG + return 0, nil, linuxerr.E2BIG } ops := make([]linux.Sembuf, nsops) @@ -110,11 +110,11 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, semTimedOp(t, id, ops, false, time.Second) } -func semTimedOp(t *kernel.Task, id int32, ops []linux.Sembuf, haveTimeout bool, 
timeout time.Duration) error { +func semTimedOp(t *kernel.Task, id ipc.ID, ops []linux.Sembuf, haveTimeout bool, timeout time.Duration) error { set := t.IPCNamespace().SemaphoreRegistry().FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) @@ -132,7 +132,7 @@ func semTimedOp(t *kernel.Task, id int32, ops []linux.Sembuf, haveTimeout bool, // Semctl handles: semctl(int semid, int semnum, int cmd, ...) func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := args[0].Int() + id := ipc.ID(args[0].Int()) num := args[1].Int() cmd := args[2].Int() @@ -140,7 +140,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.SETVAL: val := args[3].Int() if val > math.MaxInt16 { - return 0, nil, syserror.ERANGE + return 0, nil, linuxerr.ERANGE } return 0, nil, setVal(t, id, num, int16(val)) @@ -211,7 +211,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.SEM_STAT: arg := args[3].Pointer() // id is an index in SEM_STAT. - semid, ds, err := semStat(t, id) + semid, ds, err := semStat(t, int32(id)) if err != nil { return 0, nil, err } @@ -223,7 +223,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.SEM_STAT_ANY: arg := args[3].Pointer() // id is an index in SEM_STAT. 
- semid, ds, err := semStatAny(t, id) + semid, ds, err := semStatAny(t, int32(id)) if err != nil { return 0, nil, err } @@ -233,41 +233,41 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return uintptr(semid), nil, err default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } -func remove(t *kernel.Task, id int32) error { +func remove(t *kernel.Task, id ipc.ID) error { r := t.IPCNamespace().SemaphoreRegistry() creds := auth.CredentialsFromContext(t) - return r.RemoveID(id, creds) + return r.Remove(id, creds) } -func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { +func ipcSet(t *kernel.Task, id ipc.ID, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } owner := fs.FileOwner{UID: kuid, GID: kgid} return set.Change(t, creds, owner, perms) } -func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) { +func ipcStat(t *kernel.Task, id ipc.ID) (*linux.SemidDS, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.GetStat(creds) @@ -277,45 +277,45 @@ func semStat(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByIndex(index) if set == nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) ds, err := set.GetStat(creds) if err != nil { return 0, ds, err } - return set.ID, ds, nil + return 
int32(set.ID()), ds, nil } func semStatAny(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { set := t.IPCNamespace().SemaphoreRegistry().FindByIndex(index) if set == nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) ds, err := set.GetStatAny(creds) if err != nil { return 0, ds, err } - return set.ID, ds, nil + return int32(set.ID()), ds, nil } -func setVal(t *kernel.Task, id int32, num int32, val int16) error { +func setVal(t *kernel.Task, id ipc.ID, num int32, val int16) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) return set.SetVal(t, num, val, creds, int32(pid)) } -func setValAll(t *kernel.Task, id int32, array hostarch.Addr) error { +func setValAll(t *kernel.Task, id ipc.ID, array hostarch.Addr) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } vals := make([]uint16, set.Size()) if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil { @@ -326,21 +326,21 @@ func setValAll(t *kernel.Task, id int32, array hostarch.Addr) error { return set.SetValAll(t, vals, creds, int32(pid)) } -func getVal(t *kernel.Task, id int32, num int32) (int16, error) { +func getVal(t *kernel.Task, id ipc.ID, num int32) (int16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.GetVal(num, creds) } -func getValAll(t *kernel.Task, id int32, array hostarch.Addr) error { +func getValAll(t *kernel.Task, id ipc.ID, array hostarch.Addr) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return 
linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) vals, err := set.GetValAll(creds) @@ -351,11 +351,11 @@ func getValAll(t *kernel.Task, id int32, array hostarch.Addr) error { return err } -func getPID(t *kernel.Task, id int32, num int32) (int32, error) { +func getPID(t *kernel.Task, id ipc.ID, num int32) (int32, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) gpid, err := set.GetPID(num, creds) @@ -370,21 +370,21 @@ func getPID(t *kernel.Task, id int32, num int32) (int32, error) { return int32(tg.ID()), nil } -func getZCnt(t *kernel.Task, id int32, num int32) (uint16, error) { +func getZCnt(t *kernel.Task, id ipc.ID, num int32) (uint16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.CountZeroWaiters(num, creds) } -func getNCnt(t *kernel.Task, id int32, num int32) (uint16, error) { +func getNCnt(t *kernel.Task, id ipc.ID, num int32) (uint16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.CountNegativeWaiters(num, creds) diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 584064143..840540506 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -16,15 +16,16 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" - "gvisor.dev/gvisor/pkg/syserror" ) // Shmget implements shmget(2). 
func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - key := shm.Key(args[0].Int()) + key := ipc.Key(args[0].Int()) size := uint64(args[1].SizeT()) flag := args[2].Int() @@ -40,31 +41,31 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } defer segment.DecRef(t) - return uintptr(segment.ID), nil, nil + return uintptr(segment.ID()), nil, nil } // findSegment retrives a shm segment by the given id. // // findSegment returns a reference on Shm. -func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { +func findSegment(t *kernel.Task, id ipc.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) if segment == nil { // No segment with provided id. - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } return segment, nil } // Shmat implements shmat(2). func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := shm.ID(args[0].Int()) + id := ipc.ID(args[0].Int()) addr := args[1].Pointer() flag := args[2].Int() segment, err := findSegment(t, id) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) @@ -89,7 +90,7 @@ func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shmctl implements shmctl(2). 
func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := shm.ID(args[0].Int()) + id := ipc.ID(args[0].Int()) cmd := args[1].Int() buf := args[2].Pointer() @@ -106,7 +107,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.IPC_STAT: segment, err := findSegment(t, id) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) @@ -130,7 +131,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Remaining commands refer to a specific segment. segment, err := findSegment(t, id) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) @@ -155,6 +156,6 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index db763c68e..45608f3fa 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -80,10 +80,10 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC for { target := t.PIDNamespace().TaskWithID(pid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } info := &linux.SignalInfo{ Signo: int32(sig), @@ -146,7 +146,7 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if delivered > 0 { return 0, nil, lastErr } - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH default: // "If pid equals 0, then sig is sent to every process in the process // group of the calling process." 
@@ -160,11 +160,11 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // If pid != -1 (i.e. signalling a process group), the returned error // is the last error from any call to group_send_sig_info. - lastErr := syserror.ESRCH + lastErr := error(linuxerr.ESRCH) for _, tg := range t.PIDNamespace().ThreadGroups() { if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { if !mayKill(t, tg.Leader(), sig) { - lastErr = syserror.EPERM + lastErr = linuxerr.EPERM continue } @@ -203,16 +203,16 @@ func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // N.B. Inconsistent with man page, linux actually rejects calls with // tid <=0 by EINVAL. This isn't the same for all signal calls. if tid <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } target := t.PIDNamespace().TaskWithID(tid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } @@ -226,17 +226,17 @@ func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // N.B. Inconsistent with man page, linux actually rejects calls with // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. 
if tgid <= 0 || tid <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } @@ -249,7 +249,7 @@ func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var newactptr *linux.SigAction @@ -292,7 +292,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } oldmask := t.SignalMask() if setaddr != 0 { @@ -309,7 +309,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel case linux.SIG_SETMASK: t.SetSignalMask(mask) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } if oldaddr != 0 { @@ -339,7 +339,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // these semantics apply to changing the signal stack via a // ucontext during a signal handler. if !t.SetSignalStack(alt) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } } @@ -378,7 +378,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne return 0, nil, err } if !d.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } timeout = time.Duration(d.ToNsecCapped()) } else { @@ -421,17 +421,17 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // Deliver to the given task's thread group. 
target := t.PIDNamespace().TaskWithID(pid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if err := target.SendGroupSignal(&info); !linuxerr.Equals(linuxerr.ESRCH, err) { @@ -450,7 +450,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // N.B. Inconsistent with man page, linux actually rejects calls with // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. if tgid <= 0 || tid <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy in the info. See RtSigqueueinfo above. @@ -464,17 +464,17 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(&info) } @@ -525,7 +525,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u // Always check for valid flags, even if not creating. if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is this a change to an existing signalfd? 
@@ -534,7 +534,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u if fd != -1 { file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -545,7 +545,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u } // Not a signalfd. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create a new file. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 3bd21a911..06eb8f319 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -118,7 +118,7 @@ type multipleMessageHeader64 struct { // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } addrBuf := make([]byte, addrlen) @@ -140,7 +140,7 @@ func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr h } if int32(bufLen) < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // Write the length unconditionally. @@ -174,7 +174,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create the new socket. @@ -206,7 +206,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } fileFlags := fs.SettableFileFlags{ @@ -253,7 +253,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Get socket from the file descriptor. 
file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -278,13 +278,13 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) @@ -343,7 +343,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -370,7 +370,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -408,7 +408,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -422,7 +422,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -439,7 +439,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. 
file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -455,7 +455,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Call syscall implementation then copy both value and value len out. @@ -520,7 +520,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -531,10 +531,10 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if optLen > maxOptLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyInBytes(optValAddr, buf); err != nil { @@ -558,7 +558,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -586,7 +586,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -613,13 +613,13 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. 
file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -631,7 +631,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if file.Flags().NonBlocking { @@ -661,7 +661,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -670,13 +670,13 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -698,7 +698,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -718,7 +718,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -728,7 +728,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. 
lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -750,7 +750,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -781,7 +781,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags } if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { @@ -830,18 +830,18 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) @@ -908,13 +908,13 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. 
file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -926,7 +926,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if file.Flags().NonBlocking { @@ -946,7 +946,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -956,7 +956,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -968,7 +968,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if file.Flags().NonBlocking { @@ -980,7 +980,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -990,7 +990,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. 
lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -1015,7 +1015,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { @@ -1035,7 +1035,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1074,13 +1074,13 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 134051124..34d87ac1f 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -27,7 +28,7 @@ import ( // doSplice implements a blocking splice operation. 
func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if opts.Length == 0 { return 0, nil @@ -105,33 +106,33 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get files. inFile := t.GetFile(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) if !inFile.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } outFile := t.GetFile(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) if !outFile.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Verify that the outfile Append flag is not set. if outFile.Flags().Append { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Verify that we have a regular infile. This is a requirement; the // same check appears in Linux (fs/splice.c:splice_direct_to_actor). if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var ( @@ -142,7 +143,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Verify that when offset address is not null, infile must be // seekable. The fs.Splice routine itself validates basic read. if !inFile.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Copy in the offset. @@ -190,19 +191,19 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get files. 
outFile := t.GetFile(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) @@ -226,11 +227,11 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch { case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr): if inOffset != 0 { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if outOffset != 0 { if !outFile.Flags().Pwrite { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var offset int64 @@ -244,11 +245,11 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr): if outOffset != 0 { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if inOffset != 0 { if !inFile.Flags().Pread { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var offset int64 @@ -262,15 +263,15 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr): if inOffset != 0 || outOffset != 0 { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // We may not refer to the same pipe; otherwise it's a continuous loop. if inFileAttr.InodeID == outFileAttr.InodeID { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Splice data. @@ -298,30 +299,30 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get files. 
outFile := t.GetFile(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) // All files must be pipes. if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // We may not refer to the same pipe; see above. if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // The operation is non-blocking if anything is non-blocking. diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 2338ba44b..3da385c66 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -56,7 +56,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Annoying. What's wrong with fstat? file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -98,7 +98,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -108,7 +108,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // stat implements stat from the given *fs.Dirent. 
func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr hostarch.Addr) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(t) if err != nil { @@ -139,13 +139,13 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall statxAddr := args[4].Pointer() if mask&linux.STATX__RESERVED != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, dirPath, err := copyInPath(t, pathAddr, flags&linux.AT_EMPTY_PATH != 0) @@ -156,7 +156,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if path == "" { file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) uattr, err := file.UnstableAttr(t) @@ -170,7 +170,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(t) if err != nil { @@ -247,7 +247,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go index 0a04a6113..e38066ea8 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific 
language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go index 5a3b1bfad..b2ea390c5 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 5ebd4461f..6278bef21 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -37,7 +38,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -52,7 +53,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -68,7 +69,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -86,13 +87,13 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel uflags := args[3].Uint() if offset < 0 || offset+nbytes < offset { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if uflags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE| linux.SYNC_FILE_RANGE_WRITE| linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { - return 
0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if nbytes == 0 { @@ -101,7 +102,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index 40c8bb061..ba372f9e3 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -15,6 +15,7 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -40,7 +41,7 @@ func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch command { case _SYSLOG_ACTION_READ_ALL: if size < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if size > logBufLen { size = logBufLen diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 0d5056303..981cdd985 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -17,8 +17,8 @@ package linux import ( "path" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -31,11 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -const ( - // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. - exitSignalMask = 0xff -) - var ( // ExecMaxTotalSize is the maximum length of all argv and envv entries. 
// @@ -112,7 +107,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host } if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 if !atEmptyPath && len(pathname) == 0 { @@ -135,7 +130,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host // Need to extract the given FD. f, fdFlags := t.FDTable().Get(dirFD) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec @@ -154,7 +149,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host wd = f.Dirent wd.IncRef() if !fs.IsDir(wd.Inode.StableAttr) { - return 0, nil, syserror.ENOTDIR + return 0, nil, linuxerr.ENOTDIR } } } @@ -187,47 +182,30 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host // Exit implements linux syscall exit(2). func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - status := int(args[0].Int()) - t.PrepareExit(kernel.ExitStatus{Code: status}) + status := args[0].Int() + t.PrepareExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // ExitGroup implements linux syscall exit_group(2). func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - status := int(args[0].Int()) - t.PrepareGroupExit(kernel.ExitStatus{Code: status}) + status := args[0].Int() + t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // clone is used by Clone, Fork, and VFork. 
func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) { - opts := kernel.CloneOptions{ - SharingOptions: kernel.SharingOptions{ - NewAddressSpace: flags&linux.CLONE_VM == 0, - NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, - NewThreadGroup: flags&linux.CLONE_THREAD == 0, - TerminationSignal: linux.Signal(flags & exitSignalMask), - NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, - NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, - NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, - NewFiles: flags&linux.CLONE_FILES == 0, - NewFSContext: flags&linux.CLONE_FS == 0, - NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, - NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, - }, - Stack: stack, - SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, - TLS: tls, - ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, - ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, - ChildTID: childTID, - ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, - ParentTID: parentTID, - Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, - Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, - InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, - } - ntid, ctrl, err := t.Clone(&opts) + args := linux.CloneArgs{ + Flags: uint64(uint32(flags) &^ linux.CSIGNAL), + Pidfd: uint64(parentTID), + ChildTID: uint64(childTID), + ParentTID: uint64(parentTID), + ExitSignal: uint64(flags & linux.CSIGNAL), + Stack: uint64(stack), + TLS: uint64(tls), + } + ntid, ctrl, err := t.Clone(&args) return uintptr(ntid), ctrl, err } @@ -260,7 +238,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.NonCloneTasks = true wopts.CloneTasks = true default: - return syserror.EINVAL + return 
linuxerr.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue @@ -277,7 +255,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, @@ -315,7 +293,7 @@ func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusag return 0, err } if statusAddr != 0 { - if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil { + if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil { return 0, err } } @@ -358,10 +336,10 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, @@ -374,7 +352,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { @@ -418,23 +396,22 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } si.SetPID(int32(wr.TID)) si.SetUID(int32(wr.UID)) - // TODO(b/73541790): convert kernel.ExitStatus to functions and make - 
// WaitResult.Status a linux.WaitStatus. - s := unix.WaitStatus(wr.Status) + s := wr.Status switch { case s.Exited(): si.Code = linux.CLD_EXITED si.SetStatus(int32(s.ExitStatus())) case s.Signaled(): - si.Code = linux.CLD_KILLED - si.SetStatus(int32(s.Signal())) - case s.CoreDump(): - si.Code = linux.CLD_DUMPED - si.SetStatus(int32(s.Signal())) + if s.CoreDumped() { + si.Code = linux.CLD_DUMPED + } else { + si.Code = linux.CLD_KILLED + } + si.SetStatus(int32(s.TerminationSignal())) case s.Stopped(): if wr.Event == kernel.EventTraceeStop { si.Code = linux.CLD_TRAPPED - si.SetStatus(int32(s.TrapCause())) + si.SetStatus(int32(s.PtraceEvent())) } else { si.Code = linux.CLD_STOPPED si.SetStatus(int32(s.StopSignal())) @@ -461,29 +438,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // Unshare implements linux syscall unshare(2). func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() - opts := kernel.SharingOptions{ - NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, - NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, - NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, - NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, - NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, - NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, - NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, - NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, - NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, - NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, - } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) - if opts.NewPIDNamespace { - opts.NewThreadGroup = true + if flags&linux.CLONE_NEWPID != 0 { + flags |= linux.CLONE_THREAD } // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. 
Since // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." - if opts.NewUserNamespace { - opts.NewThreadGroup = true - opts.NewFSContext = true + if flags&linux.CLONE_NEWUSER != 0 { + flags |= linux.CLONE_THREAD | linux.CLONE_FS } - return 0, nil, t.Unshare(&opts) + return 0, nil, t.Unshare(flags) } // SchedYield implements linux syscall sched_yield(2). @@ -504,7 +468,7 @@ func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -528,7 +492,7 @@ func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // in an array of "unsigned long" so the buffer needs to // be a multiple of the word size. if size&(t.Arch().Width()-1) > 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var task *kernel.Task @@ -537,7 +501,7 @@ func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -545,7 +509,7 @@ func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // The buffer needs to be big enough to hold a cpumask with // all possible cpus. if size < mask.Size() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } _, err := t.CopyOutBytes(maskAddr, mask) @@ -590,16 +554,16 @@ func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if pid != 0 { ot := t.PIDNamespace().TaskWithID(pid) if ot == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } tg = ot.ThreadGroup() if tg.Leader() != ot { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Setpgid only operates on child threadgroups. 
if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -609,7 +573,7 @@ func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if pgid == 0 { pgid = defaultPGID } else if pgid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // If the pgid is the same as the group, then create a new one. Otherwise, @@ -654,7 +618,7 @@ func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca target := t.PIDNamespace().TaskWithID(tid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil @@ -674,7 +638,7 @@ func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal target := t.PIDNamespace().TaskWithID(tid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil @@ -698,7 +662,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } // From kernel/sys.c:getpriority: @@ -712,7 +676,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -744,7 +708,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } task.SetNiceness(niceval) @@ -754,7 +718,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // PRIO_USER and PRIO_PGRP have no further implementation yet. 
return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 2ec74b33a..674e74f82 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -76,7 +76,7 @@ func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if _, err := getClock(t, clockID); err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if addr == 0 { @@ -95,12 +95,12 @@ type cpuClocker interface { func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { if clockID < 0 { if !isValidCPUClock(clockID) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } targetTask := targetTask(t, clockID) if targetTask == nil { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } var target cpuClocker @@ -117,7 +117,7 @@ func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF. return target.CPUClock(), nil default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } @@ -139,7 +139,7 @@ func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { case linux.CLOCK_THREAD_CPUTIME_ID: return t.CPUClock(), nil default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } @@ -158,7 +158,7 @@ func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // ClockSettime implements linux syscall clock_settime(2). func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } // Time implements linux syscall time(2). 
@@ -254,7 +254,7 @@ func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if !ts.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Just like linux, we cap the timeout with the max number that int64 can @@ -277,7 +277,7 @@ func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } if !req.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Only allow clock constants also allowed by Linux. @@ -285,7 +285,7 @@ func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if clockID != linux.CLOCK_REALTIME && clockID != linux.CLOCK_MONOTONIC && clockID != linux.CLOCK_PROCESS_CPUTIME_ID { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cadd9d348..4eeb94231 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -16,12 +16,12 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // TimerfdCreate implements Linux syscall timerfd_create(2). 
@@ -30,7 +30,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel flags := args[1].Int() if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var c ktime.Clock @@ -40,7 +40,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: c = t.Kernel().MonotonicClock() default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } f := timerfd.NewFile(t, c) defer f.DecRef(t) @@ -66,18 +66,18 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne oldValAddr := args[3].Pointer() if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var newVal linux.Itimerspec @@ -105,13 +105,13 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } tm, s := tf.GetTime() diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index 6ddd30d5c..8c6cd7511 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-//+build amd64 +//go:build amd64 +// +build amd64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -42,13 +44,13 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go index fb08a356e..ff4ac4d6d 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build arm64 +//go:build arm64 +// +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index 66c5974f5..4e945d2c0 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Uname implements linux syscall uname. 
@@ -57,10 +57,10 @@ func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel utsns := t.UTSNamespace() if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if size < 0 || size > linux.UTSLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } name, err := t.CopyInString(nameAddr, int(size)) @@ -79,10 +79,10 @@ func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S utsns := t.UTSNamespace() if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if size < 0 || size > linux.UTSLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } name := make([]byte, size) diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index cff355550..872168606 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -47,19 +47,19 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -84,29 +84,29 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is writing at an offset supported? 
if !file.Flags().Pwrite { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -130,13 +130,13 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the source of the write. @@ -161,23 +161,23 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is writing at an offset supported? if !file.Flags().Pwrite { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the source of the write. @@ -209,34 +209,34 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := int(args[5].Int()) if int(args[4].Int())&0x4 == 1 { - return 0, nil, syserror.EACCES + return 0, nil, linuxerr.EACCES } file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is writing at an offset supported? 
if offset > -1 && !file.Flags().Pwrite { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is // accepted as a valid flag argument for pwritev2. if flags&^linux.RWF_VALID != 0 { - return uintptr(flags), nil, syserror.EOPNOTSUPP + return uintptr(flags), nil, linuxerr.EOPNOTSUPP } // Check that the file is writeable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the source of the write. diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 37fb67f80..baaf31191 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -48,7 +47,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -74,7 +73,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink n := 0 err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } n, err = getXattr(t, d, nameAddr, valueAddr, size) @@ -100,7 +99,7 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s // TODO(b/148380782): Support xattrs in namespaces other than "user". 
if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } // If getxattr(2) is called with size 0, the size of the value will be @@ -117,7 +116,7 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s } n := len(value) if uint64(n) > requestedSize { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } // Don't copy out the attribute value if size is 0. @@ -152,7 +151,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -173,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags) @@ -183,7 +182,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink // setXattr implements setxattr(2) from the given *fs.Dirent. 
func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, size uint64, flags uint32) error { if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } name, err := copyInXattrName(t, nameAddr) @@ -196,7 +195,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s } if size > linux.XATTR_SIZE_MAX { - return syserror.E2BIG + return linuxerr.E2BIG } buf := make([]byte, size) if _, err := t.CopyInBytes(valueAddr, buf); err != nil { @@ -205,7 +204,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s value := string(buf) if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil { @@ -219,12 +218,12 @@ func copyInXattrName(t *kernel.Task, nameAddr hostarch.Addr) (string, error) { name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) if err != nil { if linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return "", err } if len(name) == 0 { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return name, nil } @@ -242,9 +241,9 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error // Restrict xattrs to regular files and directories. if !xattrFileTypeOk(i) { if perms.Write { - return syserror.EPERM + return linuxerr.EPERM } - return syserror.ENODATA + return linuxerr.ENODATA } return i.CheckPermission(t, perms) @@ -269,7 +268,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. 
f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -294,7 +293,7 @@ func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlin n := 0 err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } n, err = listXattr(t, d, listAddr, size) @@ -334,10 +333,10 @@ func listXattr(t *kernel.Task, d *fs.Dirent, addr hostarch.Addr, size uint64) (i listSize := xattrListSize(xattrs) if listSize > linux.XATTR_SIZE_MAX { - return 0, syserror.E2BIG + return 0, linuxerr.E2BIG } if uint64(listSize) > requestedSize { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } // Don't copy out the attributes if size is 0. @@ -383,7 +382,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. 
f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -401,7 +400,7 @@ func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSyml return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } return removeXattr(t, d, nameAddr) @@ -420,7 +419,7 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr hostarch.Addr) error { } if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } if err := d.Inode.RemoveXattr(t, d, name); err != nil { diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 3edc922eb..b327e27d6 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -18,6 +18,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -103,7 +104,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time. 
return 0, err } if !timespec.Valid() { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index fd1863ef3..a8fa86cdc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -17,6 +17,8 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" @@ -26,8 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) // IoSubmit implements linux syscall io_submit(2). @@ -37,7 +37,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc addr := args[2].Pointer() if nrEvents < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { @@ -90,12 +90,12 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error { if cb.Reserved2 != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } fd := t.GetFileVFS2(cb.FD) if fd == nil { - return syserror.EBADF + return linuxerr.EBADF } defer fd.DecRef(t) @@ -104,13 +104,13 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFD = t.GetFileVFS2(cb.ResFD) if eventFD == nil { - return syserror.EBADF + return linuxerr.EBADF } defer eventFD.DecRef(t) // Check that it is an eventfd. 
if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -123,14 +123,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } } // Prepare the request. aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } if err := aioCtx.Prepare(); err != nil { return err @@ -200,7 +200,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, @@ -222,6 +222,6 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) default: // Not a supported command. 
- return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 7aff01343..84010db77 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -35,7 +34,7 @@ var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^linux.EPOLL_CLOEXEC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) @@ -60,7 +59,7 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // "Since Linux 2.6.8, the size argument is ignored, but must be greater // than zero" - epoll_create(2) if size <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) @@ -85,20 +84,20 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc epfile := t.GetFileVFS2(epfd) if epfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if epfile == file { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var event linux.EpollEvent @@ -116,24 +115,24 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } return 0, nil, ep.ModifyInterest(file, fd, event) 
default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } epfile := t.GetFileVFS2(epfd) if epfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Allocate space for a few events on the stack for the common case in diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index 807f909da..0dcf1fbff 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Eventfd2 implements linux syscall eventfd2(2). 
@@ -29,7 +29,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index 3315398a4..38818c175 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -16,7 +16,9 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -24,8 +26,6 @@ import ( slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Execve implements linux syscall execve(2). 
@@ -48,7 +48,7 @@ func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) @@ -87,7 +87,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr host } dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) if dirfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } start := dirfile.VirtualDentry() start.IncRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 1a31898e8..2cfb12cad 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" @@ -36,7 +37,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // (and other reference-holding operations complete). 
_, file := t.FDTable().Remove(t, fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -50,13 +51,13 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { - return 0, nil, syserror.EMFILE + return 0, nil, linuxerr.EMFILE } return uintptr(newFD), nil, nil } @@ -70,7 +71,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // As long as oldfd is valid, dup2() does nothing and returns newfd. file := t.GetFileVFS2(oldfd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } file.DecRef(t) return uintptr(newfd), nil, nil @@ -86,7 +87,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC flags := args[2].Uint() if oldfd == newfd { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return dup3(t, oldfd, newfd, flags) @@ -94,12 +95,12 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { if flags&^linux.O_CLOEXEC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(oldfd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -119,7 +120,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file, flags := t.FDTable().GetVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -128,7 +129,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: // 
allowed default: - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } } @@ -169,7 +170,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who @@ -192,7 +193,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } n, err := pipefile.SetPipeSize(int64(args[2].Int())) if err != nil { @@ -202,7 +203,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } return uintptr(pipefile.PipeSize()), nil, nil case linux.F_GET_SEALS: @@ -210,7 +211,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.IsWritable() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err @@ -232,7 +233,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -269,7 +270,7 @@ func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: // Acceptable type. 
default: - return syserror.EINVAL + return linuxerr.EINVAL } a := file.SetAsyncHandler(fasync.NewVFS2(fd)).(*fasync.FileAsync) @@ -282,26 +283,26 @@ func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if task == nil { - return syserror.ESRCH + return linuxerr.ESRCH } a.SetOwnerTask(t, task) return nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) if tg == nil { - return syserror.ESRCH + return linuxerr.ESRCH } a.SetOwnerThreadGroup(t, tg) return nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) if pg == nil { - return syserror.ESRCH + return linuxerr.ESRCH } a.SetOwnerProcessGroup(t, pg) return nil default: - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -319,7 +320,7 @@ func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDes case linux.F_WRLCK: typ = lock.WriteLock default: - return syserror.EINVAL + return linuxerr.EINVAL } r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) if err != nil { @@ -368,13 +369,13 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip switch flock.Type { case linux.F_RDLCK: if !file.IsReadable() { - return syserror.EBADF + return linuxerr.EBADF } return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.ReadLock, r, blocker) case linux.F_WRLCK: if !file.IsWritable() { - return syserror.EBADF + return linuxerr.EBADF } return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.WriteLock, r, blocker) @@ -382,7 +383,7 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip return file.UnlockPOSIX(t, t.FDTable(), r) default: - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -395,22 +396,22 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Note: offset is 
allowed to be negative. if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } switch advice { @@ -421,7 +422,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.POSIX_FADV_DONTNEED: case linux.POSIX_FADV_NOREUSE: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Sure, whatever. diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 36aa1d3ae..534355237 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -16,12 +16,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Link implements Linux syscall link(2). 
@@ -43,7 +43,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { return syserror.ENOENT @@ -290,7 +290,7 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := args[2].Int() if flags&^linux.AT_REMOVEDIR != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&linux.AT_REMOVEDIR != 0 { diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go index a7d4d2a36..1e36d9c76 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -16,11 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Getcwd implements Linux syscall getcwd(2). @@ -39,7 +39,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Note this is >= because we need a terminator. if uint(len(s)) >= size { - return 0, nil, syserror.ERANGE + return 0, nil, linuxerr.ERANGE } // Construct a byte slice containing a NUL terminator. 
@@ -106,7 +106,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } path, err := copyInPath(t, addr) diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index b41a3056a..c2c3172bc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -17,13 +17,12 @@ package vfs2 import ( "fmt" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Getdents implements Linux syscall getdents(2). @@ -43,7 +42,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -100,7 +99,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of 8 if size > cb.remaining { - return syserror.EINVAL + return linuxerr.EINVAL } buf = cb.t.CopyScratchBuffer(size) hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) @@ -134,7 +133,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of sizeof(long) if size > cb.remaining { - return syserror.EINVAL + return linuxerr.EINVAL } buf = cb.t.CopyScratchBuffer(size) hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 11753d8e5..d8d5dd7ad 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ 
b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC @@ -28,7 +28,7 @@ const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^allFlags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) @@ -60,14 +60,14 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, f := t.GetFileVFS2(fd) if f == nil { // Invalid fd. - return nil, nil, syserror.EBADF + return nil, nil, linuxerr.EBADF } ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. f.DecRef(t) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } return ino, f, nil @@ -82,7 +82,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if mask&linux.ALL_INOTIFY_BITS == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." 
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index c7c3fed57..b806120cd 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Ioctl implements Linux syscall ioctl(2). @@ -28,12 +28,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Handle ioctls that apply to all FDs. @@ -99,7 +99,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index d1452a04d..008603173 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Flock implements linux syscall flock(2). @@ -30,7 +30,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. 
- return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -57,7 +57,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } default: // flock(2): EINVAL operation is invalid. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go index c4c0f9e0a..70c2cf5a5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/memfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -35,7 +35,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if flags&^memfdAllFlags != 0 { // Unknown bits in flags. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index c961545f6..c804f9fd3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -16,13 +16,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Mmap implements Linux syscall mmap(2). @@ -38,7 +37,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Require exactly one of MAP_PRIVATE and MAP_SHARED. 
if private == shared { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } opts := memmap.MMapOpts{ @@ -71,13 +70,13 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Convert the passed FD to a file reference. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. if !file.IsReadable() { - return 0, nil, syserror.EACCES + return 0, nil, linuxerr.EACCES } // MAP_SHARED requires that the FD be writable for PROT_WRITE. if shared && !file.IsWritable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index dd93430e2..4d73d46ef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -16,12 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Mount implements Linux syscall mount(2). @@ -69,7 +68,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // namespace. creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | @@ -84,7 +83,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // unknown or unsupported flags are passed. Since we don't implement // everything, we fail explicitly on flags that are unimplemented. 
if flags&(unsupportedOps|unsupportedFlags) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var opts vfs.MountOptions @@ -125,12 +124,12 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Currently, this is always the init task's user namespace. creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE if flags&unsupported != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 2aaf1ed74..2bb783a85 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -16,12 +16,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) func copyInPath(t *kernel.Task, addr hostarch.Addr) (fspath.Path, error) { @@ -53,7 +53,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { root.DecRef(t) - return taskPathOperation{}, syserror.EBADF + return taskPathOperation{}, linuxerr.EBADF } start = dirfile.VirtualDentry() start.IncRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index c6fc1954c..07a89cf4e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -16,14 +16,13 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" 
"gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Pipe implements Linux syscall pipe(2). @@ -41,7 +40,7 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func pipe2(t *kernel.Task, addr hostarch.Addr, flags int32) error { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } r, w, err := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index b16773d65..042aa4c97 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -162,7 +162,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. func copyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } pfd := make([]linux.PollFD, nfds) @@ -222,7 +222,7 @@ func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialB func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Calculate the size of the fd sets (one bit per fd). @@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // OK. Linux is racy in the same way. 
file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } file.DecRef(t) @@ -485,7 +485,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } timeout = time.Duration(timeval.ToNsecCapped()) } @@ -562,7 +562,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time. return 0, err } if !timespec.Valid() { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } @@ -574,7 +574,7 @@ func setTempSignalSet(t *kernel.Task, maskAddr hostarch.Addr, maskSize uint) err return nil } if maskSize != linux.SignalSetSize { - return syserror.EINVAL + return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, maskAddr); err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index bbfa4c6d7..fe8aa06da 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -43,14 +43,14 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. 
@@ -74,7 +74,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -141,19 +141,19 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -178,13 +178,13 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -216,13 +216,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -294,14 +294,14 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. 
@@ -325,7 +325,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -391,19 +391,19 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -428,13 +428,13 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -466,13 +466,13 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. 
@@ -561,7 +561,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -577,27 +577,27 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.IsReadable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is valid. if int(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Return EINVAL; if the underlying file type does not support readahead, // then Linux will return EINVAL to indicate as much. In the future, we // may extend this function to actually support readahead hints. 
- return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 647e089d0..b5a3b92c5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -16,15 +16,15 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX @@ -65,7 +65,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -105,7 +105,7 @@ func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) @@ -126,7 +126,7 @@ func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vf if owner != -1 { kuid := userns.MapToKUID(auth.UID(owner)) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_UID opts.Stat.UID = uint32(kuid) @@ -134,7 +134,7 @@ func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vf if group != -1 { kgid := userns.MapToKGID(auth.GID(group)) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_GID opts.Stat.GID = uint32(kgid) @@ -150,7 +150,7 
@@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -167,7 +167,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc length := args[1].Int64() if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) @@ -191,17 +191,17 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys length := args[1].Int64() if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } err := file.SetStat(t, vfs.SetStatOptions{ @@ -222,23 +222,23 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if mode != 0 { - return 0, nil, syserror.ENOTSUP + return 0, nil, linuxerr.ENOTSUP } if offset < 0 || length <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } size := offset + length if size < 0 { - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } limit := limits.FromContext(t).Get(limits.FileSize).Cur if uint64(size) >= limit { @@ -246,7 +246,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) @@ -340,7 +340,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, 
timesAddr hostarch.Addr, op return err } if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime = linux.StatxTimestamp{ @@ -372,7 +372,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If filename is NULL and dfd refers to an open file, then operate on the @@ -405,7 +405,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, o } if times[0].Nsec != linux.UTIME_OMIT { if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_ATIME opts.Stat.Atime = linux.StatxTimestamp{ @@ -415,7 +415,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, o } if times[1].Nsec != linux.UTIME_OMIT { if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_MTIME opts.Stat.Mtime = linux.StatxTimestamp{ @@ -440,7 +440,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - return syserror.EBADF + return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.SetStat() instead of @@ -468,7 +468,7 @@ func handleSetSizeError(t *kernel.Task, err error) error { if err == syserror.ErrExceedsFileSizeLimit { // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). 
t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syserror.EFBIG + return linuxerr.EFBIG } return err } diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go index 6163da103..27fb2139b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/signal.go +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -16,13 +16,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd" "gvisor.dev/gvisor/pkg/sentry/kernel" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // sharedSignalfd is shared between the two calls. @@ -35,7 +34,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u // Always check for valid flags, even if not creating. if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is this a change to an existing signalfd? @@ -44,7 +43,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u if fd != -1 { file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -55,7 +54,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u } // Not a signalfd. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } fileFlags := uint32(linux.O_RDWR) diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 9a4b5e5fc..0c2e0720b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -118,7 +118,7 @@ type multipleMessageHeader64 struct { // from the untrusted address space range. 
func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } addrBuf := make([]byte, addrlen) @@ -140,7 +140,7 @@ func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr h } if int32(bufLen) < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // Write the length unconditionally. @@ -174,7 +174,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create the new socket. @@ -207,7 +207,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create the socket pair. @@ -257,7 +257,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -282,13 +282,13 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) @@ -347,7 +347,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Get socket from the file descriptor. 
file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -374,7 +374,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -412,7 +412,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -426,7 +426,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -443,7 +443,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -459,7 +459,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Call syscall implementation then copy both value and value len out. @@ -524,7 +524,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. 
file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -535,10 +535,10 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if optLen > maxOptLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyInBytes(optValAddr, buf); err != nil { @@ -562,7 +562,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -590,7 +590,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -617,13 +617,13 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -635,7 +635,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -665,7 +665,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. 
- return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -674,13 +674,13 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -702,7 +702,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -722,7 +722,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -732,7 +732,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. 
lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -754,7 +754,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -785,7 +785,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl } if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { @@ -834,18 +834,18 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) @@ -912,13 +912,13 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. 
file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -930,7 +930,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -950,7 +950,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -960,7 +960,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -972,7 +972,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -984,7 +984,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -994,7 +994,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. 
lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -1019,7 +1019,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { @@ -1039,7 +1039,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1078,13 +1078,13 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. 
file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 19e175203..d8009123f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -18,6 +18,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -46,29 +47,29 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal count = int64(kernel.MAX_RW_COUNT) } if count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFileVFS2(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // The operation is non-blocking if anything is non-blocking. @@ -82,38 +83,38 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe && !outIsPipe { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy in offsets. 
inOffset := int64(-1) if inOffsetPtr != 0 { if inIsPipe { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if inFile.Options().DenyPRead { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { return 0, nil, err } if inOffset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } outOffset := int64(-1) if outOffsetPtr != 0 { if outIsPipe { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if outFile.Options().DenyPWrite { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { return 0, nil, err } if outOffset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -189,29 +190,29 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo count = int64(kernel.MAX_RW_COUNT) } if count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFileVFS2(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // The operation is non-blocking if anything is non-blocking. 
@@ -225,7 +226,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe || !outIsPipe { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy data. @@ -270,25 +271,25 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc inFile := t.GetFileVFS2(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) if !inFile.IsReadable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } outFile := t.GetFileVFS2(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) if !outFile.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Verify that the outFile Append flag is not set. if outFile.StatusFlags()&linux.O_APPEND != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Verify that inFile is a regular file or block device. This is a @@ -298,14 +299,14 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } else if stat.Mask&linux.STATX_TYPE == 0 || (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy offset if it exists. 
offset := int64(-1) if offsetAddr != 0 { if inFile.Options().DenyPRead { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } var offsetP primitive.Int64 if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { @@ -314,16 +315,16 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc offset = int64(offsetP) if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if offset+count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } // Validate count. This must come after offset checks. if count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if count == 0 { return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index 69e77fa99..ba1d30823 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -17,15 +17,15 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Stat implements Linux syscall stat(2). 
@@ -53,7 +53,7 @@ func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } opts := vfs.StatOptions{ @@ -78,7 +78,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flag } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - return syserror.EBADF + return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of @@ -131,7 +131,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -156,15 +156,15 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall statxAddr := args[4].Pointer() if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Make sure that only one sync type option is set. syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE) if syncType != 0 && !bits.IsPowerOfTwo32(syncType) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if mask&linux.STATX__RESERVED != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } opts := vfs.StatOptions{ @@ -190,7 +190,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of @@ -272,7 +272,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) er // Sanity check the mode. 
if mode&^(rOK|wOK|xOK) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) @@ -315,7 +315,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { if int(size) <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go index 2da538fc6..122921b52 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go index 88b9c7627..d32031481 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build arm64 // +build arm64 package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 1f8a5878c..d0ffc7c32 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -32,12 +33,12 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } return 0, nil, file.SyncFS(t) @@ -49,7 +50,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -71,15 +72,15 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // Check for negative values and overflow. 
if offset < 0 || offset+nbytes < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 250870c03..b8f96a757 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -16,11 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // TimerfdCreate implements Linux syscall timerfd_create(2). 
@@ -29,7 +29,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel flags := args[1].Int() if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Timerfds aren't writable per se (their implementation of Write just @@ -47,7 +47,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: clock = t.Kernel().MonotonicClock() default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() file, err := timerfd.New(t, vfsObj, clock, fileFlags) @@ -72,18 +72,18 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne oldValAddr := args[3].Pointer() if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var newVal linux.Itimerspec @@ -111,13 +111,13 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } tm, s := tfd.GetTime() diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index c779c6465..7b2f69c45 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -20,12 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/hostarch" 
"gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // ListXattr implements Linux syscall listxattr(2). @@ -72,7 +70,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -141,7 +139,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -179,7 +177,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) @@ -217,12 +215,12 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -281,7 +279,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -297,12 +295,12 @@ func copyInXattrName(t *kernel.Task, nameAddr hostarch.Addr) (string, error) { name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) if err != nil { if linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return "", err } if len(name) == 0 { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return name, nil } @@ -322,16 +320,16 @@ func copyOutXattrNameList(t *kernel.Task, listAddr hostarch.Addr, size uint, nam } if buf.Len() > int(size) { if size >= linux.XATTR_LIST_MAX { - return 0, syserror.E2BIG + return 0, linuxerr.E2BIG } - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return t.CopyOutBytes(listAddr, buf.Bytes()) } func copyInXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint) (string, error) { if size > linux.XATTR_SIZE_MAX { - return "", syserror.E2BIG + return "", linuxerr.E2BIG } buf := make([]byte, size) if _, err := t.CopyInBytes(valueAddr, buf); err != nil { @@ -350,9 +348,9 @@ func copyOutXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint, value } if len(value) > int(size) { if size >= linux.XATTR_SIZE_MAX { - return 0, syserror.E2BIG + return 0, linuxerr.E2BIG } - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index f88055676..511fb8b28 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -28,6 +28,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -99,13 +100,13 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne Name: 
name, Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if !t.HasCapability(c) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS), + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, linuxerr.EPERM, c.String(), syserror.ENOSYS), URLs: urls, } } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 202486a1e..36d999c47 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -34,6 +34,7 @@ go_library( ], visibility = ["//:sandbox"], deps = [ + "//pkg/errors/linuxerr", "//pkg/gohacks", "//pkg/log", "//pkg/metric", diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index 39bf1e0de..eed74f6bd 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -19,10 +19,10 @@ package time import ( "time" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // CalibratedClock implements a clock that tracks a reference clock. @@ -259,6 +259,6 @@ func (c *CalibratedClocks) GetTime(id ClockID) (int64, error) { case Realtime: return c.realtime.GetTime() default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index d6622bfe2..0a4b1f1bf 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -50,6 +50,7 @@ func TestConstantFrequency(t *testing.T) { if !c.ready { c.mu.RUnlock() t.Fatalf("clock not ready") + return // For checklocks consistency. } // A bit after the last sample. 
now, ok := c.params.ComputeTime(750000) diff --git a/pkg/sentry/time/sampler_amd64.go b/pkg/sentry/time/sampler_amd64.go index 9f1b4b2fb..5fa1832b4 100644 --- a/pkg/sentry/time/sampler_amd64.go +++ b/pkg/sentry/time/sampler_amd64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build amd64 +//go:build amd64 +// +build amd64 package time diff --git a/pkg/sentry/time/sampler_arm64.go b/pkg/sentry/time/sampler_arm64.go index 4c8d33ae4..3560e66ae 100644 --- a/pkg/sentry/time/sampler_arm64.go +++ b/pkg/sentry/time/sampler_arm64.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build arm64 +//go:build arm64 +// +build arm64 package time diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index f48817132..255d3992e 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -19,11 +19,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) // NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name, @@ -101,7 +101,7 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) } @@ -109,10 +109,10 @@ func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds // GetDentryAt implements FilesystemImpl.GetDentryAt. 
func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { if !rp.Done() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if opts.CheckSearchable { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil @@ -121,7 +121,7 @@ func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, op // GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { if !rp.Final() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil @@ -130,63 +130,63 @@ func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPa // LinkAt implements FilesystemImpl.LinkAt. func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // MkdirAt implements FilesystemImpl.MkdirAt. func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // MknodAt implements FilesystemImpl.MknodAt. func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // OpenAt implements FilesystemImpl.OpenAt. 
func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { if !rp.Done() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } // ReadlinkAt implements FilesystemImpl.ReadlinkAt. func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { if !rp.Done() { - return "", syserror.ENOTDIR + return "", linuxerr.ENOTDIR } - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // RenameAt implements FilesystemImpl.RenameAt. func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // RmdirAt implements FilesystemImpl.RmdirAt. func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // SetStatAt implements FilesystemImpl.SetStatAt. func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Linux actually permits anon_inode_inode's metadata to be set, which is // visible to all users of anon_inode_inode. We just silently ignore @@ -197,7 +197,7 @@ func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts // StatAt implements FilesystemImpl.StatAt. func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { if !rp.Done() { - return linux.Statx{}, syserror.ENOTDIR + return linux.Statx{}, linuxerr.ENOTDIR } // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode(). 
return linux.Statx{ @@ -218,7 +218,7 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St // StatFSAt implements FilesystemImpl.StatFSAt. func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { if !rp.Done() { - return linux.Statfs{}, syserror.ENOTDIR + return linux.Statfs{}, linuxerr.ENOTDIR } return linux.Statfs{ Type: linux.ANON_INODE_FS_MAGIC, @@ -229,34 +229,34 @@ func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linu // SymlinkAt implements FilesystemImpl.SymlinkAt. func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // UnlinkAt implements FilesystemImpl.UnlinkAt. func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { if !rp.Final() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { return nil, err } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements FilesystemImpl.ListXattrAt. func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return nil, nil } @@ -264,25 +264,25 @@ func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, si // GetXattrAt implements FilesystemImpl.GetXattrAt. 
func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() { - return "", syserror.ENOTDIR + return "", linuxerr.ENOTDIR } - return "", syserror.ENOTSUP + return "", linuxerr.ENOTSUP } // SetXattrAt implements FilesystemImpl.SetXattrAt. func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // PrependPath implements FilesystemImpl.PrependPath. diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index e7ca24d96..cb92b6eee 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -18,8 +18,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Dentry represents a node in a Filesystem tree at which a file exists. @@ -196,11 +196,12 @@ func (d *Dentry) OnZeroWatches(ctx context.Context) { // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +checklocksacquire:d.mu func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY // +checklocksforce: inconsistent return. 
} d.mu.Lock() vfs.mountMu.Unlock() @@ -211,14 +212,14 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() @@ -249,16 +250,18 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { // Preconditions: // * If to is not nil, it must be a child Dentry from the same Filesystem. // * from != to. +// +checklocksacquire:from.mu +// +checklocksacquire:to.mu func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } if to != nil { if mntns.mountpoints[to] != 0 { vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } to.mu.Lock() } @@ -267,13 +270,13 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. - return nil + return nil // +checklocksforce: to may not be acquired. } // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. 
-// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -286,8 +289,8 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -303,8 +306,8 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, fro // from and to are exchanged by rename(RENAME_EXCHANGE). // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.mu.Unlock() to.mu.Unlock() diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index dde2ad79b..572d81afc 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -18,7 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // DeviceKind indicates whether a device is a block or character device. 
@@ -100,7 +100,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo defer vfs.devicesMu.RUnlock() rd, ok := vfs.devices[tup] if !ok { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } return rd.dev.Open(ctx, mnt, d, *opts) } @@ -120,7 +120,7 @@ func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { } minor++ } - return 0, syserror.EMFILE + return 0, linuxerr.EMFILE } // PutAnonBlockDevMinor deallocates a minor device number returned by a diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index ae004b371..befe3ca25 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -17,6 +17,7 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" @@ -174,7 +175,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin // that cyclic polling is not introduced after the check. defer epollCycleMu.Unlock() if subep.mightPoll(ep) { - return syserror.ELOOP + return linuxerr.ELOOP } } @@ -187,7 +188,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin num: num, } if _, ok := ep.interest[key]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } // Register interest in file. 
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 2bc33d424..ca3303dec 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -253,7 +252,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return err } if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { - return syserror.EPERM + return linuxerr.EPERM } } if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { @@ -267,14 +266,14 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return err } if stat.Mask&linux.STATX_UID == 0 { - return syserror.EPERM + return linuxerr.EPERM } if !CanActAsOwner(creds, auth.KUID(stat.UID)) { - return syserror.EPERM + return linuxerr.EPERM } } if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { - return syserror.EINVAL + return linuxerr.EINVAL } // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK @@ -568,7 +567,7 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // Allocate grows file represented by FileDescription to offset + length bytes. func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { if !fd.IsWritable() { - return syserror.EBADF + return linuxerr.EBADF } if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil { return err @@ -603,10 +602,10 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) { // partial reads with a nil error. 
func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { if fd.opts.DenyPRead { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } if !fd.readable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.PRead(ctx, dst, offset, opts) @@ -621,7 +620,7 @@ func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if !fd.readable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.Read(ctx, dst, opts) @@ -638,10 +637,10 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if fd.opts.DenyPWrite { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } if !fd.writable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } n, err := fd.impl.PWrite(ctx, src, offset, opts) if n > 0 { @@ -653,7 +652,7 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o // Write is similar to PWrite, but does not specify an offset. 
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { if !fd.writable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } n, err := fd.impl.Write(ctx, src, opts) if n > 0 { @@ -874,7 +873,7 @@ func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, l } off = int64(stat.Size) default: - return lock.LockRange{}, syserror.EINVAL + return lock.LockRange{}, linuxerr.EINVAL } return lock.ComputeRange(int64(start), int64(length), off) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 2b6f47b4b..a875fdeca 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -65,7 +66,7 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err // should technically return EISDIR. Allocate should never be called for a // directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.ENODEV + return linuxerr.ENODEV } // Readiness implements waiter.Waitable.Readiness analogously to @@ -88,81 +89,81 @@ func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { // PRead implements FileDescriptionImpl.PRead analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. 
func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Read implements FileDescriptionImpl.Read analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // PWrite implements FileDescriptionImpl.PWrite analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Write implements FileDescriptionImpl.Write analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // IterDirents implements FileDescriptionImpl.IterDirents analogously to // file_operations::iterate == file_operations::iterate_shared == NULL in // Linux. func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Seek implements FileDescriptionImpl.Seek analogously to // file_operations::llseek == NULL in Linux. func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Sync implements FileDescriptionImpl.Sync analogously to // file_operations::fsync == NULL in Linux. 
func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { - return syserror.EINVAL + return linuxerr.EINVAL } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to // file_operations::mmap == NULL in Linux. func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return syserror.ENODEV + return linuxerr.ENODEV } // Ioctl implements FileDescriptionImpl.Ioctl analogously to // file_operations::unlocked_ioctl == NULL in Linux. func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { // This isn't exactly accurate; see FileDescription.ListXattr. - return nil, syserror.ENOTSUP + return nil, linuxerr.ENOTSUP } // GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { - return "", syserror.ENOTSUP + return "", linuxerr.ENOTSUP } // SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. 
func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of @@ -333,10 +334,10 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 offset += fd.off default: // fs/seq_file:seq_lseek() rejects SEEK_END etc. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset != fd.lastRead { // Regenerate the file's contents immediately. Compare @@ -357,7 +358,7 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 // Preconditions: fd.mu must be locked. func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } limit, err := CheckLimit(ctx, offset, src.NumBytes()) if err != nil { @@ -467,27 +468,27 @@ func (NoLockFD) SupportsLocks() bool { // LockBSD implements FileDescriptionImpl.LockBSD. func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. 
func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { - return linux.Flock{}, syserror.ENOLCK + return linux.Flock{}, linuxerr.ENOLCK } // BadLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface @@ -503,25 +504,25 @@ func (BadLockFD) SupportsLocks() bool { // LockBSD implements FileDescriptionImpl.LockBSD. func (BadLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { - return syserror.EBADF + return linuxerr.EBADF } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (BadLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { - return syserror.EBADF + return linuxerr.EBADF } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (BadLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { - return syserror.EBADF + return linuxerr.EBADF } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (BadLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { - return syserror.EBADF + return linuxerr.EBADF } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. 
func (BadLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { - return linux.Flock{}, syserror.EBADF + return linux.Flock{}, linuxerr.EBADF } diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 566ad856a..3423dede1 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -103,7 +103,7 @@ func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, erro // SetStat implements FileDescriptionImpl.SetStat. func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } func TestGenCountFD(t *testing.T) { diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 49d29e20b..088beb8e2 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/uniqueid" @@ -98,7 +99,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. flags &^= linux.O_CLOEXEC if flags&^linux.O_NONBLOCK != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } id := uniqueid.GlobalFromContext(ctx) @@ -184,23 +185,23 @@ func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { // PRead implements FileDescriptionImpl.PRead. func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // PWrite implements FileDescriptionImpl.PWrite. 
func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements FileDescriptionImpl.Write. func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if dst.NumBytes() < inotifyEventBaseSize { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } i.evMu.Lock() @@ -226,7 +227,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt // write some events out. return writeLen, nil } - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Linux always dequeues an available event as long as there's enough @@ -262,7 +263,7 @@ func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallAr return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } @@ -332,7 +333,7 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { if ws == nil { // While Linux supports inotify watches on all filesystem types, watches on // filesystems like kernfs are not generally useful, so we do not. - return 0, syserror.EPERM + return 0, linuxerr.EPERM } // Does the target already have a watch from this inotify instance? if existing := ws.Lookup(i.id); existing != nil { @@ -360,7 +361,7 @@ func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { w, ok := i.watches[wd] if !ok { i.mu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } // Remove the watch from this instance. 
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD index ea82f4987..444ab42b9 100644 --- a/pkg/sentry/vfs/memxattr/BUILD +++ b/pkg/sentry/vfs/memxattr/BUILD @@ -8,9 +8,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/errors/linuxerr", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go index 9b7953fa3..f0f82a4d6 100644 --- a/pkg/sentry/vfs/memxattr/xattr.go +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -20,10 +20,10 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // SimpleExtendedAttributes implements extended attributes using a map of @@ -49,12 +49,12 @@ func (x *SimpleExtendedAttributes) GetXattr(creds *auth.Credentials, mode linux. value, ok := x.xattrs[opts.Name] x.mu.RUnlock() if !ok { - return "", syserror.ENODATA + return "", linuxerr.ENODATA } // Check that the size of the buffer provided in getxattr(2) is large enough // to contain the value. if opts.Size != 0 && uint64(len(value)) > opts.Size { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return value, nil } @@ -69,17 +69,17 @@ func (x *SimpleExtendedAttributes) SetXattr(creds *auth.Credentials, mode linux. 
defer x.mu.Unlock() if x.xattrs == nil { if opts.Flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } x.xattrs = make(map[string]string) } _, ok := x.xattrs[opts.Name] if ok && opts.Flags&linux.XATTR_CREATE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } if !ok && opts.Flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } x.xattrs[opts.Name] = opts.Value @@ -106,7 +106,7 @@ func (x *SimpleExtendedAttributes) ListXattr(creds *auth.Credentials, size uint6 } x.mu.RUnlock() if size != 0 && uint64(listSize) > size { - return nil, syserror.ERANGE + return nil, linuxerr.ERANGE } return names, nil } @@ -120,7 +120,7 @@ func (x *SimpleExtendedAttributes) RemoveXattr(creds *auth.Credentials, mode lin x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { - return syserror.ENODATA + return linuxerr.ENODATA } delete(x.xattrs, name) return nil diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index f93da3af1..4d6b59a26 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -159,7 +160,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { @@ -192,10 +193,10 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { rft := 
vfs.getFilesystemType(fsTypeName) if rft == nil { - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } if !opts.InternalMount && !rft.opts.AllowUserMount { - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { @@ -284,7 +285,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia // UmountAt removes the Mount at the given path. func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // MNT_FORCE is currently unimplemented except for the permission check. @@ -292,7 +293,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti // namespace, and not in the owner user namespace for the target mount. See // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) 
if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { - return syserror.EPERM + return linuxerr.EPERM } vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) @@ -301,19 +302,19 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } defer vd.DecRef(ctx) if vd.dentry != vd.mount.root { - return syserror.EINVAL + return linuxerr.EINVAL } vfs.mountMu.Lock() if mntns := MountNamespaceFromContext(ctx); mntns != nil { defer mntns.DecRef(ctx) if mntns != vd.mount.ns { vfs.mountMu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } if vd.mount == vd.mount.ns.root { vfs.mountMu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -326,7 +327,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if len(vd.mount.children) != 0 { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY } // We are holding a reference on vd.mount. 
expectedRefs := int64(1) @@ -336,7 +337,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY } } vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ @@ -710,7 +711,7 @@ func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { func (mnt *Mount) CheckBeginWrite() error { if atomic.AddInt64(&mnt.writers, 1) < 0 { atomic.AddInt64(&mnt.writers, -1) - return syserror.EROFS + return linuxerr.EROFS } return nil } @@ -728,7 +729,7 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error { } if ro { if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { - return syserror.EBUSY + return linuxerr.EBUSY } return nil } diff --git a/pkg/sentry/vfs/opath.go b/pkg/sentry/vfs/opath.go index e9651b631..da0b33b79 100644 --- a/pkg/sentry/vfs/opath.go +++ b/pkg/sentry/vfs/opath.go @@ -17,10 +17,10 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -40,77 +40,77 @@ func (fd *opathFD) Release(context.Context) { // Allocate implements FileDescriptionImpl.Allocate. func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EBADF + return linuxerr.EBADF } // PRead implements FileDescriptionImpl.PRead. func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. 
func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // PWrite implements FileDescriptionImpl.PWrite. func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Write implements FileDescriptionImpl.Write. func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Ioctl implements FileDescriptionImpl.Ioctl. func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // IterDirents implements FileDescriptionImpl.IterDirents. func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.EBADF + return linuxerr.EBADF } // Seek implements FileDescriptionImpl.Seek. func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap. func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return syserror.EBADF + return linuxerr.EBADF } // ListXattr implements FileDescriptionImpl.ListXattr. func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { - return nil, syserror.EBADF + return nil, linuxerr.EBADF } // GetXattr implements FileDescriptionImpl.GetXattr. func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { - return "", syserror.EBADF + return "", linuxerr.EBADF } // SetXattr implements FileDescriptionImpl.SetXattr. 
func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { - return syserror.EBADF + return linuxerr.EBADF } // RemoveXattr implements FileDescriptionImpl.RemoveXattr. func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { - return syserror.EBADF + return linuxerr.EBADF } // Sync implements FileDescriptionImpl.Sync. func (fd *opathFD) Sync(ctx context.Context) error { - return syserror.EBADF + return linuxerr.EBADF } // SetStat implements FileDescriptionImpl.SetStat. func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { - return syserror.EBADF + return linuxerr.EBADF } // Stat implements FileDescriptionImpl.Stat. diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index b7704874f..4744514bd 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/syserror" @@ -77,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu // the caller's user namespace; compare // kernel/capability.c:privileged_wrt_inode_uidgid(). if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { - return syserror.EACCES + return linuxerr.EACCES } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. @@ -94,7 +95,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu return nil } } - return syserror.EACCES + return linuxerr.EACCES } // MayLink determines whether creating a hard link to a file with the given @@ -110,12 +111,12 @@ func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid // Only regular files can be hard linked. 
if mode.FileType() != linux.S_IFREG { - return syserror.EPERM + return linuxerr.EPERM } // Setuid files should not get pinned to the filesystem. if mode&linux.S_ISUID != 0 { - return syserror.EPERM + return linuxerr.EPERM } // Executable setgid files should not get pinned to the filesystem, but we @@ -123,7 +124,7 @@ func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid // Hardlinking to unreadable or unwritable sources is dangerous. if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { - return syserror.EPERM + return linuxerr.EPERM } return nil } @@ -199,7 +200,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt } if stat.Mask&linux.STATX_MODE != 0 { if !CanActAsOwner(creds, kuid) { - return syserror.EPERM + return linuxerr.EPERM } // TODO(b/30815691): "If the calling process is not privileged (Linux: // does not have the CAP_FSETID capability), and the group of the file @@ -210,13 +211,13 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt if stat.Mask&linux.STATX_UID != 0 { if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { - return syserror.EPERM + return linuxerr.EPERM } } if stat.Mask&linux.STATX_GID != 0 { if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { - return syserror.EPERM + return linuxerr.EPERM } } if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { @@ -229,7 +230,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { - return syserror.EPERM + return linuxerr.EPERM } if err := 
GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err @@ -252,7 +253,7 @@ func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, paren HasCapabilityOnFile(creds, linux.CAP_FOWNER, childKUID, childKGID) { return nil } - return syserror.EPERM + return linuxerr.EPERM } // CanActAsOwner returns true if creds can act as the owner of a file with the @@ -306,9 +307,9 @@ func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux. return nil } if ats.MayWrite() { - return syserror.EPERM + return linuxerr.EPERM } - return syserror.ENODATA + return linuxerr.ENODATA case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): // In the user.* namespace, only regular files and directories can have // extended attributes. For sticky directories, only the owner and @@ -316,12 +317,12 @@ func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux. filetype := mode.FileType() if filetype != linux.ModeRegular && filetype != linux.ModeDirectory { if ats.MayWrite() { - return syserror.EPERM + return linuxerr.EPERM } - return syserror.ENODATA + return linuxerr.ENODATA } if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) { - return syserror.EPERM + return linuxerr.EPERM } } return nil diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 97b898aba..6f58f33ce 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -327,7 +328,7 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool { // Postconditions: If HandleSymlink returns a nil error, then !rp.Done(). 
func (rp *ResolvingPath) HandleSymlink(target string) error { if rp.symlinks >= linux.MaxSymlinkTraversals { - return syserror.ELOOP + return linuxerr.ELOOP } if len(target) == 0 { return syserror.ENOENT @@ -377,7 +378,7 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { // Preconditions: !rp.Done(). func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { if rp.symlinks >= linux.MaxSymlinkTraversals { - return syserror.ELOOP + return linuxerr.ELOOP } rp.symlinks++ // Consume the path component that represented the magic link. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index b96de247f..eb3c60610 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -279,14 +279,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential if !newpop.Path.Begin.Ok() { oldVD.DecRef(ctx) if newpop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } return syserror.ENOENT } if newpop.FollowFinalSymlink { oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) @@ -316,13 +316,13 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia // pop.Path should not be empty in operations that create/delete files. // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is // also honored." - mkdir(2) @@ -354,13 +354,13 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia // pop.Path should not be empty in operations that create/delete files. 
// This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -403,13 +403,13 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // filesystem implementations that do not support it). if opts.Flags&linux.O_TMPFILE != 0 { if opts.Flags&linux.O_DIRECTORY == 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } // O_PATH causes most other flags to be ignored. @@ -427,9 +427,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential if opts.Flags&linux.O_DIRECTORY != 0 { rp.mustBeDir = true } - // Ignore O_PATH for verity, as verity performs extra operations on the fd for verification. - // The underlying filesystem that verity wraps opens the fd with O_PATH. - if opts.Flags&linux.O_PATH != 0 && rp.mount.fs.FilesystemType().Name() != "verity" { + if opts.Flags&linux.O_PATH != 0 { vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) if err != nil { return nil, err @@ -449,7 +447,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential if opts.FileExec { if fd.Mount().Flags.NoExec { fd.DecRef(ctx) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } // Only a regular file can be executed. 
@@ -460,7 +458,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { fd.DecRef(ctx) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } } @@ -494,13 +492,13 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { if !oldpop.Path.Begin.Ok() { if oldpop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } return syserror.ENOENT } if oldpop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) @@ -509,20 +507,20 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti } if oldName == "." || oldName == ".." { oldParentVD.DecRef(ctx) - return syserror.EBUSY + return linuxerr.EBUSY } if !newpop.Path.Begin.Ok() { oldParentVD.DecRef(ctx) if newpop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } return syserror.ENOENT } if newpop.FollowFinalSymlink { oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) @@ -556,13 +554,13 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). 
if pop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -639,13 +637,13 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent // pop.Path should not be empty in operations that create/delete files. // This is consistent with symlinkat(oldpath, newdirfd, ""). if pop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -673,13 +671,13 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -834,7 +832,7 @@ func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string switch { case err == nil: if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Directory already exists. 
return nil diff --git a/pkg/shim/BUILD b/pkg/shim/BUILD index b115556f5..367765209 100644 --- a/pkg/shim/BUILD +++ b/pkg/shim/BUILD @@ -8,7 +8,6 @@ go_library( "api.go", "debug.go", "epoll.go", - "errors.go", "options.go", "service.go", "service_linux.go", @@ -45,8 +44,6 @@ go_library( "@com_github_gogo_protobuf//types:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_sirupsen_logrus//:go_default_library", - "@org_golang_google_grpc//codes:go_default_library", - "@org_golang_google_grpc//status:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -54,14 +51,10 @@ go_library( go_test( name = "shim_test", size = "small", - srcs = [ - "errors_test.go", - "service_test.go", - ], + srcs = ["service_test.go"], library = ":shim", deps = [ "//pkg/shim/utils", - "@com_github_containerd_containerd//errdefs:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) diff --git a/pkg/shim/epoll.go b/pkg/shim/epoll.go index 737d2b781..463e11a84 100644 --- a/pkg/shim/epoll.go +++ b/pkg/shim/epoll.go @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux // +build linux package shim diff --git a/pkg/shim/proc/BUILD b/pkg/shim/proc/BUILD index 544bdc170..c8527a6d9 100644 --- a/pkg/shim/proc/BUILD +++ b/pkg/shim/proc/BUILD @@ -20,7 +20,9 @@ go_library( "//shim:__subpackages__", ], deps = [ + "//pkg/cleanup", "//pkg/shim/runsc", + "//pkg/shim/utils", "@com_github_containerd_console//:go_default_library", "@com_github_containerd_containerd//errdefs:go_default_library", "@com_github_containerd_containerd//log:go_default_library", diff --git a/pkg/shim/proc/exec.go b/pkg/shim/proc/exec.go index 14df3a778..da2e21598 100644 --- a/pkg/shim/proc/exec.go +++ b/pkg/shim/proc/exec.go @@ -26,11 +26,13 @@ import ( "github.com/containerd/console" "github.com/containerd/containerd/errdefs" + "github.com/containerd/containerd/log" "github.com/containerd/containerd/pkg/stdio" "github.com/containerd/fifo" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/shim/runsc" ) @@ -92,6 +94,12 @@ func (e *execProcess) SetExited(status int) { } func (e *execProcess) setExited(status int) { + if !e.exited.IsZero() { + log.L.Debugf("Exec: status already set to %d, ignoring status: %d", e.status, status) + return + } + + log.L.Debugf("Exec: setting status: %d", status) e.status = status e.exited = time.Now() e.parent.Platform.ShutdownConsole(context.Background(), e.console) @@ -105,7 +113,7 @@ func (e *execProcess) Delete(ctx context.Context) error { return e.execState.Delete(ctx) } -func (e *execProcess) delete(ctx context.Context) error { +func (e *execProcess) delete() { e.wg.Wait() if e.io != nil { for _, c := range e.closers { @@ -113,13 +121,6 @@ func (e *execProcess) delete(ctx context.Context) error { } e.io.Close() } - pidfile := filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)) - // silently ignore error - os.Remove(pidfile) - internalPidfile := filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", 
e.id)) - // silently ignore error - os.Remove(internalPidfile) - return nil } func (e *execProcess) Resize(ws console.WinSize) error { @@ -171,42 +172,53 @@ func (e *execProcess) Start(ctx context.Context) error { return e.execState.Start(ctx) } -func (e *execProcess) start(ctx context.Context) (err error) { - var ( - socket *runc.Socket - pidfile = filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)) - internalPidfile = filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id)) - ) - if e.stdio.Terminal { - if socket, err = runc.NewTempConsoleSocket(); err != nil { +func (e *execProcess) start(ctx context.Context) error { + var socket *runc.Socket + + switch { + case e.stdio.Terminal: + s, err := runc.NewTempConsoleSocket() + if err != nil { return fmt.Errorf("failed to create runc console socket: %w", err) } - defer socket.Close() - } else if e.stdio.IsNull() { - if e.io, err = runc.NewNullIO(); err != nil { + defer s.Close() + socket = s + + case e.stdio.IsNull(): + io, err := runc.NewNullIO() + if err != nil { return fmt.Errorf("creating new NULL IO: %w", err) } - } else { - if e.io, err = runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)); err != nil { + e.io = io + + default: + io, err := runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)) + if err != nil { return fmt.Errorf("failed to create runc io pipes: %w", err) } + e.io = io } + opts := &runsc.ExecOpts{ - PidFile: pidfile, - InternalPidFile: internalPidfile, + PidFile: filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)), + InternalPidFile: filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id)), IO: e.io, Detach: true, } + defer func() { + _ = os.Remove(opts.PidFile) + _ = os.Remove(opts.InternalPidFile) + }() if socket != nil { opts.ConsoleSocket = socket } + eventCh := e.parent.Monitor.Subscribe() - defer func() { - // Unsubscribe if an error is returned. 
- if err != nil { - e.parent.Monitor.Unsubscribe(eventCh) - } - }() + cu := cleanup.Make(func() { + e.parent.Monitor.Unsubscribe(eventCh) + }) + defer cu.Clean() + if err := e.parent.runtime.Exec(ctx, e.parent.id, e.spec, opts); err != nil { close(e.waitBlock) return e.parent.runtimeError(err, "OCI runtime exec failed") @@ -234,6 +246,7 @@ func (e *execProcess) start(ctx context.Context) (err error) { return fmt.Errorf("failed to start io pipe copy: %w", err) } } + pid, err := runc.ReadPidFile(opts.PidFile) if err != nil { return fmt.Errorf("failed to retrieve OCI runtime exec pid: %w", err) @@ -244,6 +257,7 @@ func (e *execProcess) start(ctx context.Context) (err error) { return fmt.Errorf("failed to retrieve OCI runtime exec internal pid: %w", err) } e.internalPid = internalPid + go func() { defer e.parent.Monitor.Unsubscribe(eventCh) for event := range eventCh { @@ -257,21 +271,25 @@ func (e *execProcess) start(ctx context.Context) (err error) { } } }() + + cu.Release() // cancel cleanup on success. return nil } -func (e *execProcess) Status(ctx context.Context) (string, error) { +func (e *execProcess) Status(context.Context) (string, error) { e.mu.Lock() defer e.mu.Unlock() // if we don't have a pid then the exec process has just been created if e.pid == 0 { return "created", nil } - // if we have a pid and it can be signaled, the process is running - // TODO(random-liu): Use `runsc kill --pid`. - if err := unix.Kill(e.pid, 0); err == nil { - return "running", nil + // This checks that `runsc exec` process is still running. This process has + // the same lifetime as the process executing inside the container. So instead + // of calling `runsc kill --pid`, just do a quick check that `runsc exec` is + // still running. + if err := unix.Kill(e.pid, 0); err != nil { + // Can't signal the process, it must have exited. 
+ return "stopped", nil } - // else if we have a pid but it can nolonger be signaled, it has stopped - return "stopped", nil + return "running", nil } diff --git a/pkg/shim/proc/exec_state.go b/pkg/shim/proc/exec_state.go index 9c6edd3f5..03ecb401a 100644 --- a/pkg/shim/proc/exec_state.go +++ b/pkg/shim/proc/exec_state.go @@ -63,10 +63,8 @@ func (s *execCreatedState) Start(ctx context.Context) error { return nil } -func (s *execCreatedState) Delete(ctx context.Context) error { - if err := s.p.delete(ctx); err != nil { - return err - } +func (s *execCreatedState) Delete(context.Context) error { + s.p.delete() s.transition(deleted) return nil } @@ -143,10 +141,8 @@ func (s *execStoppedState) Start(context.Context) error { return fmt.Errorf("cannot start a stopped process") } -func (s *execStoppedState) Delete(ctx context.Context) error { - if err := s.p.delete(ctx); err != nil { - return err - } +func (s *execStoppedState) Delete(context.Context) error { + s.p.delete() s.transition(deleted) return nil } diff --git a/pkg/shim/proc/init_state.go b/pkg/shim/proc/init_state.go index d65020e76..5347ddefe 100644 --- a/pkg/shim/proc/init_state.go +++ b/pkg/shim/proc/init_state.go @@ -23,6 +23,7 @@ import ( "github.com/containerd/containerd/pkg/process" runc "github.com/containerd/go-runc" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/shim/utils" ) type stateTransition int @@ -235,6 +236,6 @@ func handleStoppedKill(signal uint32) error { // already been killed. 
return nil default: - return errdefs.ToGRPCf(errdefs.ErrNotFound, "process not found") + return utils.ErrToGRPCf(errdefs.ErrNotFound, "process not found") } } diff --git a/pkg/shim/runtimeoptions/runtimeoptions_cri.go b/pkg/shim/runtimeoptions/runtimeoptions_cri.go index e6102b4cf..23bbd82be 100644 --- a/pkg/shim/runtimeoptions/runtimeoptions_cri.go +++ b/pkg/shim/runtimeoptions/runtimeoptions_cri.go @@ -13,6 +13,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package runtimeoptions import ( diff --git a/pkg/shim/service.go b/pkg/shim/service.go index 0b41f0e72..24e3b7a82 100644 --- a/pkg/shim/service.go +++ b/pkg/shim/service.go @@ -452,10 +452,10 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*ta } process, err := newInit(r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &s.opts, st.Rootfs) if err != nil { - return nil, errToGRPC(err) + return nil, utils.ErrToGRPC(err) } if err := process.Create(ctx, config); err != nil { - return nil, errToGRPC(err) + return nil, utils.ErrToGRPC(err) } // Set up OOM notification on the sandbox's cgroup. 
This is done on @@ -530,10 +530,10 @@ func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*typ p := s.processes[r.ExecID] s.mu.Unlock() if p != nil { - return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) + return nil, utils.ErrToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) } if s.task == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } process, err := s.task.Exec(ctx, s.bundle, &proc.ExecConfig{ ID: r.ExecID, @@ -544,7 +544,7 @@ func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*typ Spec: r.Spec, }) if err != nil { - return nil, errToGRPC(err) + return nil, utils.ErrToGRPC(err) } s.mu.Lock() s.processes[r.ExecID] = process @@ -565,7 +565,7 @@ func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (* Height: uint16(r.Height), } if err := p.Resize(ws); err != nil { - return nil, errToGRPC(err) + return nil, utils.ErrToGRPC(err) } return empty, nil } @@ -615,7 +615,7 @@ func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Em log.L.Debugf("Pause, id: %s", r.ID) if s.task == nil { log.L.Debugf("Pause error, id: %s: container not created", r.ID) - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Pause(ctx, r.ID) if err != nil { @@ -629,7 +629,7 @@ func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types. 
log.L.Debugf("Resume, id: %s", r.ID) if s.task == nil { log.L.Debugf("Resume error, id: %s: container not created", r.ID) - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Resume(ctx, r.ID) if err != nil { @@ -648,7 +648,7 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empt } if err := p.Kill(ctx, r.Signal, r.All); err != nil { log.L.Debugf("Kill failed: %v", err) - return nil, errToGRPC(err) + return nil, utils.ErrToGRPC(err) } log.L.Debugf("Kill succeeded") return empty, nil @@ -660,7 +660,7 @@ func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.Pi pids, err := s.getContainerPids(ctx, r.ID) if err != nil { - return nil, errToGRPC(err) + return nil, utils.ErrToGRPC(err) } var processes []*task.ProcessInfo for _, pid := range pids { @@ -706,7 +706,7 @@ func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*type // Checkpoint checkpoints the container. func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) { log.L.Debugf("Checkpoint, id: %s", r.ID) - return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) + return empty, utils.ErrToGRPC(errdefs.ErrNotImplemented) } // Connect returns shim information such as the shim's pid. @@ -737,7 +737,7 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. log.L.Debugf("Stats, id: %s", r.ID) if s.task == nil { log.L.Debugf("Stats error, id: %s: container not created", r.ID) - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } stats, err := s.task.Stats(ctx, s.id) if err != nil { @@ -811,7 +811,7 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. 
// Update updates a running container. func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) { - return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) + return empty, utils.ErrToGRPC(errdefs.ErrNotImplemented) } // Wait waits for a process to exit. @@ -908,14 +908,14 @@ func (s *service) getProcess(execID string) (process.Process, error) { if execID == "" { if s.task == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } return s.task, nil } p := s.processes[execID] if p == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) + return nil, utils.ErrToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) } return p, nil } diff --git a/pkg/shim/service_linux.go b/pkg/shim/service_linux.go index 829f69282..fb2f8b062 100644 --- a/pkg/shim/service_linux.go +++ b/pkg/shim/service_linux.go @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux // +build linux package shim diff --git a/pkg/shim/utils/BUILD b/pkg/shim/utils/BUILD index 54a0aabb7..2eb82f63c 100644 --- a/pkg/shim/utils/BUILD +++ b/pkg/shim/utils/BUILD @@ -6,6 +6,7 @@ go_library( name = "utils", srcs = [ "annotations.go", + "errors.go", "utils.go", "volumes.go", ], @@ -14,14 +15,23 @@ go_library( "//shim:__subpackages__", ], deps = [ + "@com_github_containerd_containerd//errdefs:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + "@org_golang_google_grpc//codes:go_default_library", + "@org_golang_google_grpc//status:go_default_library", ], ) go_test( name = "utils_test", size = "small", - srcs = ["volumes_test.go"], + srcs = [ + "errors_test.go", + "volumes_test.go", + ], library = ":utils", - deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"], + deps = [ + "@com_github_containerd_containerd//errdefs:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], ) diff --git a/pkg/shim/errors.go b/pkg/shim/utils/errors.go index 75d036411..971d68c36 100644 --- a/pkg/shim/errors.go +++ b/pkg/shim/utils/errors.go @@ -12,23 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. -package shim +package utils import ( "context" "errors" + "fmt" "github.com/containerd/containerd/errdefs" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) -// errToGRPC wraps containerd's ToGRPC error mapper which depends on +// ErrToGRPC wraps containerd's ToGRPC error mapper which depends on // github.com/pkg/errors to work correctly. Once we upgrade to containerd v1.4, // this function can go away and we can use errdefs.ToGRPC directly instead. 
// // TODO(gvisor.dev/issue/6232): Remove after upgrading to containerd v1.4 -func errToGRPC(err error) error { +func ErrToGRPC(err error) error { + return errToGRPCMsg(err, err.Error()) +} + +// ErrToGRPCf maps the error to grpc error codes, assembling the formatting +// string and combining it with the target error string. +// +// TODO(gvisor.dev/issue/6232): Remove after upgrading to containerd v1.4 +func ErrToGRPCf(err error, format string, args ...interface{}) error { + formatted := fmt.Sprintf(format, args...) + msg := fmt.Sprintf("%s: %s", formatted, err.Error()) + return errToGRPCMsg(err, msg) +} + +func errToGRPCMsg(err error, msg string) error { if err == nil { return nil } @@ -38,21 +53,21 @@ func errToGRPC(err error) error { switch { case errors.Is(err, errdefs.ErrInvalidArgument): - return status.Errorf(codes.InvalidArgument, err.Error()) + return status.Errorf(codes.InvalidArgument, msg) case errors.Is(err, errdefs.ErrNotFound): - return status.Errorf(codes.NotFound, err.Error()) + return status.Errorf(codes.NotFound, msg) case errors.Is(err, errdefs.ErrAlreadyExists): - return status.Errorf(codes.AlreadyExists, err.Error()) + return status.Errorf(codes.AlreadyExists, msg) case errors.Is(err, errdefs.ErrFailedPrecondition): - return status.Errorf(codes.FailedPrecondition, err.Error()) + return status.Errorf(codes.FailedPrecondition, msg) case errors.Is(err, errdefs.ErrUnavailable): - return status.Errorf(codes.Unavailable, err.Error()) + return status.Errorf(codes.Unavailable, msg) case errors.Is(err, errdefs.ErrNotImplemented): - return status.Errorf(codes.Unimplemented, err.Error()) + return status.Errorf(codes.Unimplemented, msg) case errors.Is(err, context.Canceled): - return status.Errorf(codes.Canceled, err.Error()) + return status.Errorf(codes.Canceled, msg) case errors.Is(err, context.DeadlineExceeded): - return status.Errorf(codes.DeadlineExceeded, err.Error()) + return status.Errorf(codes.DeadlineExceeded, msg) } return errdefs.ToGRPC(err) 
diff --git a/pkg/shim/errors_test.go b/pkg/shim/utils/errors_test.go index 3c10866cc..0a8fe34c8 100644 --- a/pkg/shim/errors_test.go +++ b/pkg/shim/utils/errors_test.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package shim +package utils import ( "fmt" @@ -39,8 +39,11 @@ func TestGRPCRoundTripsErrors(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - if err := errdefs.FromGRPC(errToGRPC(tc.err)); !tc.test(err) { - t.Errorf("got %+v", err) + if err := errdefs.FromGRPC(ErrToGRPC(tc.err)); !tc.test(err) { + t.Errorf("errToGRPC got %+v", err) + } + if err := errdefs.FromGRPC(ErrToGRPCf(tc.err, "testing %s", "123")); !tc.test(err) { + t.Errorf("errToGRPCf got %+v", err) } }) } diff --git a/pkg/state/state_norace.go b/pkg/state/state_norace.go index 4281aed6d..be09d6141 100644 --- a/pkg/state/state_norace.go +++ b/pkg/state/state_norace.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !race // +build !race package state diff --git a/pkg/state/state_race.go b/pkg/state/state_race.go index 8232981ce..c9f4fd5cf 100644 --- a/pkg/state/state_race.go +++ b/pkg/state/state_race.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build race // +build race package state diff --git a/pkg/state/tests/register_test.go b/pkg/state/tests/register_test.go index 75bdbfc6e..2199d6b01 100644 --- a/pkg/state/tests/register_test.go +++ b/pkg/state/tests/register_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build race // +build race package tests diff --git a/pkg/sync/checklocks_off_unsafe.go b/pkg/sync/checklocks_off_unsafe.go index 62c81b149..87c56dd12 100644 --- a/pkg/sync/checklocks_off_unsafe.go +++ b/pkg/sync/checklocks_off_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build !checklocks // +build !checklocks package sync diff --git a/pkg/sync/checklocks_on_unsafe.go b/pkg/sync/checklocks_on_unsafe.go index 24f933ed1..f2bfde083 100644 --- a/pkg/sync/checklocks_on_unsafe.go +++ b/pkg/sync/checklocks_on_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build checklocks // +build checklocks package sync diff --git a/pkg/sync/goyield_go113_unsafe.go b/pkg/sync/goyield_go113_unsafe.go index 8aee0d455..c4b03e9aa 100644 --- a/pkg/sync/goyield_go113_unsafe.go +++ b/pkg/sync/goyield_go113_unsafe.go @@ -3,8 +3,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.14 +//go:build go1.13 && !go1.14 +// +build go1.13,!go1.14 package sync diff --git a/pkg/sync/goyield_unsafe.go b/pkg/sync/goyield_unsafe.go index f3cc12163..757edbaba 100644 --- a/pkg/sync/goyield_unsafe.go +++ b/pkg/sync/goyield_unsafe.go @@ -3,10 +3,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build go1.14 // +build go1.14 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. 
package sync diff --git a/pkg/sync/mutex_test.go b/pkg/sync/mutex_test.go index 4fb51a8ab..9e4e3f0b2 100644 --- a/pkg/sync/mutex_test.go +++ b/pkg/sync/mutex_test.go @@ -64,7 +64,7 @@ func TestTryLockUnlock(t *testing.T) { if !m.TryLock() { t.Fatal("failed to aquire lock") } - m.Unlock() + m.Unlock() // +checklocksforce if !m.TryLock() { t.Fatal("failed to aquire lock after unlock") } diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go index 411a80a8a..e00d9467d 100644 --- a/pkg/sync/mutex_unsafe.go +++ b/pkg/sync/mutex_unsafe.go @@ -3,8 +3,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.18 +//go:build go1.13 && !go1.18 +// +build go1.13,!go1.18 // When updating the build constraint (above), check that syncMutex matches the // standard library sync.Mutex definition. @@ -32,6 +32,18 @@ func (m *CrossGoroutineMutex) state() *int32 { return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state } +// Lock locks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Lock() { + m.Mutex.Lock() +} + +// Unlock unlocks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Unlock() { + m.Mutex.Unlock() +} + const ( mutexUnlocked = 0 mutexLocked = 1 @@ -62,6 +74,7 @@ type Mutex struct { // Lock locks m. If the lock is already in use, the calling goroutine blocks // until the mutex is available. +// +checklocksignore func (m *Mutex) Lock() { noteLock(unsafe.Pointer(m)) m.m.Lock() @@ -80,6 +93,7 @@ func (m *Mutex) Unlock() { // TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. +// +checklocksignore func (m *Mutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. 
noteLock(unsafe.Pointer(m)) diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go index 70b5f3a5e..8eca99134 100644 --- a/pkg/sync/norace_unsafe.go +++ b/pkg/sync/norace_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build !race // +build !race package sync diff --git a/pkg/sync/race_amd64.s b/pkg/sync/race_amd64.s index 57bc0ec79..199602387 100644 --- a/pkg/sync/race_amd64.s +++ b/pkg/sync/race_amd64.s @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build race -// +build amd64 +//go:build race && amd64 +// +build race,amd64 #include "textflag.h" diff --git a/pkg/sync/race_arm64.s b/pkg/sync/race_arm64.s index 88f091fda..c4192e870 100644 --- a/pkg/sync/race_arm64.s +++ b/pkg/sync/race_arm64.s @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build race -// +build arm64 +//go:build race && arm64 +// +build race,arm64 #include "textflag.h" diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go index 59985c270..381163cac 100644 --- a/pkg/sync/race_unsafe.go +++ b/pkg/sync/race_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build race // +build race package sync diff --git a/pkg/sync/runtime_unsafe.go b/pkg/sync/runtime_unsafe.go index 39c766331..49d4109a9 100644 --- a/pkg/sync/runtime_unsafe.go +++ b/pkg/sync/runtime_unsafe.go @@ -3,11 +3,16 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.18 +//go:build go1.13 && !go1.18 +// +build go1.13,!go1.18 -// Check go:linkname function signatures, type definitions, and constants when -// updating Go version. +// //go:linkname directives type-checked by checklinkname. 
Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. + +// Check type definitions and constants when updating Go version. +// +// TODO(b/165820485): add these checks to checklinkname. package sync @@ -109,10 +114,10 @@ type maptype struct { // These functions are only used within the sync package. //go:linkname semacquire sync.runtime_Semacquire -func semacquire(s *uint32) +func semacquire(addr *uint32) //go:linkname semrelease sync.runtime_Semrelease -func semrelease(s *uint32, handoff bool, skipframes int) +func semrelease(addr *uint32, handoff bool, skipframes int) //go:linkname canSpin sync.runtime_canSpin func canSpin(i int) bool diff --git a/pkg/sync/rwmutex_test.go b/pkg/sync/rwmutex_test.go index 5ca96d12b..56a88e712 100644 --- a/pkg/sync/rwmutex_test.go +++ b/pkg/sync/rwmutex_test.go @@ -172,7 +172,7 @@ func TestRWTryLockUnlock(t *testing.T) { if !rwm.TryLock() { t.Fatal("failed to aquire lock") } - rwm.Unlock() + rwm.Unlock() // +checklocksforce if !rwm.TryLock() { t.Fatal("failed to aquire lock after unlock") } diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go index 892d3e641..7829b06db 100644 --- a/pkg/sync/rwmutex_unsafe.go +++ b/pkg/sync/rwmutex_unsafe.go @@ -37,6 +37,7 @@ const rwmutexMaxReaders = 1 << 30 // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *CrossGoroutineRWMutex) TryRLock() bool { if RaceEnabled { RaceDisable() @@ -65,6 +66,7 @@ func (rw *CrossGoroutineRWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. 
+// +checklocksignore func (rw *CrossGoroutineRWMutex) RLock() { if RaceEnabled { RaceDisable() @@ -83,6 +85,7 @@ func (rw *CrossGoroutineRWMutex) RLock() { // // Preconditions: // * rw is locked for reading. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) @@ -134,6 +137,7 @@ func (rw *CrossGoroutineRWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() @@ -228,6 +232,7 @@ type RWMutex struct { // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryRLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -243,6 +248,7 @@ func (rw *RWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *RWMutex) RLock() { noteLock(unsafe.Pointer(rw)) rw.m.RLock() @@ -261,6 +267,7 @@ func (rw *RWMutex) RUnlock() { // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -273,6 +280,7 @@ func (rw *RWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. 
+// +checklocksignore func (rw *RWMutex) Lock() { noteLock(unsafe.Pointer(rw)) rw.m.Lock() diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 5205fa7e4..ceee494fc 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -12,6 +12,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux/errno", + "//pkg/errors", "//pkg/errors/linuxerr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index c8c10f48b..fb92738af 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package syserr diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index 7d0a5125b..558240008 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -22,6 +22,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/syserror" ) @@ -51,7 +52,7 @@ func New(message string, linuxTranslation errno.Errno) *Error { } e := error(unix.Errno(err.errno)) - // syserror.ErrWouldBlock gets translated to syserror.EWOULDBLOCK and + // syserror.ErrWouldBlock gets translated to linuxerr.EWOULDBLOCK and // enables proper blocking semantics. This should temporary address the // class of blocking bugs that keep popping up with the current state of // the error space. 
@@ -281,6 +282,11 @@ func FromError(err error) *Error { if errno, ok := err.(unix.Errno); ok { return FromHost(errno) } + + if linuxErr, ok := err.(*errors.Error); ok { + return FromHost(unix.Errno(linuxErr.Errno())) + } + if errno, ok := syserror.TranslateError(err); ok { return FromHost(errno) } diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 56b621357..b24edb364 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -26,63 +26,16 @@ import ( // The following variables have the same meaning as their syscall equivalent. var ( - E2BIG = error(unix.E2BIG) - EACCES = error(unix.EACCES) - EADDRINUSE = error(unix.EADDRINUSE) - EAGAIN = error(unix.EAGAIN) - EBADF = error(unix.EBADF) - EBADFD = error(unix.EBADFD) - EBUSY = error(unix.EBUSY) - ECHILD = error(unix.ECHILD) - ECONNABORTED = error(unix.ECONNABORTED) - ECONNREFUSED = error(unix.ECONNREFUSED) - ECONNRESET = error(unix.ECONNRESET) - EDEADLK = error(unix.EDEADLK) - EEXIST = error(unix.EEXIST) - EFAULT = error(unix.EFAULT) - EFBIG = error(unix.EFBIG) - EIDRM = error(unix.EIDRM) - EINTR = error(unix.EINTR) - EINVAL = error(unix.EINVAL) - EIO = error(unix.EIO) - EISDIR = error(unix.EISDIR) - ELIBBAD = error(unix.ELIBBAD) - ELOOP = error(unix.ELOOP) - EMFILE = error(unix.EMFILE) - EMLINK = error(unix.EMLINK) - EMSGSIZE = error(unix.EMSGSIZE) - ENAMETOOLONG = error(unix.ENAMETOOLONG) - ENOATTR = ENODATA - ENOBUFS = error(unix.ENOBUFS) - ENODATA = error(unix.ENODATA) - ENODEV = error(unix.ENODEV) - ENOENT = error(unix.ENOENT) - ENOEXEC = error(unix.ENOEXEC) - ENOLCK = error(unix.ENOLCK) - ENOLINK = error(unix.ENOLINK) - ENOMEM = error(unix.ENOMEM) - ENOSPC = error(unix.ENOSPC) - ENOSYS = error(unix.ENOSYS) - ENOTCONN = error(unix.ENOTCONN) - ENOTDIR = error(unix.ENOTDIR) - ENOTEMPTY = error(unix.ENOTEMPTY) - ENOTSOCK = error(unix.ENOTSOCK) - ENOTSUP = error(unix.ENOTSUP) - ENOTTY = error(unix.ENOTTY) - ENXIO = error(unix.ENXIO) - EOPNOTSUPP = error(unix.EOPNOTSUPP) - EOVERFLOW = 
error(unix.EOVERFLOW) - EPERM = error(unix.EPERM) - EPIPE = error(unix.EPIPE) - ERANGE = error(unix.ERANGE) - EREMOTE = error(unix.EREMOTE) - EROFS = error(unix.EROFS) - ESPIPE = error(unix.ESPIPE) - ESRCH = error(unix.ESRCH) - ETIMEDOUT = error(unix.ETIMEDOUT) - EUSERS = error(unix.EUSERS) - EWOULDBLOCK = error(unix.EWOULDBLOCK) - EXDEV = error(unix.EXDEV) + EIDRM = error(unix.EIDRM) + EINTR = error(unix.EINTR) + EIO = error(unix.EIO) + EISDIR = error(unix.EISDIR) + ENOENT = error(unix.ENOENT) + ENOEXEC = error(unix.ENOEXEC) + ENOMEM = error(unix.ENOMEM) + ENOTSOCK = error(unix.ENOTSOCK) + ENOSPC = error(unix.ENOSPC) + ENOSYS = error(unix.ENOSYS) ) var ( diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go index b1f39e6e6..a647ea968 100644 --- a/pkg/tcpip/header/ndp_options.go +++ b/pkg/tcpip/header/ndp_options.go @@ -233,6 +233,17 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) { case ndpNonceOptionType: return NDPNonceOption(body), false, nil + case ndpRouteInformationType: + if numBodyBytes > ndpRouteInformationMaxLength { + return nil, true, fmt.Errorf("got %d bytes for NDP Route Information option's body, expected at max %d bytes: %w", numBodyBytes, ndpRouteInformationMaxLength, ErrNDPOptMalformedBody) + } + opt := NDPRouteInformation(body) + if err := opt.hasError(); err != nil { + return nil, true, err + } + + return opt, false, nil + case ndpPrefixInformationType: // Make sure the length of a Prefix Information option // body is ndpPrefixInformationLength, as per RFC 4861 @@ -930,3 +941,137 @@ func isUpperLetter(b byte) bool { func isDigit(b byte) bool { return b >= '0' && b <= '9' } + +// As per RFC 4191 section 2.3, +// +// 2.3. 
Route Information Option +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Type | Length | Prefix Length |Resvd|Prf|Resvd| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Route Lifetime | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Prefix (Variable Length) | +// . . +// . . +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// Fields: +// +// Type 24 +// +// +// Length 8-bit unsigned integer. The length of the option +// (including the Type and Length fields) in units of 8 +// octets. The Length field is 1, 2, or 3 depending on the +// Prefix Length. If Prefix Length is greater than 64, then +// Length must be 3. If Prefix Length is greater than 0, +// then Length must be 2 or 3. If Prefix Length is zero, +// then Length must be 1, 2, or 3. +const ( + ndpRouteInformationType = ndpOptionIdentifier(24) + ndpRouteInformationMaxLength = 22 + + ndpRouteInformationPrefixLengthIdx = 0 + ndpRouteInformationFlagsIdx = 1 + ndpRouteInformationPrfShift = 3 + ndpRouteInformationPrfMask = 3 << ndpRouteInformationPrfShift + ndpRouteInformationRouteLifetimeIdx = 2 + ndpRouteInformationRoutePrefixIdx = 6 +) + +// NDPRouteInformation is the NDP Router Information option, as defined by +// RFC 4191 section 2.3. +type NDPRouteInformation []byte + +func (NDPRouteInformation) kind() ndpOptionIdentifier { + return ndpRouteInformationType +} + +func (o NDPRouteInformation) length() int { + return len(o) +} + +func (o NDPRouteInformation) serializeInto(b []byte) int { + return copy(b, o) +} + +// String implements fmt.Stringer. +func (o NDPRouteInformation) String() string { + return fmt.Sprintf("%T", o) +} + +// PrefixLength returns the length of the prefix. 
+func (o NDPRouteInformation) PrefixLength() uint8 { + return o[ndpRouteInformationPrefixLengthIdx] +} + +// RoutePreference returns the preference of the route over other routes to the +// same destination but through a different router. +func (o NDPRouteInformation) RoutePreference() NDPRoutePreference { + return NDPRoutePreference((o[ndpRouteInformationFlagsIdx] & ndpRouteInformationPrfMask) >> ndpRouteInformationPrfShift) +} + +// RouteLifetime returns the lifetime of the route. +// +// Note, a value of 0 implies the route is now invalid and a value of +// infinity/forever is represented by NDPInfiniteLifetime. +func (o NDPRouteInformation) RouteLifetime() time.Duration { + return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRouteInformationRouteLifetimeIdx:])) +} + +// Prefix returns the prefix of the destination subnet this route is for. +func (o NDPRouteInformation) Prefix() (tcpip.Subnet, error) { + prefixLength := int(o.PrefixLength()) + if max := IPv6AddressSize * 8; prefixLength > max { + return tcpip.Subnet{}, fmt.Errorf("got prefix length = %d, want <= %d", prefixLength, max) + } + + prefix := o[ndpRouteInformationRoutePrefixIdx:] + var addrBytes [IPv6AddressSize]byte + if n := copy(addrBytes[:], prefix); n != len(prefix) { + panic(fmt.Sprintf("got copy(addrBytes, prefix) = %d, want = %d", n, len(prefix))) + } + + return tcpip.AddressWithPrefix{ + Address: tcpip.Address(addrBytes[:]), + PrefixLen: prefixLength, + }.Subnet(), nil +} + +func (o NDPRouteInformation) hasError() error { + l := len(o) + if l < ndpRouteInformationRoutePrefixIdx { + return fmt.Errorf("%T too small, got = %d bytes: %w", o, l, ErrNDPOptMalformedBody) + } + + prefixLength := int(o.PrefixLength()) + if max := IPv6AddressSize * 8; prefixLength > max { + return fmt.Errorf("got prefix length = %d, want <= %d: %w", prefixLength, max, ErrNDPOptMalformedBody) + } + + // Length 8-bit unsigned integer. 
The length of the option + // (including the Type and Length fields) in units of 8 + // octets. The Length field is 1, 2, or 3 depending on the + // Prefix Length. If Prefix Length is greater than 64, then + // Length must be 3. If Prefix Length is greater than 0, + // then Length must be 2 or 3. If Prefix Length is zero, + // then Length must be 1, 2, or 3. + l += 2 // Add 2 bytes for the type and length bytes. + lengthField := l / lengthByteUnits + if prefixLength > 64 { + if lengthField != 3 { + return fmt.Errorf("Length field must be 3 when Prefix Length (%d) is > 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) + } + } else if prefixLength > 0 { + if lengthField != 2 && lengthField != 3 { + return fmt.Errorf("Length field must be 2 or 3 when Prefix Length (%d) is between 0 and 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) + } + } else if lengthField == 0 || lengthField > 3 { + return fmt.Errorf("Length field must be 1, 2, or 3 when Prefix Length is zero (got = %d): %w", lengthField, ErrNDPOptMalformedBody) + } + + return nil +} diff --git a/pkg/tcpip/header/ndp_router_advert.go b/pkg/tcpip/header/ndp_router_advert.go index 7e2f0c797..7d6efa083 100644 --- a/pkg/tcpip/header/ndp_router_advert.go +++ b/pkg/tcpip/header/ndp_router_advert.go @@ -16,9 +16,12 @@ package header import ( "encoding/binary" + "fmt" "time" ) +var _ fmt.Stringer = NDPRoutePreference(0) + // NDPRoutePreference is the preference values for default routers or // more-specific routes. // @@ -64,6 +67,22 @@ const ( ReservedRoutePreference = 0b10 ) +// String implements fmt.Stringer. 
+func (p NDPRoutePreference) String() string { + switch p { + case HighRoutePreference: + return "HighRoutePreference" + case MediumRoutePreference: + return "MediumRoutePreference" + case LowRoutePreference: + return "LowRoutePreference" + case ReservedRoutePreference: + return "ReservedRoutePreference" + default: + return fmt.Sprintf("NDPRoutePreference(%d)", p) + } +} + // NDPRouterAdvert is an NDP Router Advertisement message. It will only contain // the body of an ICMPv6 packet. // diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go index 8fd1f7d13..2a897e938 100644 --- a/pkg/tcpip/header/ndp_test.go +++ b/pkg/tcpip/header/ndp_test.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "regexp" + "strings" "testing" "time" @@ -58,6 +59,224 @@ func TestNDPNeighborSolicit(t *testing.T) { } } +func TestNDPRouteInformationOption(t *testing.T) { + tests := []struct { + name string + + length uint8 + prefixLength uint8 + prf NDPRoutePreference + lifetimeS uint32 + prefixBytes []byte + expectedPrefix tcpip.Subnet + + expectedErr error + }{ + { + name: "Length=1 with Prefix Length = 0", + length: 1, + prefixLength: 0, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: IPv6EmptySubnet, + }, + { + name: "Length=1 but Prefix Length > 0", + length: 1, + prefixLength: 1, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedErr: ErrNDPOptMalformedBody, + }, + { + name: "Length=2 with Prefix Length = 0", + length: 2, + prefixLength: 0, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: IPv6EmptySubnet, + }, + { + name: "Length=2 with Prefix Length in [1, 64] (1)", + length: 2, + prefixLength: 1, + prf: LowRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 1, + }.Subnet(), + }, + { + name: "Length=2 with Prefix Length in [1, 64] (64)", + length: 2, + 
prefixLength: 64, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 64, + }.Subnet(), + }, + { + name: "Length=2 with Prefix Length > 64", + length: 2, + prefixLength: 65, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedErr: ErrNDPOptMalformedBody, + }, + { + name: "Length=3 with Prefix Length = 0", + length: 3, + prefixLength: 0, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: IPv6EmptySubnet, + }, + { + name: "Length=3 with Prefix Length in [1, 64] (1)", + length: 3, + prefixLength: 1, + prf: LowRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 1, + }.Subnet(), + }, + { + name: "Length=3 with Prefix Length in [1, 64] (64)", + length: 3, + prefixLength: 64, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 64, + }.Subnet(), + }, + { + name: "Length=3 with Prefix Length in [65, 128] (65)", + length: 3, + prefixLength: 65, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 65, + }.Subnet(), + }, + { + name: "Length=3 with Prefix Length in [65, 128] (128)", + length: 3, + prefixLength: 128, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 128, + }.Subnet(), + }, + { + name: "Length=3 with (invalid) Prefix Length > 128", + length: 3, + prefixLength: 129, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedErr: 
ErrNDPOptMalformedBody, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + expectedRouteInformationBytes := [...]byte{ + // Type, Length + 24, test.length, + + // Prefix Length, Prf + uint8(test.prefixLength), uint8(test.prf) << 3, + + // Route Lifetime + 0, 0, 0, 0, + + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + } + binary.BigEndian.PutUint32(expectedRouteInformationBytes[4:], test.lifetimeS) + _ = copy(expectedRouteInformationBytes[8:], test.prefixBytes) + + opts := NDPOptions(expectedRouteInformationBytes[:test.length*lengthByteUnits]) + it, err := opts.Iter(false) + if err != nil { + t.Fatalf("got Iter(false) = (_, %s), want = (_, nil)", err) + } + opt, done, err := it.Next() + if !errors.Is(err, test.expectedErr) { + t.Fatalf("got Next() = (_, _, %s), want = (_, _, %s)", err, test.expectedErr) + } + if want := test.expectedErr != nil; done != want { + t.Fatalf("got Next() = (_, %t, _), want = (_, %t, _)", done, want) + } + if test.expectedErr != nil { + return + } + + if got := opt.kind(); got != ndpRouteInformationType { + t.Errorf("got kind() = %d, want = %d", got, ndpRouteInformationType) + } + + ri, ok := opt.(NDPRouteInformation) + if !ok { + t.Fatalf("got opt = %T, want = NDPRouteInformation", opt) + } + + if got := ri.PrefixLength(); got != test.prefixLength { + t.Errorf("got PrefixLength() = %d, want = %d", got, test.prefixLength) + } + if got := ri.RoutePreference(); got != test.prf { + t.Errorf("got RoutePreference() = %d, want = %d", got, test.prf) + } + if got, want := ri.RouteLifetime(), time.Duration(test.lifetimeS)*time.Second; got != want { + t.Errorf("got RouteLifetime() = %s, want = %s", got, want) + } + if got, err := ri.Prefix(); err != nil { + t.Errorf("Prefix(): %s", err) + } else if got != test.expectedPrefix { + t.Errorf("got Prefix() = %s, want = %s", got, test.expectedPrefix) + } + + // Iterator should not return anything else. 
+ { + next, done, err := it.Next() + if err != nil { + t.Errorf("got Next() = (_, _, %s), want = (_, _, nil)", err) + } + if !done { + t.Error("got Next() = (_, false, _), want = (_, true, _)") + } + if next != nil { + t.Errorf("got Next() = (%x, _, _), want = (nil, _, _)", next) + } + } + }) + } +} + // TestNDPNeighborAdvert tests the functions of NDPNeighborAdvert. func TestNDPNeighborAdvert(t *testing.T) { b := []byte{ @@ -1498,3 +1717,32 @@ func TestNDPOptionsIter(t *testing.T) { t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next) } } + +func TestNDPRoutePreferenceStringer(t *testing.T) { + p := NDPRoutePreference(0) + for { + var wantStr string + switch p { + case 0b01: + wantStr = "HighRoutePreference" + case 0b00: + wantStr = "MediumRoutePreference" + case 0b11: + wantStr = "LowRoutePreference" + case 0b10: + wantStr = "ReservedRoutePreference" + default: + wantStr = fmt.Sprintf("NDPRoutePreference(%d)", p) + } + + if gotStr := p.String(); gotStr != wantStr { + t.Errorf("got NDPRoutePreference(%d).String() = %s, want = %s", p, gotStr, wantStr) + } + + p++ + if p == 0 { + // Overflowed, we hit all values. + break + } + } +} diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 1b56d2b72..e8e716db0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package fdbased provides the implemention of data-link layer endpoints diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 8aad338b6..eccd21579 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux // +build linux package fdbased diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go index df14eaad1..904393faa 100644 --- a/pkg/tcpip/link/fdbased/endpoint_unsafe.go +++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package fdbased diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index 5d698a5e9..bfae34ab9 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build (linux && amd64) || (linux && arm64) // +build linux,amd64 linux,arm64 package fdbased diff --git a/pkg/tcpip/link/fdbased/mmap_stub.go b/pkg/tcpip/link/fdbased/mmap_stub.go index 67be52d67..9d8679502 100644 --- a/pkg/tcpip/link/fdbased/mmap_stub.go +++ b/pkg/tcpip/link/fdbased/mmap_stub.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !linux || (!amd64 && !arm64) // +build !linux !amd64,!arm64 package fdbased diff --git a/pkg/tcpip/link/fdbased/mmap_unsafe.go b/pkg/tcpip/link/fdbased/mmap_unsafe.go index 1293f68a2..58d5dfeef 100644 --- a/pkg/tcpip/link/fdbased/mmap_unsafe.go +++ b/pkg/tcpip/link/fdbased/mmap_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build (linux && amd64) || (linux && arm64) // +build linux,amd64 linux,arm64 package fdbased diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 4b7ef3aac..ab2855a63 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package fdbased diff --git a/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go index 2206fe0e6..c1438da21 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux && !amd64 && !arm64 // +build linux,!amd64,!arm64 package rawfile diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go index 5002245a1..0b7b9e3de 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build ((linux && amd64) || (linux && arm64)) && go1.12 // +build linux,amd64 linux,arm64 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. 
package rawfile diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 9743e70ea..7e21a78d4 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package rawfile diff --git a/pkg/tcpip/link/rawfile/errors_test.go b/pkg/tcpip/link/rawfile/errors_test.go index 8f4bd60da..1b88c309b 100644 --- a/pkg/tcpip/link/rawfile/errors_test.go +++ b/pkg/tcpip/link/rawfile/errors_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package rawfile diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 43fe57830..53448a641 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package rawfile contains utilities for using the netstack with raw host diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index 8e6f3e5e3..e882a128c 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package sharedmem diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index df9a0b90a..30cf659b8 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux // +build linux // Package sharedmem provides the implemention of data-link layer endpoints diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 0f72d4e95..d6d953085 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package sharedmem diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go index c16c19647..d3edede63 100644 --- a/pkg/tcpip/link/sniffer/pcap.go +++ b/pkg/tcpip/link/sniffer/pcap.go @@ -14,7 +14,14 @@ package sniffer -import "time" +import ( + "encoding" + "encoding/binary" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) type pcapHeader struct { // MagicNumber is the file magic number. @@ -39,28 +46,38 @@ type pcapHeader struct { Network uint32 } -const pcapPacketHeaderLen = 16 - -type pcapPacketHeader struct { - // Seconds is the timestamp seconds. - Seconds uint32 - - // Microseconds is the timestamp microseconds. - Microseconds uint32 +var _ encoding.BinaryMarshaler = (*pcapPacket)(nil) - // IncludedLength is the number of octets of packet saved in file. - IncludedLength uint32 - - // OriginalLength is the actual length of packet. 
- OriginalLength uint32 +type pcapPacket struct { + timestamp time.Time + packet *stack.PacketBuffer + maxCaptureLen int } -func newPCAPPacketHeader(incLen, orgLen uint32) pcapPacketHeader { - now := time.Now() - return pcapPacketHeader{ - Seconds: uint32(now.Unix()), - Microseconds: uint32(now.Nanosecond() / 1000), - IncludedLength: incLen, - OriginalLength: orgLen, +func (p *pcapPacket) MarshalBinary() ([]byte, error) { + packetSize := p.packet.Size() + captureLen := p.maxCaptureLen + if packetSize < captureLen { + captureLen = packetSize + } + b := make([]byte, 16+captureLen) + binary.BigEndian.PutUint32(b[0:4], uint32(p.timestamp.Unix())) + binary.BigEndian.PutUint32(b[4:8], uint32(p.timestamp.Nanosecond()/1000)) + binary.BigEndian.PutUint32(b[8:12], uint32(captureLen)) + binary.BigEndian.PutUint32(b[12:16], uint32(packetSize)) + w := tcpip.SliceWriter(b[16:]) + for _, v := range p.packet.Views() { + if captureLen == 0 { + break + } + if len(v) > captureLen { + v = v[:captureLen] + } + n, err := w.Write(v) + if err != nil { + panic(err) + } + captureLen -= n } + return b, nil } diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 2d6a3a833..28a172e71 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -87,11 +87,7 @@ func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoin } func zoneOffset() (int32, error) { - loc, err := time.LoadLocation("Local") - if err != nil { - return 0, err - } - date := time.Date(0, 0, 0, 0, 0, 0, 0, loc) + date := time.Date(0, 0, 0, 0, 0, 0, 0, time.Local) _, offset := date.Zone() return int32(offset), nil } @@ -117,8 +113,9 @@ func writePCAPHeader(w io.Writer, maxLen uint32) error { // NewWithWriter creates a new sniffer link-layer endpoint. It wraps around // another endpoint and logs packets as they traverse the endpoint. // -// Packets are logged to writer in the pcap format. 
A sniffer created with this -// function will not emit packets using the standard log package. +// Each packet is written to writer in the pcap format in a single Write call +// without synchronization. A sniffer created with this function will not emit +// packets using the standard log package. // // snapLen is the maximum amount of a packet to be saved. Packets with a length // less than or equal to snapLen will be saved in their entirety. Longer @@ -154,32 +151,17 @@ func (e *endpoint) dumpPacket(dir direction, protocol tcpip.NetworkProtocolNumbe logPacket(e.logPrefix, dir, protocol, pkt) } if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 { - totalLength := pkt.Size() - length := totalLength - if max := int(e.maxPCAPLen); length > max { - length = max + packet := pcapPacket{ + timestamp: time.Now(), + packet: pkt, + maxCaptureLen: int(e.maxPCAPLen), } - if err := binary.Write(writer, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil { + b, err := packet.MarshalBinary() + if err != nil { panic(err) } - write := func(b []byte) { - if len(b) > length { - b = b[:length] - } - for len(b) != 0 { - n, err := writer.Write(b) - if err != nil { - panic(err) - } - b = b[n:] - length -= n - } - } - for _, v := range pkt.Views() { - if length == 0 { - break - } - write(v) + if _, err := writer.Write(b); err != nil { + panic(err) } } } diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 7656cca6a..4758a99ad 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -26,6 +26,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 36af2a029..d23210503 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" 
"gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -88,12 +89,12 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags Flags) error { defer d.mu.Unlock() if d.endpoint != nil { - return syserror.EINVAL + return linuxerr.EINVAL } // Input validation. if flags.TAP && flags.TUN || !flags.TAP && !flags.TUN { - return syserror.EINVAL + return linuxerr.EINVAL } prefix := "tun" @@ -108,7 +109,7 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags Flags) error { endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps) if err != nil { - return syserror.EINVAL + return linuxerr.EINVAL } d.endpoint = endpoint @@ -125,7 +126,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE endpoint, ok := linkEP.(*tunEndpoint) if !ok { // Not a NIC created by tun device. - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } if !endpoint.TryIncRef() { // Race detected: NIC got deleted in between. @@ -159,7 +160,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE // Race detected: A NIC has been created in between. continue default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } } @@ -170,7 +171,7 @@ func (d *Device) Write(data []byte) (int64, error) { endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { - return 0, syserror.EBADFD + return 0, linuxerr.EBADFD } if !endpoint.IsAttached() { return 0, syserror.EIO @@ -207,6 +208,15 @@ func (d *Device) Write(data []byte) (int64, error) { protocol = pktInfoHdr.Protocol() case ethHdr != nil: protocol = ethHdr.Type() + case d.flags.TUN: + // TUN interface with IFF_NO_PI enabled, thus + // we need to determine protocol from version field + version := data[0] >> 4 + if version == 4 { + protocol = header.IPv4ProtocolNumber + } else if version == 6 { + protocol = header.IPv6ProtocolNumber + } } // Try to determine remote link address, default zero. 
@@ -233,7 +243,7 @@ func (d *Device) Read() ([]byte, error) { endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { - return nil, syserror.EBADFD + return nil, linuxerr.EBADFD } for { @@ -264,13 +274,6 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { vv.AppendView(buffer.View(hdr)) } - // If the packet does not already have link layer header, and the route - // does not exist, we can't compute it. This is possibly a raw packet, tun - // device doesn't support this at the moment. - if info.Pkt.LinkHeader().View().IsEmpty() && len(info.Route.RemoteLinkAddress) == 0 { - return nil, false - } - // Ethernet header (TAP only). if d.flags.TAP { // Add ethernet header if not provided. diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go index 0591fbd63..db4338e79 100644 --- a/pkg/tcpip/link/tun/tun_unsafe.go +++ b/pkg/tcpip/link/tun/tun_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package tun contains methods to open TAP and TUN devices. diff --git a/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go b/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go index 0b51563cd..1261ad414 100644 --- a/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go +++ b/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go @@ -126,7 +126,7 @@ func (m *mockMulticastGroupProtocol) sendQueuedReports() { // Precondition: m.mu must be read locked. func (m *mockMulticastGroupProtocol) Enabled() bool { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatal("got write lock, expected to not take the lock; generic multicast protocol must take the read or write lock before calling Enabled") } @@ -138,11 +138,11 @@ func (m *mockMulticastGroupProtocol) Enabled() bool { // Precondition: m.mu must be locked. 
func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) } if m.mu.TryRLock() { - m.mu.RUnlock() + m.mu.RUnlock() // +checklocksforce: TryLock. m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) } @@ -155,11 +155,11 @@ func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (boo // Precondition: m.mu must be locked. func (m *mockMulticastGroupProtocol) SendLeave(groupAddress tcpip.Address) tcpip.Error { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) } if m.mu.TryRLock() { - m.mu.RUnlock() + m.mu.RUnlock() // +checklocksforce: TryLock. m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) } diff --git a/pkg/tcpip/network/ipv6/ndp.go b/pkg/tcpip/network/ipv6/ndp.go index 9cd283eba..8837d66d8 100644 --- a/pkg/tcpip/network/ipv6/ndp.go +++ b/pkg/tcpip/network/ipv6/ndp.go @@ -54,6 +54,11 @@ const ( // Advertisements, as a host. defaultDiscoverDefaultRouters = true + // defaultDiscoverMoreSpecificRoutes is the default configuration for + // whether or not to discover more-specific routes from incoming Router + // Advertisements, as a host. + defaultDiscoverMoreSpecificRoutes = true + // defaultDiscoverOnLinkPrefixes is the default configuration for // whether or not to discover on-link prefixes from incoming Router // Advertisements' Prefix Information option, as a host. 
@@ -352,12 +357,18 @@ type NDPConfigurations struct { // DiscoverDefaultRouters determines whether or not default routers are // discovered from Router Advertisements, as per RFC 4861 section 6. This - // configuration is ignored if HandleRAs is false. + // configuration is ignored if RAs will not be processed (see HandleRAs). DiscoverDefaultRouters bool + // DiscoverMoreSpecificRoutes determines whether or not more specific routes + // are discovered from Router Advertisements, as per RFC 4191. This + // configuration is ignored if RAs will not be processed (see HandleRAs). + DiscoverMoreSpecificRoutes bool + // DiscoverOnLinkPrefixes determines whether or not on-link prefixes are // discovered from Router Advertisements' Prefix Information option, as per - // RFC 4861 section 6. This configuration is ignored if HandleRAs is false. + // RFC 4861 section 6. This configuration is ignored if RAs will not be + // processed (see HandleRAs). DiscoverOnLinkPrefixes bool // AutoGenGlobalAddresses determines whether or not an IPv6 endpoint performs @@ -408,6 +419,7 @@ func DefaultNDPConfigurations() NDPConfigurations { MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay, HandleRAs: defaultHandleRAs, DiscoverDefaultRouters: defaultDiscoverDefaultRouters, + DiscoverMoreSpecificRoutes: defaultDiscoverMoreSpecificRoutes, DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes, AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses, AutoGenTempGlobalAddresses: defaultAutoGenTempGlobalAddresses, @@ -786,6 +798,32 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { if opt.AutonomousAddressConfigurationFlag() { ndp.handleAutonomousPrefixInformation(opt) } + + case header.NDPRouteInformation: + if !ndp.configs.DiscoverMoreSpecificRoutes { + continue + } + + dest, err := opt.Prefix() + if err != nil { + panic(fmt.Sprintf("%T.Prefix(): %s", opt, err)) + } + + prf := opt.RoutePreference() + if prf == header.ReservedRoutePreference { + // As per RFC 4191 
section 2.3, + // + // Prf (Route Preference) + // 2-bit signed integer. The Route Preference indicates + // whether to prefer the router associated with this prefix + // over others, when multiple identical prefixes (for + // different routers) have been received. If the Reserved + // (10) value is received, the Route Information Option MUST + // be ignored. + continue + } + + ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: dest, router: ip}, opt.RouteLifetime(), prf) } // TODO(b/141556115): Do (MTU) Parameter Discovery. diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index b7f6d52ae..fe98a52af 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -12,6 +12,7 @@ go_library( deps = [ "//pkg/sync", "//pkg/tcpip", + "//pkg/tcpip/header", ], ) diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 854d6a6ba..fb8ef1ee2 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( @@ -122,7 +123,7 @@ type deviceToDest map[tcpip.NICID]destToCounter // If either of the port reuse flags is enabled on any of the nodes, all nodes // sharing a port must share at least one reuse flag. This matches Linux's // behavior. 
-func (dd deviceToDest) isAvailable(res Reservation) bool { +func (dd deviceToDest) isAvailable(res Reservation, portSpecified bool) bool { flagBits := res.Flags.Bits() if res.BindToDevice == 0 { intersection := FlagMask @@ -138,6 +139,9 @@ func (dd deviceToDest) isAvailable(res Reservation) bool { return false } } + if !portSpecified && res.Transport == header.TCPProtocolNumber { + return false + } return true } @@ -146,16 +150,26 @@ func (dd deviceToDest) isAvailable(res Reservation) bool { if dests, ok := dd[0]; ok { var count int intersection, count = dests.intersectionFlags(res) - if count > 0 && intersection&flagBits == 0 { - return false + if count > 0 { + if intersection&flagBits == 0 { + return false + } + if !portSpecified && res.Transport == header.TCPProtocolNumber { + return false + } } } if dests, ok := dd[res.BindToDevice]; ok { flags, count := dests.intersectionFlags(res) intersection &= flags - if count > 0 && intersection&flagBits == 0 { - return false + if count > 0 { + if intersection&flagBits == 0 { + return false + } + if !portSpecified && res.Transport == header.TCPProtocolNumber { + return false + } } } @@ -168,12 +182,12 @@ type addrToDevice map[tcpip.Address]deviceToDest // isAvailable checks whether an IP address is available to bind to. If the // address is the "any" address, check all other addresses. Otherwise, just // check against the "any" address and the provided address. -func (ad addrToDevice) isAvailable(res Reservation) bool { +func (ad addrToDevice) isAvailable(res Reservation, portSpecified bool) bool { if res.Addr == anyIPAddress { // If binding to the "any" address then check that there are no // conflicts with all addresses. for _, devices := range ad { - if !devices.isAvailable(res) { + if !devices.isAvailable(res, portSpecified) { return false } } @@ -182,14 +196,14 @@ func (ad addrToDevice) isAvailable(res Reservation) bool { // Check that there is no conflict with the "any" address. 
if devices, ok := ad[anyIPAddress]; ok { - if !devices.isAvailable(res) { + if !devices.isAvailable(res, portSpecified) { return false } } // Check that this is no conflict with the provided address. if devices, ok := ad[res.Addr]; ok { - if !devices.isAvailable(res) { + if !devices.isAvailable(res, portSpecified) { return false } } @@ -310,7 +324,7 @@ func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort Por // If a port is specified, just try to reserve it for all network // protocols. if res.Port != 0 { - if !pm.reserveSpecificPortLocked(res) { + if !pm.reserveSpecificPortLocked(res, true /* portSpecified */) { return 0, &tcpip.ErrPortInUse{} } if testPort != nil { @@ -330,7 +344,7 @@ func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort Por // A port wasn't specified, so try to find one. return pm.PickEphemeralPort(rng, func(p uint16) (bool, tcpip.Error) { res.Port = p - if !pm.reserveSpecificPortLocked(res) { + if !pm.reserveSpecificPortLocked(res, false /* portSpecified */) { return false, nil } if testPort != nil { @@ -350,12 +364,12 @@ func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort Por // reserveSpecificPortLocked tries to reserve the given port on all given // protocols. -func (pm *PortManager) reserveSpecificPortLocked(res Reservation) bool { +func (pm *PortManager) reserveSpecificPortLocked(res Reservation, portSpecified bool) bool { // Make sure the port is available. 
for _, network := range res.Networks { desc := portDescriptor{network, res.Transport, res.Port} if addrs, ok := pm.allocatedPorts[desc]; ok { - if !addrs.isAvailable(res) { + if !addrs.isAvailable(res, portSpecified) { return false } } diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index b9a24ff56..009cab643 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // This sample creates a stack with TCP and IPv4 protocols on top of a TUN diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index ef1bfc186..c10b19aa0 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // This sample creates a stack with TCP and IPv4 protocols on top of a TUN diff --git a/pkg/tcpip/socketops.go b/pkg/tcpip/socketops.go index 0ea85f9ed..5642c86f8 100644 --- a/pkg/tcpip/socketops.go +++ b/pkg/tcpip/socketops.go @@ -15,17 +15,11 @@ package tcpip import ( - "math" "sync/atomic" - "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" ) -// PacketOverheadFactor is used to multiply the value provided by the user on a -// SetSockOpt for setting the send/receive buffer sizes sockets. -const PacketOverheadFactor = 2 - // SocketOptionsHandler holds methods that help define endpoint specific // behavior for socket level socket options. These must be implemented by // endpoints to get notified when socket level options are set. @@ -60,7 +54,7 @@ type SocketOptionsHandler interface { // buffer size. It also returns the newly set value. 
OnSetSendBufferSize(v int64) (newSz int64) - // OnSetReceiveBufferSize is invoked to set the SO_RCVBUFSIZE. + // OnSetReceiveBufferSize is invoked by SO_RCVBUF and SO_RCVBUFFORCE. OnSetReceiveBufferSize(v, oldSz int64) (newSz int64) } @@ -213,16 +207,24 @@ type SocketOptions struct { // will not change. getSendBufferLimits GetSendBufferLimits `state:"manual"` + // sendBufSizeMu protects sendBufferSize and calls to + // handler.OnSetSendBufferSize. + sendBufSizeMu sync.Mutex `state:"nosave"` + // sendBufferSize determines the send buffer size for this socket. - sendBufferSize atomicbitops.AlignedAtomicInt64 + sendBufferSize int64 // getReceiveBufferLimits provides the handler to get the min, default and // max size for receive buffer. It is initialized at the creation time and // will not change. getReceiveBufferLimits GetReceiveBufferLimits `state:"manual"` + // receiveBufSizeMu protects receiveBufferSize and calls to + // handler.OnSetReceiveBufferSize. + receiveBufSizeMu sync.Mutex `state:"nosave"` + // receiveBufferSize determines the receive buffer size for this socket. - receiveBufferSize atomicbitops.AlignedAtomicInt64 + receiveBufferSize int64 // mu protects the access to the below fields. mu sync.Mutex `state:"nosave"` @@ -612,81 +614,52 @@ func (so *SocketOptions) SetBindToDevice(bindToDevice int32) Error { return nil } +// SendBufferLimits returns the [min, max) range of allowable send buffer +// sizes. +func (so *SocketOptions) SendBufferLimits() (min, max int64) { + limits := so.getSendBufferLimits(so.stackHandler) + return int64(limits.Min), int64(limits.Max) +} + // GetSendBufferSize gets value for SO_SNDBUF option. func (so *SocketOptions) GetSendBufferSize() int64 { - return so.sendBufferSize.Load() + so.sendBufSizeMu.Lock() + defer so.sendBufSizeMu.Unlock() + return so.sendBufferSize } // SetSendBufferSize sets value for SO_SNDBUF option. notify indicates if the // stack handler should be invoked to set the send buffer size. 
func (so *SocketOptions) SetSendBufferSize(sendBufferSize int64, notify bool) { - v := sendBufferSize - - if !notify { - so.sendBufferSize.Store(v) - return - } - - // Make sure the send buffer size is within the min and max - // allowed. - ss := so.getSendBufferLimits(so.stackHandler) - min := int64(ss.Min) - max := int64(ss.Max) - // Validate the send buffer size with min and max values. - // Multiply it by factor of 2. - if v > max { - v = max - } - - if v < math.MaxInt32/PacketOverheadFactor { - v *= PacketOverheadFactor - if v < min { - v = min - } - } else { - v = math.MaxInt32 + so.sendBufSizeMu.Lock() + defer so.sendBufSizeMu.Unlock() + if notify { + sendBufferSize = so.handler.OnSetSendBufferSize(sendBufferSize) } + so.sendBufferSize = sendBufferSize +} - // Notify endpoint about change in buffer size. - newSz := so.handler.OnSetSendBufferSize(v) - so.sendBufferSize.Store(newSz) +// ReceiveBufferLimits returns the [min, max) range of allowable receive buffer +// sizes. +func (so *SocketOptions) ReceiveBufferLimits() (min, max int64) { + limits := so.getReceiveBufferLimits(so.stackHandler) + return int64(limits.Min), int64(limits.Max) } // GetReceiveBufferSize gets value for SO_RCVBUF option. func (so *SocketOptions) GetReceiveBufferSize() int64 { - return so.receiveBufferSize.Load() + so.receiveBufSizeMu.Lock() + defer so.receiveBufSizeMu.Unlock() + return so.receiveBufferSize } -// SetReceiveBufferSize sets value for SO_RCVBUF option. +// SetReceiveBufferSize sets the value of the SO_RCVBUF option, optionally +// notifying the owning endpoint. func (so *SocketOptions) SetReceiveBufferSize(receiveBufferSize int64, notify bool) { - if !notify { - so.receiveBufferSize.Store(receiveBufferSize) - return - } - - // Make sure the send buffer size is within the min and max - // allowed. 
- v := receiveBufferSize - ss := so.getReceiveBufferLimits(so.stackHandler) - min := int64(ss.Min) - max := int64(ss.Max) - // Validate the send buffer size with min and max values. - if v > max { - v = max - } - - // Multiply it by factor of 2. - if v < math.MaxInt32/PacketOverheadFactor { - v *= PacketOverheadFactor - if v < min { - v = min - } - } else { - v = math.MaxInt32 + so.receiveBufSizeMu.Lock() + defer so.receiveBufSizeMu.Unlock() + if notify { + receiveBufferSize = so.handler.OnSetReceiveBufferSize(receiveBufferSize, so.receiveBufferSize) } - - oldSz := so.receiveBufferSize.Load() - // Notify endpoint about change in buffer size. - newSz := so.handler.OnSetReceiveBufferSize(v, oldSz) - so.receiveBufferSize.Store(newSz) + so.receiveBufferSize = receiveBufferSize } diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go index ce9cebdaa..ae0bb4ace 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state.go +++ b/pkg/tcpip/stack/addressable_endpoint_state.go @@ -249,7 +249,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address // or we are adding a new temporary or permanent address. // // The address MUST be write locked at this point. - defer addrState.mu.Unlock() + defer addrState.mu.Unlock() // +checklocksforce if permanent { if addrState.mu.kind.IsPermanent() { diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 782e74b24..068dab7ce 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -363,7 +363,7 @@ func (ct *ConnTrack) insertConn(conn *conn) { // Unlocking can happen in any order. ct.buckets[tupleBucket].mu.Unlock() if tupleBucket != replyBucket { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } } @@ -626,7 +626,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo // Don't re-unlock if both tuples are in the same bucket. 
if differentBuckets { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } return true diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 9623d9c28..4d5431da1 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -1152,6 +1152,39 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on }) } +// raBufWithRIO returns a valid NDP Router Advertisement with a single Route +// Information option. +// +// All fields in the RA will be zero except the RIO option. +func raBufWithRIO(t *testing.T, ip tcpip.Address, prefix tcpip.AddressWithPrefix, lifetimeSeconds uint32, prf header.NDPRoutePreference) *stack.PacketBuffer { + // buf will hold the route information option after the Type and Length + // fields. + // + // 2.3. Route Information Option + // + // 0 1 2 3 + // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Type | Length | Prefix Length |Resvd|Prf|Resvd| + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Route Lifetime | + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Prefix (Variable Length) | + // . . + // . . + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + var buf [22]byte + buf[0] = uint8(prefix.PrefixLen) + buf[1] = byte(prf) << 3 + binary.BigEndian.PutUint32(buf[2:], lifetimeSeconds) + if n := copy(buf[6:], prefix.Address); n != len(prefix.Address) { + t.Fatalf("got copy(...) 
= %d, want = %d", n, len(prefix.Address)) + } + return raBufWithOpts(ip, 0 /* router lifetime */, header.NDPOptionsSerializer{ + header.NDPRouteInformation(buf[:]), + }) +} + func TestDynamicConfigurationsDisabled(t *testing.T) { const ( nicID = 1 @@ -1308,8 +1341,8 @@ func boolToUint64(v bool) uint64 { return 0 } -func checkOffLinkRouteEvent(e ndpOffLinkRouteEvent, nicID tcpip.NICID, router tcpip.Address, prf header.NDPRoutePreference, updated bool) string { - return cmp.Diff(ndpOffLinkRouteEvent{nicID: nicID, subnet: header.IPv6EmptySubnet, router: router, prf: prf, updated: updated}, e, cmp.AllowUnexported(e)) +func checkOffLinkRouteEvent(e ndpOffLinkRouteEvent, nicID tcpip.NICID, subnet tcpip.Subnet, router tcpip.Address, prf header.NDPRoutePreference, updated bool) string { + return cmp.Diff(ndpOffLinkRouteEvent{nicID: nicID, subnet: subnet, router: router, prf: prf, updated: updated}, e, cmp.AllowUnexported(e)) } func testWithRAs(t *testing.T, f func(*testing.T, ipv6.HandleRAsConfiguration, bool)) { @@ -1342,122 +1375,167 @@ func testWithRAs(t *testing.T, f func(*testing.T, ipv6.HandleRAsConfiguration, b } } -func TestRouterDiscovery(t *testing.T) { +func TestOffLinkRouteDiscovery(t *testing.T) { const nicID = 1 - testWithRAs(t, func(t *testing.T, handleRAs ipv6.HandleRAsConfiguration, forwarding bool) { - ndpDisp := ndpDispatcher{ - offLinkRouteC: make(chan ndpOffLinkRouteEvent, 1), - } - e := channel.New(0, 1280, linkAddr1) - clock := faketime.NewManualClock() - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: handleRAs, - DiscoverDefaultRouters: true, - }, - NDPDisp: &ndpDisp, - })}, - Clock: clock, - }) + moreSpecificPrefix := tcpip.AddressWithPrefix{Address: testutil.MustParse6("a00::"), PrefixLen: 16} + tests := []struct { + name string - expectOffLinkRouteEvent := func(addr tcpip.Address, prf header.NDPRoutePreference, updated 
bool) { - t.Helper() + discoverDefaultRouters bool + discoverMoreSpecificRoutes bool - select { - case e := <-ndpDisp.offLinkRouteC: - if diff := checkOffLinkRouteEvent(e, nicID, addr, prf, updated); diff != "" { - t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) + dest tcpip.Subnet + ra func(*testing.T, tcpip.Address, uint16, header.NDPRoutePreference) *stack.PacketBuffer + }{ + { + name: "Default router discovery", + discoverDefaultRouters: true, + discoverMoreSpecificRoutes: false, + dest: header.IPv6EmptySubnet, + ra: func(_ *testing.T, router tcpip.Address, lifetimeSeconds uint16, prf header.NDPRoutePreference) *stack.PacketBuffer { + return raBufWithPrf(router, lifetimeSeconds, prf) + }, + }, + { + name: "More-specific route discovery", + discoverDefaultRouters: false, + discoverMoreSpecificRoutes: true, + dest: moreSpecificPrefix.Subnet(), + ra: func(t *testing.T, router tcpip.Address, lifetimeSeconds uint16, prf header.NDPRoutePreference) *stack.PacketBuffer { + return raBufWithRIO(t, router, moreSpecificPrefix, uint32(lifetimeSeconds), prf) + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + testWithRAs(t, func(t *testing.T, handleRAs ipv6.HandleRAsConfiguration, forwarding bool) { + ndpDisp := ndpDispatcher{ + offLinkRouteC: make(chan ndpOffLinkRouteEvent, 1), } - default: - t.Fatal("expected router discovery event") - } - } + e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: handleRAs, + DiscoverDefaultRouters: test.discoverDefaultRouters, + DiscoverMoreSpecificRoutes: test.discoverMoreSpecificRoutes, + }, + NDPDisp: &ndpDisp, + })}, + Clock: clock, + }) - expectAsyncOffLinkRouteInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) { - t.Helper() + expectOffLinkRouteEvent := func(addr 
tcpip.Address, prf header.NDPRoutePreference, updated bool) { + t.Helper() - clock.Advance(timeout) - select { - case e := <-ndpDisp.offLinkRouteC: - var prf header.NDPRoutePreference - if diff := checkOffLinkRouteEvent(e, nicID, addr, prf, false); diff != "" { - t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) + select { + case e := <-ndpDisp.offLinkRouteC: + if diff := checkOffLinkRouteEvent(e, nicID, test.dest, addr, prf, updated); diff != "" { + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected router discovery event") + } } - default: - t.Fatal("timed out waiting for router discovery event") - } - } - if err := s.SetForwardingDefaultAndAllNICs(ipv6.ProtocolNumber, forwarding); err != nil { - t.Fatalf("SetForwardingDefaultAndAllNICs(%d, %t): %s", ipv6.ProtocolNumber, forwarding, err) - } + expectAsyncOffLinkRouteInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) { + t.Helper() - if err := s.CreateNIC(nicID, e); err != nil { - t.Fatalf("CreateNIC(%d, _): %s", nicID, err) - } + clock.Advance(timeout) + select { + case e := <-ndpDisp.offLinkRouteC: + var prf header.NDPRoutePreference + if diff := checkOffLinkRouteEvent(e, nicID, test.dest, addr, prf, false); diff != "" { + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("timed out waiting for router discovery event") + } + } - // Rx an RA from lladdr2 with zero lifetime. It should not be - // remembered. 
- e.InjectInbound(header.IPv6ProtocolNumber, raBufSimple(llAddr2, 0)) - select { - case <-ndpDisp.offLinkRouteC: - t.Fatal("unexpectedly updated an off-link route with 0 lifetime") - default: - } + if err := s.SetForwardingDefaultAndAllNICs(ipv6.ProtocolNumber, forwarding); err != nil { + t.Fatalf("SetForwardingDefaultAndAllNICs(%d, %t): %s", ipv6.ProtocolNumber, forwarding, err) + } - // Rx an RA from lladdr2 with a huge lifetime and reserved preference value - // (which should be interpreted as the default (medium) preference value). - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPrf(llAddr2, 1000, header.ReservedRoutePreference)) - expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, true) - - // Rx an RA from another router (lladdr3) with non-zero lifetime and - // non-default preference value. - const l3LifetimeSeconds = 6 - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPrf(llAddr3, l3LifetimeSeconds, header.HighRoutePreference)) - expectOffLinkRouteEvent(llAddr3, header.HighRoutePreference, true) - - // Rx an RA from lladdr2 with lesser lifetime and default (medium) - // preference value. - const l2LifetimeSeconds = 2 - e.InjectInbound(header.IPv6ProtocolNumber, raBufSimple(llAddr2, l2LifetimeSeconds)) - select { - case <-ndpDisp.offLinkRouteC: - t.Fatal("should not receive a off-link route event when updating lifetimes for known routers") - default: - } + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } - // Rx an RA from lladdr2 with a different preference. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPrf(llAddr2, l2LifetimeSeconds, header.LowRoutePreference)) - expectOffLinkRouteEvent(llAddr2, header.LowRoutePreference, true) - - // Wait for lladdr2's router invalidation job to execute. The lifetime - // of the router should have been updated to the most recent (smaller) - // lifetime. 
- // - // Wait for the normal lifetime plus an extra bit for the - // router to get invalidated. If we don't get an invalidation - // event after this time, then something is wrong. - expectAsyncOffLinkRouteInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second) - - // Rx an RA from lladdr2 with huge lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufSimple(llAddr2, 1000)) - expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, true) - - // Rx an RA from lladdr2 with zero lifetime. It should be invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufSimple(llAddr2, 0)) - expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, false) - - // Wait for lladdr3's router invalidation job to execute. The lifetime - // of the router should have been updated to the most recent (smaller) - // lifetime. - // - // Wait for the normal lifetime plus an extra bit for the - // router to get invalidated. If we don't get an invalidation - // event after this time, then something is wrong. - expectAsyncOffLinkRouteInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second) - }) + // Rx an RA from lladdr2 with zero lifetime. It should not be + // remembered. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 0, header.MediumRoutePreference)) + select { + case <-ndpDisp.offLinkRouteC: + t.Fatal("unexpectedly updated an off-link route with 0 lifetime") + default: + } + + // Discover an off-link route through llAddr2. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 1000, header.ReservedRoutePreference)) + if test.discoverMoreSpecificRoutes { + // The reserved value is considered invalid with more-specific route + // discovery so we inject the same packet but with the default + // (medium) preference value. 
+ select { + case <-ndpDisp.offLinkRouteC: + t.Fatal("unexpectedly updated an off-link route with a reserved preference value") + default: + } + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 1000, header.MediumRoutePreference)) + } + expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, true) + + // Rx an RA from another router (lladdr3) with non-zero lifetime and + // non-default preference value. + const l3LifetimeSeconds = 6 + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr3, l3LifetimeSeconds, header.HighRoutePreference)) + expectOffLinkRouteEvent(llAddr3, header.HighRoutePreference, true) + + // Rx an RA from lladdr2 with lesser lifetime and default (medium) + // preference value. + const l2LifetimeSeconds = 2 + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, l2LifetimeSeconds, header.MediumRoutePreference)) + select { + case <-ndpDisp.offLinkRouteC: + t.Fatal("should not receive a off-link route event when updating lifetimes for known routers") + default: + } + + // Rx an RA from lladdr2 with a different preference. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, l2LifetimeSeconds, header.LowRoutePreference)) + expectOffLinkRouteEvent(llAddr2, header.LowRoutePreference, true) + + // Wait for lladdr2's router invalidation job to execute. The lifetime + // of the router should have been updated to the most recent (smaller) + // lifetime. + // + // Wait for the normal lifetime plus an extra bit for the + // router to get invalidated. If we don't get an invalidation + // event after this time, then something is wrong. + expectAsyncOffLinkRouteInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second) + + // Rx an RA from lladdr2 with huge lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 1000, header.MediumRoutePreference)) + expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, true) + + // Rx an RA from lladdr2 with zero lifetime. It should be invalidated. 
+ e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 0, header.MediumRoutePreference)) + expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, false) + + // Wait for lladdr3's router invalidation job to execute. The lifetime + // of the router should have been updated to the most recent (smaller) + // lifetime. + // + // Wait for the normal lifetime plus an extra bit for the + // router to get invalidated. If we don't get an invalidation + // event after this time, then something is wrong. + expectAsyncOffLinkRouteInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second) + }) + }) + } } // TestRouterDiscoveryMaxRouters tests that only @@ -1494,7 +1572,7 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { if i <= ipv6.MaxDiscoveredOffLinkRoutes { select { case e := <-ndpDisp.offLinkRouteC: - if diff := checkOffLinkRouteEvent(e, nicID, llAddr, header.MediumRoutePreference, true); diff != "" { + if diff := checkOffLinkRouteEvent(e, nicID, header.IPv6EmptySubnet, llAddr, header.MediumRoutePreference, true); diff != "" { t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) } default: @@ -4583,7 +4661,7 @@ func TestNoCleanupNDPStateWhenForwardingEnabled(t *testing.T) { ) select { case e := <-ndpDisp.offLinkRouteC: - if diff := checkOffLinkRouteEvent(e, nicID, llAddr3, header.MediumRoutePreference, true /* discovered */); diff != "" { + if diff := checkOffLinkRouteEvent(e, nicID, header.IPv6EmptySubnet, llAddr3, header.MediumRoutePreference, true /* discovered */); diff != "" { t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) } default: @@ -5278,8 +5356,9 @@ func TestRouterSolicitation(t *testing.T) { RandSource: &randSource, }) - if err := s.CreateNIC(nicID, &e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + opts := stack.NICOptions{Disabled: true} + if err := s.CreateNICWithOptions(nicID, &e, opts); err != nil { + t.Fatalf("CreateNICWithOptions(%d, _, %#v) = %s", nicID, opts, err) } if addr := 
test.nicAddr; addr != "" { @@ -5288,6 +5367,10 @@ func TestRouterSolicitation(t *testing.T) { } } + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("EnableNIC(%d): %s", nicID, err) + } + // Make sure each RS is sent at the right time. remaining := test.maxRtrSolicit if remaining != 0 { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 8f2658f64..55683b4fb 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1845,6 +1845,10 @@ type TCPStats struct { // FailedPortReservations is the number of times TCP failed to reserve // a port. FailedPortReservations *StatCounter + + // SegmentsAckedWithDSACK is the number of segments acknowledged with + // DSACK. + SegmentsAckedWithDSACK *StatCounter } // UDPStats collects UDP-specific stats. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index cb316d27a..f9a15efb2 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -213,6 +213,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. +// +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.state { case stateInitial: @@ -229,10 +230,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index b6687911a..b3d8951ff 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -132,7 +132,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt // headers included. 
Because they're write-only, We don't need to // register with the stack. if !associated { - e.ops.SetReceiveBufferSize(0, false) + e.ops.SetReceiveBufferSize(0, false /* notify */) e.waiterQueue = nil return e, nil } @@ -455,8 +455,21 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. -func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { - return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + addr := e.BindAddr + if e.connected { + addr = e.route.LocalAddress() + } + + return tcpip.FullAddress{ + NIC: e.RegisterNICID, + Addr: addr, + // Linux returns the protocol in the port field. + Port: uint16(e.TransProto), + }, nil } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d807b13b7..aa413ad05 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -330,7 +330,9 @@ func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, } ep := h.ep - if err := h.complete(); err != nil { + // N.B. the endpoint is generated above by startHandshake, and will be + // returned locked. This first call is forced. + if err := h.complete(); err != nil { // +checklocksforce ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.stats.FailedConnectionAttempts.Increment() l.cleanupFailedHandshake(h) @@ -364,6 +366,7 @@ func (l *listenContext) closeAllPendingEndpoints() { } // Precondition: h.ep.mu must be held. 
+// +checklocks:h.ep.mu func (l *listenContext) cleanupFailedHandshake(h *handshake) { e := h.ep e.mu.Unlock() @@ -504,7 +507,9 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header } go func() { - if err := h.complete(); err != nil { + // Note that startHandshake returns a locked endpoint. The + // force call here just makes it so. + if err := h.complete(); err != nil { // +checklocksforce e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() ctx.cleanupFailedHandshake(h) diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index e39d1623d..93ed161f9 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -511,6 +511,7 @@ func (h *handshake) start() { } // complete completes the TCP 3-way handshake initiated by h.start(). +// +checklocks:h.ep.mu func (h *handshake) complete() tcpip.Error { // Set up the wakers. var s sleep.Sleeper @@ -1283,42 +1284,45 @@ func (e *endpoint) disableKeepaliveTimer() { e.keepalive.Unlock() } -// protocolMainLoop is the main loop of the TCP protocol. It runs in its own -// goroutine and is responsible for sending segments and handling received -// segments. -func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { - e.mu.Lock() - var closeTimer tcpip.Timer - var closeWaker sleep.Waker - - epilogue := func() { - // e.mu is expected to be hold upon entering this section. - if e.snd != nil { - e.snd.resendTimer.cleanup() - e.snd.probeTimer.cleanup() - e.snd.reorderTimer.cleanup() - } +// protocolMainLoopDone is called at the end of protocolMainLoop. 
+// +checklocksrelease:e.mu +func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer, closeWaker *sleep.Waker) { + if e.snd != nil { + e.snd.resendTimer.cleanup() + e.snd.probeTimer.cleanup() + e.snd.reorderTimer.cleanup() + } - if closeTimer != nil { - closeTimer.Stop() - } + if closeTimer != nil { + closeTimer.Stop() + } - e.completeWorkerLocked() + e.completeWorkerLocked() - if e.drainDone != nil { - close(e.drainDone) - } + if e.drainDone != nil { + close(e.drainDone) + } - e.mu.Unlock() + e.mu.Unlock() - e.drainClosingSegmentQueue() + e.drainClosingSegmentQueue() - // When the protocol loop exits we should wake up our waiters. - e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) - } + // When the protocol loop exits we should wake up our waiters. + e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) +} +// protocolMainLoop is the main loop of the TCP protocol. It runs in its own +// goroutine and is responsible for sending segments and handling received +// segments. +func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { + var ( + closeTimer tcpip.Timer + closeWaker sleep.Waker + ) + + e.mu.Lock() if handshake { - if err := e.h.complete(); err != nil { + if err := e.h.complete(); err != nil { // +checklocksforce e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() @@ -1327,8 +1331,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.hardError = err e.workerCleanup = true - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer, &closeWaker) return err } } @@ -1472,7 +1475,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Only block the worker if the endpoint // is not in closed state or error state. 
close(e.drainDone) - e.mu.Unlock() + e.mu.Unlock() // +checklocksforce <-e.undrain e.mu.Lock() } @@ -1533,8 +1536,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ if err != nil { e.resetConnectionLocked(err) } - // Lock released below. - epilogue() } loop: @@ -1558,6 +1559,7 @@ loop: // just want to terminate the loop and cleanup the // endpoint. cleanupOnError(nil) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil case StateTimeWait: fallthrough @@ -1566,6 +1568,7 @@ loop: default: if err := funcs[v].f(); err != nil { cleanupOnError(err) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } } @@ -1589,13 +1592,13 @@ loop: // Handle any StateError transition from StateTimeWait. if e.EndpointState() == StateError { cleanupOnError(nil) + e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } e.transitionToStateCloseLocked() - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer, &closeWaker) // A new SYN was received during TIME_WAIT and we need to abort // the timewait and redirect the segment to the listener queue @@ -1665,6 +1668,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func() // should be executed after releasing the endpoint registrations. This is // done in cases where a new SYN is received during TIME_WAIT that carries // a sequence number larger than one see on the connection. +// +checklocks:e.mu func (e *endpoint) doTimeWait() (twReuse func()) { // Trigger a 2 * MSL time wait state. During this period // we will drop all incoming segments. 
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index dff7cb89c..7d110516b 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -127,7 +127,7 @@ func (p *processor) start(wg *sync.WaitGroup) { case !ep.segmentQueue.empty(): p.epQ.enqueue(ep) } - ep.mu.Unlock() + ep.mu.Unlock() // +checklocksforce } else { ep.newSegmentWaker.Assert() } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 4acddc959..044123185 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -478,7 +478,7 @@ type endpoint struct { // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags - // tcpRecovery is the loss deteoction algorithm used by TCP. + // tcpRecovery is the loss recovery algorithm used by TCP. tcpRecovery tcpip.TCPRecovery // sack holds TCP SACK related information for this endpoint. @@ -664,6 +664,7 @@ func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The assumption behind spinning here being that background packet processing // should not be holding the lock for long and spinning reduces latency as we // avoid an expensive sleep/wakeup of of the syscall goroutine). +// +checklocksacquire:e.mu func (e *endpoint) LockUser() { for { // Try first if the sock is locked then check if it's owned @@ -683,7 +684,7 @@ func (e *endpoint) LockUser() { continue } atomic.StoreUint32(&e.ownedByUser, 1) - return + return // +checklocksforce } } @@ -700,7 +701,7 @@ func (e *endpoint) LockUser() { // protocol goroutine altogether. 
// // Precondition: e.LockUser() must have been called before calling e.UnlockUser() -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) UnlockUser() { // Lock segment queue before checking so that we avoid a race where // segments can be queued between the time we check if queue is empty @@ -736,12 +737,13 @@ func (e *endpoint) UnlockUser() { } // StopWork halts packet processing. Only to be used in tests. +// +checklocksacquire:e.mu func (e *endpoint) StopWork() { e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) ResumeWork() { e.mu.Unlock() } @@ -752,7 +754,7 @@ func (e *endpoint) ResumeWork() { // // Precondition: e.mu must be held to call this method. func (e *endpoint) setEndpointState(state EndpointState) { - oldstate := EndpointState(atomic.LoadUint32(&e.state)) + oldstate := EndpointState(atomic.SwapUint32(&e.state, uint32(state))) switch state { case StateEstablished: e.stack.Stats().TCP.CurrentEstablished.Increment() @@ -769,7 +771,6 @@ func (e *endpoint) setEndpointState(state EndpointState) { e.stack.Stats().TCP.CurrentEstablished.Decrement() } } - atomic.StoreUint32(&e.state, uint32(state)) } // EndpointState returns the current state of the endpoint. @@ -868,8 +869,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue e.maxSynRetries = uint8(synRetries) } - s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) - if p := s.GetTCPProbe(); p != nil { e.probe = p } @@ -1480,86 +1479,101 @@ func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) { return avail, nil } -// Write writes data to the endpoint's peer. -func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - // Linux completely ignores any address passed to sendto(2) for TCP sockets - // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More - // and opts.EndOfRecord are also ignored. 
+// readFromPayloader reads a slice from the Payloader. +// +checklocks:e.mu +// +checklocks:e.sndQueueInfo.sndQueueMu +func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) { + // We can release locks while copying data. + // + // This is not possible if atomic is set, because we can't allow the + // available buffer space to be consumed by some other caller while we + // are copying data in. + if !opts.Atomic { + e.sndQueueInfo.sndQueueMu.Unlock() + defer e.sndQueueInfo.sndQueueMu.Lock() - e.LockUser() - defer e.UnlockUser() + e.UnlockUser() + defer e.LockUser() + } - nextSeg, n, err := func() (*segment, int, tcpip.Error) { - e.sndQueueInfo.sndQueueMu.Lock() - defer e.sndQueueInfo.sndQueueMu.Unlock() + // Fetch data. + if l := p.Len(); l < avail { + avail = l + } + if avail == 0 { + return nil, nil + } + v := make([]byte, avail) + n, err := p.Read(v) + if err != nil && err != io.EOF { + return nil, &tcpip.ErrBadBuffer{} + } + return v[:n], nil +} + +// queueSegment reads data from the payloader and returns a segment to be sent. +// +checklocks:e.mu +func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { + e.sndQueueInfo.sndQueueMu.Lock() + defer e.sndQueueInfo.sndQueueMu.Unlock() + + avail, err := e.isEndpointWritableLocked() + if err != nil { + e.stats.WriteErrors.WriteClosed.Increment() + return nil, 0, err + } + + v, err := e.readFromPayloader(p, opts, avail) + if err != nil { + return nil, 0, err + } + + // Do not queue zero length segments. + if len(v) == 0 { + return nil, 0, nil + } + if !opts.Atomic { + // Since we released locks in between it's possible that the + // endpoint transitioned to a CLOSED/ERROR states so make + // sure endpoint is still writable before trying to write. 
avail, err := e.isEndpointWritableLocked() if err != nil { e.stats.WriteErrors.WriteClosed.Increment() return nil, 0, err } - v, err := func() ([]byte, tcpip.Error) { - // We can release locks while copying data. - // - // This is not possible if atomic is set, because we can't allow the - // available buffer space to be consumed by some other caller while we - // are copying data in. - if !opts.Atomic { - e.sndQueueInfo.sndQueueMu.Unlock() - defer e.sndQueueInfo.sndQueueMu.Lock() - - e.UnlockUser() - defer e.LockUser() - } - - // Fetch data. - if l := p.Len(); l < avail { - avail = l - } - if avail == 0 { - return nil, nil - } - v := make([]byte, avail) - n, err := p.Read(v) - if err != nil && err != io.EOF { - return nil, &tcpip.ErrBadBuffer{} - } - return v[:n], nil - }() - if len(v) == 0 || err != nil { - return nil, 0, err + // Discard any excess data copied in due to avail being reduced due + // to a simultaneous write call to the socket. + if avail < len(v) { + v = v[:avail] } + } - if !opts.Atomic { - // Since we released locks in between it's possible that the - // endpoint transitioned to a CLOSED/ERROR states so make - // sure endpoint is still writable before trying to write. - avail, err := e.isEndpointWritableLocked() - if err != nil { - e.stats.WriteErrors.WriteClosed.Increment() - return nil, 0, err - } + // Add data to the send queue. + s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) + e.sndQueueInfo.SndBufUsed += len(v) + e.snd.writeList.PushBack(s) - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] - } - } + return s, len(v), nil +} + +// Write writes data to the endpoint's peer. +func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + // Linux completely ignores any address passed to sendto(2) for TCP sockets + // (without the MSG_FASTOPEN flag). 
Corking is unimplemented, so opts.More + // and opts.EndOfRecord are also ignored. - // Add data to the send queue. - s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) - e.sndQueueInfo.SndBufUsed += len(v) - e.snd.writeList.PushBack(s) + e.LockUser() + defer e.UnlockUser() - return s, len(v), nil - }() // Return if either we didn't queue anything or if an error occurred while // attempting to queue data. + nextSeg, n, err := e.queueSegment(p, opts) if n == 0 || err != nil { return 0, err } + e.sendData(nextSeg) return int64(n), nil } @@ -2504,6 +2518,7 @@ func (e *endpoint) listen(backlog int) tcpip.Error { // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. +// +checklocksrelease:e.mu func (e *endpoint) startAcceptedLoop() { e.workerRunning = true e.mu.Unlock() @@ -2905,6 +2920,7 @@ func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { } if bool(v) && synOpts.SACKPermitted { e.SACKPermitted = true + e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) } } diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 65c86823a..2e709ed78 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -164,8 +164,9 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, return nil, err } - // Start the protocol goroutine. - ep.startAcceptedLoop() + // Start the protocol goroutine. Note that the endpoint is returned + // from performHandshake locked. 
+ ep.startAcceptedLoop() // +checklocksforce return ep, nil } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 2fc282e73..18b834243 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -478,8 +478,7 @@ func NewProtocol(s *stack.Stack) stack.TransportProtocol { minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, - // TODO(gvisor.dev/issue/5243): Set recovery to tcpip.TCPRACKLossDetection. - recovery: 0, + recovery: tcpip.TCPRACKLossDetection, } p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0)) return &p diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 72d58dcff..92a66f17e 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1154,6 +1154,13 @@ func (s *sender) walkSACK(rcvdSeg *segment) { idx := 0 n := len(rcvdSeg.parsedOptions.SACKBlocks) if checkDSACK(rcvdSeg) { + dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0] + numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize) + // numDSACK can be zero when DSACK is sent for subsegments. + if numDSACK < 1 { + numDSACK = 1 + } + s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK) s.rc.setDSACKSeen(true) idx = 1 n-- diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go index ced3a9c58..84fb1c416 100644 --- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go +++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go @@ -16,6 +16,7 @@ // iterations taking long enough that the retransmit timer can kick in causing // the congestion window measurements to fail due to extra packets etc. 
// +//go:build !race // +build !race package tcp_test diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go index d6cf786a1..89e9fb886 100644 --- a/pkg/tcpip/transport/tcp/tcp_rack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -36,9 +36,9 @@ const ( latency = 5 * time.Millisecond ) -func setStackRACKPermitted(t *testing.T, c *context.Context) { +func setStackTCPRecovery(t *testing.T, c *context.Context, recovery int) { t.Helper() - opt := tcpip.TCPRACKLossDetection + opt := tcpip.TCPRecovery(recovery) if err := c.Stack().SetTransportProtocolOption(header.TCPProtocolNumber, &opt); err != nil { t.Fatalf("c.s.SetTransportProtocolOption(%d, &%v(%v)): %s", header.TCPProtocolNumber, opt, opt, err) } @@ -70,7 +70,6 @@ func TestRACKUpdate(t *testing.T) { close(probeDone) }) setStackSACKPermitted(t, c, true) - setStackRACKPermitted(t, c) createConnectedWithSACKAndTS(c) data := make([]byte, maxPayload) @@ -129,7 +128,6 @@ func TestRACKDetectReorder(t *testing.T) { close(probeDone) }) setStackSACKPermitted(t, c, true) - setStackRACKPermitted(t, c) createConnectedWithSACKAndTS(c) data := make([]byte, ackNumToVerify*maxPayload) for i := range data { @@ -162,8 +160,8 @@ func TestRACKDetectReorder(t *testing.T) { func sendAndReceiveWithSACK(t *testing.T, c *context.Context, numPackets int, enableRACK bool) []byte { setStackSACKPermitted(t, c, true) - if enableRACK { - setStackRACKPermitted(t, c) + if !enableRACK { + setStackTCPRecovery(t, c, 0) } createConnectedWithSACKAndTS(c) @@ -542,6 +540,28 @@ func TestRACKDetectDSACK(t *testing.T) { case invalidDSACKDetected: t.Fatalf("RACK DSACK detected when there is no duplicate SACK") } + + metricPollFn := func() error { + tcpStats := c.Stack().Stats().TCP + stats := []struct { + stat *tcpip.StatCounter + name string + want uint64 + }{ + // Check DSACK was received for one segment. 
+ {tcpStats.SegmentsAckedWithDSACK, "stats.TCP.SegmentsAckedWithDSACK", 1}, + } + for _, s := range stats { + if got, want := s.stat.Value(), s.want; got != want { + return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want) + } + } + return nil + } + + if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil { + t.Error(err) + } } // TestRACKDetectDSACKWithOutOfOrder tests that RACK detects DSACK with out of @@ -682,6 +702,28 @@ func TestRACKDetectDSACKSingleDup(t *testing.T) { case invalidDSACKDetected: t.Fatalf("RACK DSACK detected when there is no duplicate SACK") } + + metricPollFn := func() error { + tcpStats := c.Stack().Stats().TCP + stats := []struct { + stat *tcpip.StatCounter + name string + want uint64 + }{ + // Check DSACK was received for a subsegment. + {tcpStats.SegmentsAckedWithDSACK, "stats.TCP.SegmentsAckedWithDSACK", 1}, + } + for _, s := range stats { + if got, want := s.stat.Value(), s.want; got != want { + return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want) + } + } + return nil + } + + if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil { + t.Error(err) + } } // TestRACKDetectDSACKDupWithCumulativeACK tests DSACK for two non-contiguous @@ -998,7 +1040,6 @@ func TestRACKWithWindowFull(t *testing.T) { defer c.Cleanup() setStackSACKPermitted(t, c, true) - setStackRACKPermitted(t, c) createConnectedWithSACKAndTS(c) seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 20c9761f2..83e0653b9 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -61,6 +61,7 @@ func TestSackPermittedConnect(t *testing.T) { defer c.Cleanup() setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) rep := createConnectedWithSACKPermittedOption(c) data := []byte{1, 2, 3} @@ -105,6 +106,7 @@ func TestSackDisabledConnect(t *testing.T) { defer 
c.Cleanup() setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) rep := c.CreateConnectedWithOptions(header.TCPSynOptions{}) @@ -166,6 +168,7 @@ func TestSackPermittedAccept(t *testing.T) { } } setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, SACKPermitted: tc.sackPermitted}) // Now verify no SACK blocks are @@ -239,6 +242,7 @@ func TestSackDisabledAccept(t *testing.T) { } setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS}) @@ -386,6 +390,7 @@ func TestSACKRecovery(t *testing.T) { log.Printf("state: %+v\n", s) }) setStackSACKPermitted(t, c, true) + setStackTCPRecovery(t, c, 0) createConnectedWithSACKAndTS(c) const iterations = 3 diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 71c4aa85d..031f01357 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2147,7 +2147,7 @@ func TestSmallSegReceiveWindowAdvertisement(t *testing.T) { // Bump up the receive buffer size such that, when the receive window grows, // the scaled window exceeds maxUint16. - c.EP.SocketOptions().SetReceiveBufferSize(int64(opt.Max), true) + c.EP.SocketOptions().SetReceiveBufferSize(int64(opt.Max)*2, true /* notify */) // Keep the payload size < segment overhead and such that it is a multiple // of the window scaled value. This enables the test to perform equality @@ -2267,7 +2267,7 @@ func TestNoWindowShrinking(t *testing.T) { initialWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale initialLastAcceptableSeq := iss.Add(seqnum.Size(initialWnd)) // Now shrink the receive buffer to half its original size. 
- c.EP.SocketOptions().SetReceiveBufferSize(int64(rcvBufSize/2), true) + c.EP.SocketOptions().SetReceiveBufferSize(int64(rcvBufSize), true /* notify */) data := generateRandomPayload(t, rcvBufSize) // Send a payload of half the size of rcvBufSize. @@ -2523,7 +2523,7 @@ func TestScaledWindowAccept(t *testing.T) { defer ep.Close() // Set the window size greater than the maximum non-scaled window. - ep.SocketOptions().SetReceiveBufferSize(65535*3, true) + ep.SocketOptions().SetReceiveBufferSize(65535*6, true /* notify */) if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) @@ -2595,7 +2595,7 @@ func TestNonScaledWindowAccept(t *testing.T) { defer ep.Close() // Set the window size greater than the maximum non-scaled window. - ep.SocketOptions().SetReceiveBufferSize(65535*3, true) + ep.SocketOptions().SetReceiveBufferSize(65535*6, true /* notify */) if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) @@ -3188,7 +3188,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { // Set the buffer size to a deterministic size so that we can check the // window scaling option. const rcvBufferSize = 0x20000 - ep.SocketOptions().SetReceiveBufferSize(rcvBufferSize, true) + ep.SocketOptions().SetReceiveBufferSize(rcvBufferSize*2, true /* notify */) if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) @@ -3327,7 +3327,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { // window scaling option. const rcvBufferSize = 0x20000 const wndScale = 3 - c.EP.SocketOptions().SetReceiveBufferSize(rcvBufferSize, true) + c.EP.SocketOptions().SetReceiveBufferSize(rcvBufferSize*2, true /* notify */) // Start connection attempt. 
we, ch := waiter.NewChannelEntry(nil) @@ -3624,6 +3624,38 @@ func TestMaxRTO(t *testing.T) { } } +// TestZeroSizedWriteRetransmit tests that a zero sized write should not +// result in a panic on an RTO as no segment should have been queued for +// a zero sized write. +func TestZeroSizedWriteRetransmit(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateConnected(context.TestInitialSequenceNumber, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + var r bytes.Reader + _, err := c.EP.Write(&r, tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %s", err) + } + // Now do a non-zero sized write to trigger actual sending of data. + r.Reset(make([]byte, 1)) + _, err = c.EP.Write(&r, tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %s", err) + } + // Do not ACK the packet and expect an original transmit and a + // retransmit. This should not cause a panic. + for i := 0; i < 2; i++ { + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^header.TCPFlagPsh), + ), + ) + } +} + // TestRetransmitIPv4IDUniqueness tests that the IPv4 Identification field is // unique on retransmits. func TestRetransmitIPv4IDUniqueness(t *testing.T) { @@ -4637,52 +4669,6 @@ func TestDefaultBufferSizes(t *testing.T) { checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*3) } -func TestMinMaxBufferSizes(t *testing.T) { - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, - TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, - }) - - // Check the default values. 
- ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) - if err != nil { - t.Fatalf("NewEndpoint failed; %s", err) - } - defer ep.Close() - - // Change the min/max values for send/receive - { - opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20} - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) - } - } - - { - opt := tcpip.TCPSendBufferSizeRangeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30} - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) - } - } - - // Set values below the min/2. - ep.SocketOptions().SetReceiveBufferSize(99, true) - checkRecvBufferSize(t, ep, 200) - - ep.SocketOptions().SetSendBufferSize(149, true) - - checkSendBufferSize(t, ep, 300) - - // Set values above the max. - ep.SocketOptions().SetReceiveBufferSize(1+tcp.DefaultReceiveBufferSize*20, true) - // Values above max are capped at max and then doubled. - checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20*2) - - ep.SocketOptions().SetSendBufferSize(1+tcp.DefaultSendBufferSize*30, true) - // Values above max are capped at max and then doubled. 
- checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30*2) -} - func TestBindToDeviceOption(t *testing.T) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, @@ -7720,7 +7706,7 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) { // Increasing the buffer from should generate an ACK, // since window grew from small value to larger equal MSS - c.EP.SocketOptions().SetReceiveBufferSize(rcvBuf*2, true) + c.EP.SocketOptions().SetReceiveBufferSize(rcvBuf*4, true /* notify */) checker.IPv4(t, c.GetPacket(), checker.PayloadLen(header.TCPMinimumSize), checker.TCP( diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 53efecc5a..96e4849d2 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -757,7 +757,7 @@ func (c *Context) Create(epRcvBuf int) { } if epRcvBuf != -1 { - c.EP.SocketOptions().SetReceiveBufferSize(int64(epRcvBuf), true /* notify */) + c.EP.SocketOptions().SetReceiveBufferSize(int64(epRcvBuf)*2, true /* notify */) } } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index def9d7186..82a3f2287 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -364,6 +364,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. +// +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.EndpointState() { case StateInitial: @@ -380,10 +381,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. 
Try again. @@ -449,37 +448,20 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp return n, err } -func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - if err := e.LastError(); err != nil { - return 0, err - } - - // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) - if opts.More { - return 0, &tcpip.ErrInvalidOptionValue{} - } - - to := opts.To - +func (e *endpoint) buildUDPPacketInfo(p tcpip.Payloader, opts tcpip.WriteOptions) (udpPacketInfo, tcpip.Error) { e.mu.RLock() - lockReleased := false - defer func() { - if lockReleased { - return - } - e.mu.RUnlock() - }() + defer e.mu.RUnlock() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { - return 0, &tcpip.ErrClosedForSend{} + return udpPacketInfo{}, &tcpip.ErrClosedForSend{} } // Prepare for write. for { - retry, err := e.prepareForWrite(to) + retry, err := e.prepareForWrite(opts.To) if err != nil { - return 0, err + return udpPacketInfo{}, err } if !retry { @@ -489,34 +471,34 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp route := e.route dstPort := e.dstPort - if to != nil { + if opts.To != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. - nicID := to.NIC + nicID := opts.To.NIC if nicID == 0 { nicID = tcpip.NICID(e.ops.GetBindToDevice()) } if e.BindNICID != 0 { if nicID != 0 && nicID != e.BindNICID { - return 0, &tcpip.ErrNoRoute{} + return udpPacketInfo{}, &tcpip.ErrNoRoute{} } nicID = e.BindNICID } - if to.Port == 0 { + if opts.To.Port == 0 { // Port 0 is an invalid port to send to. 
- return 0, &tcpip.ErrInvalidEndpointState{} + return udpPacketInfo{}, &tcpip.ErrInvalidEndpointState{} } - dst, netProto, err := e.checkV4MappedLocked(*to) + dst, netProto, err := e.checkV4MappedLocked(*opts.To) if err != nil { - return 0, err + return udpPacketInfo{}, err } r, _, err := e.connectRoute(nicID, dst, netProto) if err != nil { - return 0, err + return udpPacketInfo{}, err } defer r.Release() @@ -525,12 +507,12 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp } if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { - return 0, &tcpip.ErrBroadcastDisabled{} + return udpPacketInfo{}, &tcpip.ErrBroadcastDisabled{} } v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { - return 0, &tcpip.ErrBadBuffer{} + return udpPacketInfo{}, &tcpip.ErrBadBuffer{} } if len(v) > header.UDPMaximumPacketSize { // Payload can't possibly fit in a packet. @@ -548,24 +530,39 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp v, ) } - return 0, &tcpip.ErrMessageTooLong{} + return udpPacketInfo{}, &tcpip.ErrMessageTooLong{} } ttl := e.ttl useDefaultTTL := ttl == 0 - if header.IsV4MulticastAddress(route.RemoteAddress()) || header.IsV6MulticastAddress(route.RemoteAddress()) { ttl = e.multicastTTL // Multicast allows a 0 TTL. useDefaultTTL = false } - localPort := e.ID.LocalPort - sendTOS := e.sendTOS - owner := e.owner - noChecksum := e.SocketOptions().GetNoChecksum() - lockReleased = true - e.mu.RUnlock() + return udpPacketInfo{ + route: route, + data: buffer.View(v), + localPort: e.ID.LocalPort, + remotePort: dstPort, + ttl: ttl, + useDefaultTTL: useDefaultTTL, + tos: e.sendTOS, + owner: e.owner, + noChecksum: e.SocketOptions().GetNoChecksum(), + }, nil +} + +func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + if err := e.LastError(); err != nil { + return 0, err + } + + // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) 
+ if opts.More { + return 0, &tcpip.ErrInvalidOptionValue{} + } // Do not hold lock when sending as loopback is synchronous and if the UDP // datagram ends up generating an ICMP response then it can result in a @@ -577,10 +574,15 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp // // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read // locking is prohibited. - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil { + u, err := e.buildUDPPacketInfo(p, opts) + if err != nil { return 0, err } - return int64(len(v)), nil + n, err := u.send() + if err != nil { + return 0, err + } + return int64(n), nil } // OnReuseAddressSet implements tcpip.SocketOptionsHandler. @@ -817,14 +819,30 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { return nil } -// sendUDP sends a UDP segment via the provided network endpoint and under the -// provided identity. -func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner, noChecksum bool) tcpip.Error { +// udpPacketInfo contains all information required to send a UDP packet. +// +// This should be used as a value-only type, which exists in order to simplify +// return value syntax. It should not be exported or extended. +type udpPacketInfo struct { + route *stack.Route + data buffer.View + localPort uint16 + remotePort uint16 + ttl uint8 + useDefaultTTL bool + tos uint8 + owner tcpip.PacketOwner + noChecksum bool +} + +// send sends the given packet. 
+func (u *udpPacketInfo) send() (int, tcpip.Error) { + vv := u.data.ToVectorisedView() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: header.UDPMinimumSize + int(r.MaxHeaderLength()), - Data: data, + ReserveHeaderBytes: header.UDPMinimumSize + int(u.route.MaxHeaderLength()), + Data: vv, }) - pkt.Owner = owner + pkt.Owner = u.owner // Initialize the UDP header. udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) @@ -832,8 +850,8 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u length := uint16(pkt.Size()) udp.Encode(&header.UDPFields{ - SrcPort: localPort, - DstPort: remotePort, + SrcPort: u.localPort, + DstPort: u.remotePort, Length: length, }) @@ -841,30 +859,30 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u // On IPv4, UDP checksum is optional, and a zero value indicates the // transmitter skipped the checksum generation (RFC768). // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). - if r.RequiresTXTransportChecksum() && - (!noChecksum || r.NetProto() == header.IPv6ProtocolNumber) { - xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) - for _, v := range data.Views() { + if u.route.RequiresTXTransportChecksum() && + (!u.noChecksum || u.route.NetProto() == header.IPv6ProtocolNumber) { + xsum := u.route.PseudoHeaderChecksum(ProtocolNumber, length) + for _, v := range vv.Views() { xsum = header.Checksum(v, xsum) } udp.SetChecksum(^udp.CalculateChecksum(xsum)) } - if useDefaultTTL { - ttl = r.DefaultTTL() + if u.useDefaultTTL { + u.ttl = u.route.DefaultTTL() } - if err := r.WritePacket(stack.NetworkHeaderParams{ + if err := u.route.WritePacket(stack.NetworkHeaderParams{ Protocol: ProtocolNumber, - TTL: ttl, - TOS: tos, + TTL: u.ttl, + TOS: u.tos, }, pkt); err != nil { - r.Stats().UDP.PacketSendErrors.Increment() - return err + u.route.Stats().UDP.PacketSendErrors.Increment() + return 0, err } // Track count of packets sent. 
- r.Stats().UDP.PacketsSent.Increment() - return nil + u.route.Stats().UDP.PacketsSent.Increment() + return len(u.data), nil } // checkV4MappedLocked determines the effective network protocol and converts diff --git a/pkg/test/testutil/testutil_runfiles.go b/pkg/test/testutil/testutil_runfiles.go index ece9ea9a1..1dbd48a47 100644 --- a/pkg/test/testutil/testutil_runfiles.go +++ b/pkg/test/testutil/testutil_runfiles.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package testutil import ( diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD index 229a8341b..9c37a9626 100644 --- a/pkg/usermem/BUILD +++ b/pkg/usermem/BUILD @@ -14,10 +14,10 @@ go_library( deps = [ "//pkg/atomicbitops", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/gohacks", "//pkg/hostarch", "//pkg/safemem", - "//pkg/syserror", ], ) @@ -33,6 +33,5 @@ go_test( "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/safemem", - "//pkg/syserror", ], ) diff --git a/pkg/usermem/bytes_io.go b/pkg/usermem/bytes_io.go index 3da3c0294..777ac59a6 100644 --- a/pkg/usermem/bytes_io.go +++ b/pkg/usermem/bytes_io.go @@ -16,9 +16,9 @@ package usermem import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/syserror" ) const maxInt = int(^uint(0) >> 1) @@ -51,7 +51,7 @@ func (b *BytesIO) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, op // ZeroOut implements IO.ZeroOut. 
func (b *BytesIO) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts IOOpts) (int64, error) { if toZero > int64(maxInt) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } rngN, rngErr := b.rangeCheck(addr, int(toZero)) if rngN == 0 { @@ -89,15 +89,15 @@ func (b *BytesIO) rangeCheck(addr hostarch.Addr, length int) (int, error) { return 0, nil } if length < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } max := hostarch.Addr(len(b.Bytes)) if addr >= max { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } end, ok := addr.AddLength(uint64(length)) if !ok || end > max { - return int(max - addr), syserror.EFAULT + return int(max - addr), linuxerr.EFAULT } return length, nil } diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index 0d6d25e50..cde1038ed 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -22,11 +22,10 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/safemem" ) // IO provides access to the contents of a virtual memory space. @@ -163,7 +162,7 @@ func (rw *IOReadWriter) Read(dst []byte) (int, error) { // Disallow wraparound. rw.Addr = ^hostarch.Addr(0) if err != nil { - err = syserror.EFAULT + err = linuxerr.EFAULT } } return n, err @@ -179,7 +178,7 @@ func (rw *IOReadWriter) Write(src []byte) (int, error) { // Disallow wraparound. 
rw.Addr = ^hostarch.Addr(0) if err != nil { - err = syserror.EFAULT + err = linuxerr.EFAULT } } return n, err @@ -214,7 +213,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr hostarch.Addr, maxlen int, o } end, ok := addr.AddLength(uint64(readlen)) if !ok { - return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT + return gohacks.StringFromImmutableBytes(buf[:done]), linuxerr.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial @@ -244,7 +243,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr hostarch.Addr, maxlen int, o } addr = end } - return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG + return gohacks.StringFromImmutableBytes(buf), linuxerr.ENAMETOOLONG } // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The @@ -382,7 +381,7 @@ func CopyInt32StringsInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSe // Parse a single value. 
val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) if err != nil { - return int64(i), syserror.EINVAL + return int64(i), linuxerr.EINVAL } dsts[j] = int32(val) @@ -398,7 +397,7 @@ func CopyInt32StringsInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSe return int64(i), cperr } if j == 0 { - return int64(i), syserror.EINVAL + return int64(i), linuxerr.EINVAL } return int64(i), nil } diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go index 6ef2b571f..a5e2fe69e 100644 --- a/pkg/usermem/usermem_test.go +++ b/pkg/usermem/usermem_test.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/syserror" ) // newContext returns a context.Context that we can use in these tests (we @@ -52,7 +51,7 @@ func TestBytesIOCopyOutSuccess(t *testing.T) { func TestBytesIOCopyOutFailure(t *testing.T) { b := newBytesIOString("ABC") n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := 2, linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { @@ -76,7 +75,7 @@ func TestBytesIOCopyInFailure(t *testing.T) { b := newBytesIOString("Afo") var dst [3]byte n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := 2, linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { @@ -98,7 +97,7 @@ func TestBytesIOZeroOutSuccess(t *testing.T) { func TestBytesIOZeroOutFailure(t *testing.T) { b := newBytesIOString("ABC") n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) - if wantN, wantErr := int64(2), 
syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := int64(2), linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { @@ -126,7 +125,7 @@ func TestBytesIOCopyOutFromFailure(t *testing.T) { {Start: 1, End: 4}, {Start: 4, End: 7}, }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := int64(4), linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { @@ -156,7 +155,7 @@ func TestBytesIOCopyInToFailure(t *testing.T) { {Start: 1, End: 4}, {Start: 4, End: 7}, }), safemem.FromIOWriter{&dst}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := int64(4), linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { @@ -207,14 +206,14 @@ func TestCopyStringInVeryLong(t *testing.T) { func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { want := strings.Repeat("A", copyStringIncrement-1) got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) - if wantErr := syserror.EFAULT; got != want || err != wantErr { + if wantErr := linuxerr.EFAULT; got != want || err != wantErr { t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) } } func TestCopyStringInTruncatedByMaxlen(t *testing.T) { got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) - if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { + if want, wantErr 
:= strings.Repeat("A", 5), linuxerr.ENAMETOOLONG; got != want || err != wantErr { t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) } } @@ -274,7 +273,7 @@ func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { initial := []int32{1, 2} dsts := append([]int32(nil), initial...) if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); !linuxerr.Equals(linuxerr.EINVAL, err) { - t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, linuxerr.EINVAL) } if !reflect.DeepEqual(dsts, initial) { t.Errorf("dsts: got %v, wanted %v", dsts, initial) |