From 14087485bdf1f697ab7b5ece168408de2aab4624 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Thu, 18 Jun 2020 23:20:23 -0400 Subject: updated the functions to distinguish IA/DA for Arm64 We need to correctly distinguish instruction_abort/data_abort for mem_abort@Arm64. So, EC/WNR/FSC in esr_el1 should be checked. Signed-off-by: Bin Lu --- pkg/sentry/platform/kvm/kvm_const_arm64.go | 11 +++++ pkg/sentry/platform/kvm/machine_arm64.go | 72 +++++++++++++----------------- 2 files changed, 41 insertions(+), 42 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go index 6f0539c29..fdc599477 100644 --- a/pkg/sentry/platform/kvm/kvm_const_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -116,6 +116,17 @@ const ( // Arm64: Exception Syndrome Register EL1. const ( + _ESR_ELx_EC_SHIFT = 26 + _ESR_ELx_EC_MASK = 0x3F << _ESR_ELx_EC_SHIFT + + _ESR_ELx_EC_IMP_DEF = 0x1f + _ESR_ELx_EC_IABT_LOW = 0x20 + _ESR_ELx_EC_IABT_CUR = 0x21 + _ESR_ELx_EC_PC_ALIGN = 0x22 + + _ESR_ELx_CM = 1 << 8 + _ESR_ELx_WNR = 1 << 6 + _ESR_ELx_FSC = 0x3F _ESR_SEGV_MAPERR_L0 = 0x4 diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index f3bf973de..9db171af9 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -125,71 +125,59 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.Acc return usermem.NoAccess, platform.ErrContextSignal } +// isInstructionAbort returns true if it is an instruction abort. +// +//go:nosplit +func isInstructionAbort(code uint64) bool { + value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT + return value == _ESR_ELx_EC_IABT_LOW +} + +// isWriteFault returns whether it is a write fault. +// +//go:nosplit +func isWriteFault(code uint64) bool { + if isInstructionAbort(code) { + return false + } + + return (code & _ESR_ELx_WNR) != 0 +} + // fault generates an appropriate fault return. // //go:nosplit func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + bluepill(c) // Probably no-op, but may not be. faultAddr := c.GetFaultAddr() code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return usermem.NoAccess, platform.ErrContextInterrupt + } + // Reset the pointed SignalInfo. *info = arch.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) - read := true - write := false - execute := true - ret := code & _ESR_ELx_FSC switch ret { case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3: info.Code = 1 //SEGV_MAPERR - read = false - write = true - execute = false case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3: info.Code = 2 // SEGV_ACCERR. - read = true - write = false - execute = false default: info.Code = 2 } - if !user { - read = true - write = false - execute = true - - } accessType := usermem.AccessType{ - Read: read, - Write: write, - Execute: execute, + Read: !isWriteFault(uint64(code)), + Write: isWriteFault(uint64(code)), + Execute: isInstructionAbort(uint64(code)), } return accessType, platform.ErrContextSignal } - -// retryInGuest runs the given function in guest mode. -// -// If the function does not complete in guest mode (due to execution of a -// system call due to a GC stall, for example), then it will be retried. The -// given function must be idempotent as a result of the retry mechanism. -func (m *machine) retryInGuest(fn func()) { - c := m.Get() - defer m.Put(c) - for { - c.ClearErrorCode() // See below. - bluepill(c) // Force guest mode. - fn() // Execute the given function. - _, user := c.ErrorCode() - if user { - // If user is set, then we haven't bailed back to host - // mode via a kernel exception or system call. We - // consider the full function to have executed in guest - // mode and we can return. - break - } - } -} -- cgit v1.2.3 From f82dd8ddb477b8923d1db12654a62d55d613019c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 Jul 2020 21:22:52 -0700 Subject: Redirect TODO to GitHub issues PiperOrigin-RevId: 323715260 --- pkg/sentry/socket/netstack/netstack.go | 4 ++-- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/stack_test.go | 6 +++--- pkg/tcpip/transport/packet/endpoint.go | 4 ++-- test/syscalls/linux/packet_socket.cc | 4 ++-- test/syscalls/linux/packet_socket_raw.cc | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 44b3fff46..f86e6cd7a 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -423,7 +423,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } - // TODO(b/129292371): Return protocol too. + // TODO(gvisor.dev/issue/173): Return protocol too. return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), @@ -2418,7 +2418,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) return &out, uint32(sockAddrInet6Size) case linux.AF_PACKET: - // TODO(b/129292371): Return protocol too. + // TODO(gvisor.dev/issue/173): Return protocol too. var out linux.SockAddrLink out.Family = linux.AF_PACKET out.InterfaceIndex = int32(addr.NIC) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index fea0ce7e8..9256d4d43 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -181,7 +181,7 @@ func (n *NIC) disableLocked() *tcpip.Error { return nil } - // TODO(b/147015577): Should Routes that are currently bound to n be + // TODO(gvisor.dev/issue/1491): Should Routes that are currently bound to n be // invalidated? Currently, Routes will continue to work when a NIC is enabled // again, and applications may not know that the underlying NIC was ever // disabled. diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 7657a4101..101ca2206 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -867,9 +867,9 @@ func TestRouteWithDownNIC(t *testing.T) { // Writes with Routes that use NIC1 after being brought up should // succeed. // - // TODO(b/147015577): Should we instead completely invalidate all - // Routes that were bound to a NIC that was brought down at some - // point? + // TODO(gvisor.dev/issue/1491): Should we instead completely + // invalidate all Routes that were bound to a NIC that was brought + // down at some point? if err := upFn(s, nicID1); err != nil { t.Fatalf("test.upFn(_, %d): %s", nicID1, err) } diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 0e46e6355..df478115d 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -193,7 +193,7 @@ func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMes } func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { - // TODO(b/129292371): Implement. + // TODO(gvisor.dev/issue/173): Implement. return 0, nil, tcpip.ErrInvalidOptionValue } @@ -432,7 +432,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, // Push new packet into receive list and increment the buffer size. var packet packet - // TODO(b/129292371): Return network protocol. + // TODO(gvisor.dev/issue/173): Return network protocol. if len(pkt.LinkHeader) > 0 { // Get info directly from the ethernet header. hdr := header.Ethernet(pkt.LinkHeader) diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc index 40aa9326d..861617ff7 100644 --- a/test/syscalls/linux/packet_socket.cc +++ b/test/syscalls/linux/packet_socket.cc @@ -188,7 +188,7 @@ void ReceiveMessage(int sock, int ifindex) { // sizeof(sockaddr_ll). ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); - // TODO(b/129292371): Verify protocol once we return it. + // TODO(gvisor.dev/issue/173): Verify protocol once we return it. // Verify the source address. EXPECT_EQ(src.sll_family, AF_PACKET); EXPECT_EQ(src.sll_ifindex, ifindex); @@ -234,7 +234,7 @@ TEST_P(CookedPacketTest, Receive) { // Send via a packet socket. TEST_P(CookedPacketTest, Send) { - // TODO(b/129292371): Remove once we support packet socket writing. + // TODO(gvisor.dev/issue/173): Remove once we support packet socket writing. SKIP_IF(IsRunningOnGvisor()); // Let's send a UDP packet and receive it using a regular UDP socket. diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc index 2fca9fe4d..a11a03415 100644 --- a/test/syscalls/linux/packet_socket_raw.cc +++ b/test/syscalls/linux/packet_socket_raw.cc @@ -195,7 +195,7 @@ TEST_P(RawPacketTest, Receive) { // sizeof(sockaddr_ll). ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); - // TODO(b/129292371): Verify protocol once we return it. + // TODO(gvisor.dev/issue/173): Verify protocol once we return it. // Verify the source address. EXPECT_EQ(src.sll_family, AF_PACKET); EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex()); @@ -240,7 +240,7 @@ TEST_P(RawPacketTest, Receive) { // Send via a packet socket. TEST_P(RawPacketTest, Send) { - // TODO(b/129292371): Remove once we support packet socket writing. + // TODO(gvisor.dev/issue/173): Remove once we support packet socket writing. SKIP_IF(IsRunningOnGvisor()); // Let's send a UDP packet and receive it using a regular UDP socket. -- cgit v1.2.3 From 4cd4759238655e35c8dc63723f4e55014b5ea9ea Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 29 Jul 2020 14:56:14 -0700 Subject: Force registration for EPOLLHUP, not EPOLLRDHUP, in vfs2's epoll. Compare Linux's fs/eventpoll.c:do_epoll_ctl(). I don't know where EPOLLRDHUP came from. PiperOrigin-RevId: 323874419 --- pkg/sentry/vfs/epoll.go | 4 ++-- test/syscalls/linux/epoll.cc | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 599c3131c..5b009b928 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -186,7 +186,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin } // Register interest in file. - mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP epi := &epollInterest{ epoll: ep, key: key, @@ -257,7 +257,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event } // Update epi for the next call to ep.ReadEvents(). - mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP ep.mu.Lock() epi.mask = mask epi.userData = event.Data diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index f57d38dc7..2101e5c9f 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -422,6 +422,28 @@ TEST(EpollTest, CloseFile) { SyscallSucceedsWithValue(0)); } +TEST(EpollTest, PipeReaderHupAfterWriterClosed) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + int pipefds[2]; + ASSERT_THAT(pipe(pipefds), SyscallSucceeds()); + FileDescriptor rfd(pipefds[0]); + FileDescriptor wfd(pipefds[1]); + + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), rfd.get(), 0, kMagicConstant)); + struct epoll_event result[kFDsPerEpoll]; + // Initially, rfd should not generate any events of interest. + ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, 0), + SyscallSucceedsWithValue(0)); + // Close the write end of the pipe. + wfd.reset(); + // rfd should now generate EPOLLHUP, which EPOLL_CTL_ADD unconditionally adds + // to the set of events of interest. + ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, 0), + SyscallSucceedsWithValue(1)); + EXPECT_EQ(result[0].events, EPOLLHUP); + EXPECT_EQ(result[0].data.u64, kMagicConstant); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 2e19a8b951e9402b28b4e601e65c51e69c815db1 Mon Sep 17 00:00:00 2001 From: Jinmou Li Date: Sat, 27 Jun 2020 02:14:56 +0000 Subject: Add FUSE_INIT This change allows the sentry to send FUSE_INIT request and process the reply. It adds the corresponding structs, employs the fuse device to send and read the message, and stores the results of negotiation in corresponding places (inside connection struct). It adds a CallAsync() function to the FUSE connection interface: - like Call(), but it's for requests that do not expect immediate response (init, release, interrupt etc.) - will block if the connection hasn't initialized, which is the same for Call() --- pkg/abi/linux/fuse.go | 105 ++++++++++++++++ pkg/sentry/fsimpl/fuse/BUILD | 4 +- pkg/sentry/fsimpl/fuse/connection.go | 234 +++++++++++++++++++++++++++++++---- pkg/sentry/fsimpl/fuse/dev.go | 13 +- pkg/sentry/fsimpl/fuse/dev_test.go | 19 ++- pkg/sentry/fsimpl/fuse/fusefs.go | 22 ++-- pkg/sentry/fsimpl/fuse/init.go | 166 +++++++++++++++++++++++++ pkg/syserror/syserror.go | 1 + 8 files changed, 512 insertions(+), 52 deletions(-) create mode 100644 pkg/sentry/fsimpl/fuse/init.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index d3ebbccc4..5c6ffe4a3 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -141,3 +141,108 @@ type FUSEWriteIn struct { _ uint32 } + +// FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. +const ( + FUSE_ASYNC_READ = 1 << 0 + FUSE_POSIX_LOCKS = 1 << 1 + FUSE_FILE_OPS = 1 << 2 + FUSE_ATOMIC_O_TRUNC = 1 << 3 + FUSE_EXPORT_SUPPORT = 1 << 4 + FUSE_BIG_WRITES = 1 << 5 + FUSE_DONT_MASK = 1 << 6 + FUSE_SPLICE_WRITE = 1 << 7 + FUSE_SPLICE_MOVE = 1 << 8 + FUSE_SPLICE_READ = 1 << 9 + FUSE_FLOCK_LOCKS = 1 << 10 + FUSE_HAS_IOCTL_DIR = 1 << 11 + FUSE_AUTO_INVAL_DATA = 1 << 12 + FUSE_DO_READDIRPLUS = 1 << 13 + FUSE_READDIRPLUS_AUTO = 1 << 14 + FUSE_ASYNC_DIO = 1 << 15 + FUSE_WRITEBACK_CACHE = 1 << 16 + FUSE_NO_OPEN_SUPPORT = 1 << 17 + FUSE_PARALLEL_DIROPS = 1 << 18 + FUSE_HANDLE_KILLPRIV = 1 << 19 + FUSE_POSIX_ACL = 1 << 20 + FUSE_ABORT_ERROR = 1 << 21 + FUSE_MAX_PAGES = 1 << 22 + FUSE_CACHE_SYMLINKS = 1 << 23 + FUSE_NO_OPENDIR_SUPPORT = 1 << 24 + FUSE_EXPLICIT_INVAL_DATA = 1 << 25 + FUSE_MAP_ALIGNMENT = 1 << 26 +) + +// currently supported FUSE protocol version numbers. +const ( + FUSE_KERNEL_VERSION = 7 + FUSE_KERNEL_MINOR_VERSION = 31 +) + +// FUSEInitIn is the request sent by the kernel to the daemon, +// to negotiate the version and flags. +// +// +marshal +type FUSEInitIn struct { + // Major version supported by kernel. + Major uint32 + + // Minor version supported by the kernel. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead + // decided by the kernel. + MaxReadahead uint32 + + // Flags of this init request. + Flags uint32 +} + +// FUSEInitOut is the reply sent by the daemon to the kernel +// for FUSEInitIn. +// +// +marshal +type FUSEInitOut struct { + // Major version supported by daemon. + Major uint32 + + // Minor version supported by daemon. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead. + // Decided by the daemon, after receiving the value from kernel. + MaxReadahead uint32 + + // Flags of this init reply. + Flags uint32 + + // MaxBackground is the maximum number of pending background requests + // that the daemon wants. + MaxBackground uint16 + + // CongestionThreshold is the daemon-decided threshold for + // the number of the pending background requests. + CongestionThreshold uint16 + + // MaxWrite is the daemon's maximum size of a write buffer. + // Kernel adjusts it to the minimum (fuse/init.go:fuseMinMaxWrite). + // if the value from daemon is too small. + MaxWrite uint32 + + // TimeGran is the daemon's time granularity for mtime and ctime metadata. + // The unit is nanosecond. + // Value should be power of 10. + // 1 indicates full nanosecond granularity support. + TimeGran uint32 + + // MaxPages is the daemon's maximum number of pages for one write operation. + // Kernel adjusts it to the maximum (fuse/init.go:FUSE_MAX_MAX_PAGES). + // if the value from daemon is too large. + MaxPages uint16 + + // MapAlignment is an unknown field and not used by this package at this moment. + // Use as a placeholder to be consistent with the FUSE protocol. + MapAlignment uint16 + + _ [8]uint32 +} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 67649e811..999111deb 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -21,6 +21,7 @@ go_library( "connection.go", "dev.go", "fusefs.go", + "init.go", "register.go", "request_list.go", ], @@ -44,14 +45,13 @@ go_library( ) go_test( - name = "dev_test", + name = "fuse_test", size = "small", srcs = ["dev_test.go"], library = ":fuse", deps = [ "//pkg/abi/linux", "//pkg/sentry/fsimpl/testutil", - "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index f330da0bd..6df2728ab 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -17,6 +17,8 @@ package fuse import ( "errors" "fmt" + "sync" + "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,18 +27,29 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" "gvisor.dev/gvisor/tools/go_marshal/marshal" ) -// MaxActiveRequestsDefault is the default setting controlling the upper bound +// maxActiveRequestsDefault is the default setting controlling the upper bound // on the number of active requests at any given time. -const MaxActiveRequestsDefault = 10000 +const maxActiveRequestsDefault = 10000 -var ( - // Ordinary requests have even IDs, while interrupts IDs are odd. - InitReqBit uint64 = 1 - ReqIDStep uint64 = 2 +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + +const ( + // fuseDefaultMaxBackground is the default value for MaxBackground. + fuseDefaultMaxBackground = 12 + + // fuseDefaultCongestionThreshold is the default value for CongestionThreshold, + // and is 75% of the default maximum of MaxGround. + fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4) + + // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq. + fuseDefaultMaxPagesPerReq = 32 ) // Request represents a FUSE operation request that hasn't been sent to the @@ -61,17 +74,125 @@ type Response struct { data []byte } -// Connection is the struct by which the sentry communicates with the FUSE server daemon. -type Connection struct { +// connection is the struct by which the sentry communicates with the FUSE server daemon. +type connection struct { fd *DeviceFD - // MaxWrite is the daemon's maximum size of a write buffer. - // This is negotiated during FUSE_INIT. - MaxWrite uint32 + // The following FUSE_INIT flags are currently unsupported by this implementation: + // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC) + // - FUSE_EXPORT_SUPPORT + // - FUSE_HANDLE_KILLPRIV + // - FUSE_POSIX_LOCKS: requires POSIX locks + // - FUSE_FLOCK_LOCKS: requires POSIX locks + // - FUSE_AUTO_INVAL_DATA: requires page caching eviction + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction + // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation + // - FUSE_ASYNC_DIO + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler + + // initialized after receiving FUSE_INIT reply. + // Until it's set, suspend sending FUSE requests. + // Use SetInitialized() and IsInitialized() for atomic access. + initialized int32 + + // initializedChan is used to block requests before initialization. + initializedChan chan struct{} + + // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground). + // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block. + blocked bool + + // connected (connection established) when a new FUSE file system is created. + // Set to false when: + // umount, + // connection abort, + // device release. + connected bool + + // aborted via sysfs. + // TODO(gvisor.dev/issue/3185): abort all queued requests. + aborted bool + + // connInitError if FUSE_INIT encountered error (major version mismatch). + // Only set in INIT. + connInitError bool + + // connInitSuccess if FUSE_INIT is successful. + // Only set in INIT. + // Used for destory. + connInitSuccess bool + + // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress. + + // NumberBackground is the number of requests in the background. + numBackground uint16 + + // congestionThreshold for NumBackground. + // Negotiated in FUSE_INIT. + congestionThreshold uint16 + + // maxBackground is the maximum number of NumBackground. + // Block connection when it is reached. + // Negotiated in FUSE_INIT. + maxBackground uint16 + + // numActiveBackground is the number of requests in background and has being marked as active. + numActiveBackground uint16 + + // numWating is the number of requests waiting for completion. + numWaiting uint32 + + // TODO(gvisor.dev/issue/3185): BgQueue + // some queue for background queued requests. + + // bgLock protects: + // MaxBackground, CongestionThreshold, NumBackground, + // NumActiveBackground, BgQueue, Blocked. + bgLock sync.Mutex + + // maxRead is the maximum size of a read buffer in in bytes. + maxRead uint32 + + // maxWrite is the maximum size of a write buffer in bytes. + // Negotiated in FUSE_INIT. + maxWrite uint32 + + // maxPages is the maximum number of pages for a single request to use. + // Negotiated in FUSE_INIT. + maxPages uint16 + + // minor version of the FUSE protocol. + // Negotiated and only set in INIT. + minor uint32 + + // asyncRead if read pages asynchronously. + // Negotiated and only set in INIT. + asyncRead bool + + // abortErr is true if kernel need to return an unique read error after abort. + // Negotiated and only set in INIT. + abortErr bool + + // writebackCache is true for write-back cache policy, + // false for write-through policy. + // Negotiated and only set in INIT. + writebackCache bool + + // cacheSymlinks if filesystem needs to cache READLINK responses in page cache. + // Negotiated and only set in INIT. + cacheSymlinks bool + + // bigWrites if doing multi-page cached writes. + // Negotiated and only set in INIT. + bigWrites bool + + // dontMask if filestestem does not apply umask to creation modes. + // Negotiated in INIT. + dontMask bool } -// NewFUSEConnection creates a FUSE connection to fd -func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*Connection, error) { +// newFUSEConnection creates a FUSE connection to fd. +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) { // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to // mount a FUSE filesystem. fuseFD := fd.Impl().(*DeviceFD) @@ -84,16 +205,41 @@ func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRe fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) fuseFD.writeCursor = 0 - return &Connection{ - fd: fuseFD, + return &connection{ + fd: fuseFD, + maxBackground: fuseDefaultMaxBackground, + congestionThreshold: fuseDefaultCongestionThreshold, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) +} + +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + // NewRequest creates a new request that can be sent to the FUSE server. -func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(ReqIDStep) + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() hdr := linux.FUSEHeaderIn{ @@ -118,13 +264,49 @@ func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint } // Call makes a request to the server and blocks the invoking task until a -// server responds with a response. -// NOTE: If no task is provided then the Call will simply enqueue the request -// and return a nil response. No blocking will happen in this case. Instead, -// this is used to signify that the processing of this request will happen by -// the kernel.Task that writes the response. See FUSE_INIT for such an -// invocation. -func (conn *Connection) Call(t *kernel.Task, r *Request) (*Response, error) { +// server responds with a response. Task should never be nil. +// Requests will not be sent before the connection is initialized. +// For async tasks, use CallAsync(). +func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { + // Block requests sent before connection is initalized. + if !conn.Initialized() { + if err := t.Block(conn.initializedChan); err != nil { + return nil, err + } + } + + return conn.call(t, r) +} + +// CallAsync makes an async (aka background) request. +// Those requests either do not expect a response (e.g. release) or +// the response should be handled by others (e.g. init). +// Return immediately unless the connection is blocked (before initialization). +// Async call example: init, release, forget, aio, interrupt. +// When the Request is FUSE_INIT, it will not be blocked before initialization. +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + // Block requests sent before connection is initalized. + if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { + if err := t.Block(conn.initializedChan); err != nil { + return err + } + } + + // This should be the only place that invokes call() with a nil task. + _, err := conn.call(nil, r) + return err +} + +// call makes a call without blocking checks. +func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { + if !conn.connected { + return nil, syserror.ENOTCONN + } + + if conn.connInitError { + return nil, syserror.ECONNREFUSED + } + fut, err := conn.callFuture(t, r) if err != nil { return nil, err @@ -160,7 +342,7 @@ func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. -func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { +func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() @@ -195,7 +377,7 @@ func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, } // callFutureLocked makes a request to the server and returns a future response. -func (conn *Connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { +func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { conn.fd.queue.PushBack(r) conn.fd.numActiveRequests += 1 fut := newFutureResponse(r.hdr.Opcode) diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index f3443ac71..2225076bc 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -98,7 +99,9 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release() {} +func (fd *DeviceFD) Release() { + fd.fs.conn.connected = false +} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { @@ -124,7 +127,7 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R minBuffSize := linux.FUSE_MIN_READ_BUFFER inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) - negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.MaxWrite + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite if minBuffSize < negotiatedMinBuffSize { minBuffSize = negotiatedMinBuffSize } @@ -385,9 +388,9 @@ func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) er // FUSE_INIT. func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { if r.opcode == linux.FUSE_INIT { - // TODO: process init response here. - // Maybe get the creds from the context? - // creds := auth.CredentialsFromContext(ctx) + creds := auth.CredentialsFromContext(ctx) + rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() + return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) } return nil diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index fcd77832a..122415275 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -60,25 +59,25 @@ func TestFUSECommunication(t *testing.T) { Name: "SingleClientSingleServer", NumClients: 1, NumServers: 1, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "SingleClientMultipleServers", NumClients: 1, NumServers: 10, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "MultipleClientsSingleServer", NumClients: 10, NumServers: 1, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "MultipleClientsMultipleServers", NumClients: 10, NumServers: 10, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "RequestCapacityFull", @@ -145,7 +144,7 @@ func TestFUSECommunication(t *testing.T) { // CallTest makes a request to the server and blocks the invoking // goroutine until a server responds with a response. Doesn't block // a kernel.Task. Analogous to Connection.Call but used for testing. -func CallTest(conn *Connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { +func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { conn.fd.mu.Lock() // Wait until we're certain that a new request can be processed. @@ -214,7 +213,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem. // fuseClientRun emulates all the actions of a normal FUSE request. It creates // a header, a payload, calls the server, waits for the response, and processes // the response. -func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *Connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { +func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { defer func() { clientDone <- struct{}{} }() tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) @@ -343,7 +342,7 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } @@ -353,7 +352,7 @@ func setup(t *testing.T) *testutil.System { // newTestConnection creates a fuse connection that the sentry can communicate with // and the FD for the server to communicate with. -func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*Connection, *vfs.FileDescription, error) { +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { vfsObj := &vfs.VirtualFilesystem{} fuseDev := &DeviceFD{} @@ -426,4 +425,4 @@ func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) // WriteTo implements io.WriterTo.WriteTo. func (t *testPayload) WriteTo(w io.Writer) (int64, error) { panic("not implemented") -} +} \ No newline at end of file diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 911b6f7cb..200a93bbf 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -65,7 +65,7 @@ type filesystem struct { // conn is used for communication between the FUSE server // daemon and the sentry fusefs. - conn *Connection + conn *connection // opts is the options the fusefs is initialized with. opts *filesystemOptions @@ -140,7 +140,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts.rootMode = rootMode // Set the maxInFlightRequests option. - fsopts.maxActiveRequests = MaxActiveRequestsDefault + fsopts.maxActiveRequests = maxActiveRequestsDefault // Check for unparsed options. if len(mopts) != 0 { @@ -157,8 +157,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - // TODO: dispatch a FUSE_INIT request to the FUSE daemon server before - // returning. Mount will not block on this dispatched request. + // Send a FUSE_INIT request to the FUSE daemon server before returning. + // This call is not blocking. + if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { + log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) + return nil, nil, err + } // root is the fusefs root directory. root := fs.newInode(creds, fsopts.rootMode) @@ -173,7 +177,7 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt opts: opts, } - conn, err := NewFUSEConnection(ctx, device, opts.maxActiveRequests) + conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL @@ -192,8 +196,8 @@ func (fs *filesystem) Release() { fs.Filesystem.Release() } -// Inode implements kernfs.Inode. -type Inode struct { +// inode implements kernfs.Inode. +type inode struct { kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -206,7 +210,7 @@ type Inode struct { } func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &Inode{} + i := &inode{} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.dentry.Init(i) @@ -215,7 +219,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke } // Open implements kernfs.Inode.Open. -func (i *Inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go new file mode 100644 index 000000000..779c2bd3f --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/init.go @@ -0,0 +1,166 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// consts used by FUSE_INIT negotiation. +const ( + // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut. + // Follow the same behavior as unix fuse implementation. + fuseMaxMaxPages = 256 + + // Maximum value for the time granularity for file time stamps, 1s. + // Follow the same behavior as unix fuse implementation. + fuseMaxTimeGranNs = 1000000000 + + // Minimum value for MaxWrite. + // Follow the same behavior as unix fuse implementation. + fuseMinMaxWrite = 4096 + + // Temporary default value for max readahead, 128kb. + fuseDefaultMaxReadahead = 131072 + + // The FUSE_INIT_IN flags sent to the daemon. + // TODO(gvisor.dev/issue/3199): complete the flags. + fuseDefaultInitFlags = linux.FUSE_MAX_PAGES +) + +// Adjustable maximums for Connection's cogestion control parameters. +// Used as the upperbound of the config values. +// Currently we do not support adjustment to them. +var ( + MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground + MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold +) + +// InitSend sends a FUSE_INIT request. +func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { + in := linux.FUSEInitIn{ + Major: linux.FUSE_KERNEL_VERSION, + Minor: linux.FUSE_KERNEL_MINOR_VERSION, + // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this + MaxReadahead: fuseDefaultMaxReadahead, + Flags: fuseDefaultInitFlags, + } + + req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) + if err != nil { + return err + } + + // Since there is no task to block on and FUSE_INIT is the request + // to unblock other requests, use nil. + return conn.CallAsync(nil, req) +} + +// InitRecv receives a FUSE_INIT reply and process it. +func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { + if err := res.Error(); err != nil { + return err + } + + var out linux.FUSEInitOut + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + return conn.initProcessReply(&out, hasSysAdminCap) +} + +// Process the FUSE_INIT reply from the FUSE server. +func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { + // No support for old major fuse versions. + if out.Major != linux.FUSE_KERNEL_VERSION { + conn.connInitError = true + + // Set the connection as initialized and unblock the blocked requests + // (i.e. return error for them). + conn.SetInitialized() + + return nil + } + + // Start processing the reply. + conn.connInitSuccess = true + conn.minor = out.Minor + + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.bgLock.Lock() + + if out.MaxBackground > 0 { + conn.maxBackground = out.MaxBackground + + if !hasSysAdminCap && + conn.maxBackground > MaxUserBackgroundRequest { + conn.maxBackground = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.congestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.congestionThreshold > MaxUserCongestionThreshold { + conn.congestionThreshold = MaxUserCongestionThreshold + } + } + + conn.bgLock.Unlock() + } + + // No support for the following flags before minor version 6. + if out.Minor >= 6 { + conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0 + conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 + conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 + conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 + conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0 + conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0 + + // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). + + if out.Flags&linux.FUSE_MAX_PAGES != 0 { + maxPages := out.MaxPages + if maxPages < 1 { + maxPages = 1 + } + if maxPages > fuseMaxMaxPages { + maxPages = fuseMaxMaxPages + } + conn.maxPages = maxPages + } + } + + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite + } + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite + } + + // Set connection as initialized and unblock the requests + // issued before init. + conn.SetInitialized() + + return nil +} diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index c73072c42..798e07b01 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -61,6 +61,7 @@ var ( ENOMEM = error(syscall.ENOMEM) ENOSPC = error(syscall.ENOSPC) ENOSYS = error(syscall.ENOSYS) + ENOTCONN = error(syscall.ENOTCONN) ENOTDIR = error(syscall.ENOTDIR) ENOTEMPTY = error(syscall.ENOTEMPTY) ENOTSOCK = error(syscall.ENOTSOCK) -- cgit v1.2.3 From bb25c9611ba9cb12ab42b08a3a91a095b6f7c824 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Thu, 30 Jul 2020 03:42:46 -0400 Subject: add usr-tls test cases for Arm64 Signed-off-by: Bin Lu --- pkg/sentry/platform/kvm/testutil/testutil_arm64.go | 4 ++++ pkg/sentry/platform/kvm/testutil/testutil_arm64.s | 14 ++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index ca902c8c1..4dad877ba 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -56,5 +56,9 @@ func CheckTestRegs(regs *arch.Registers, full bool) (err error) { err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need) } } + // Check tls. + if need := ^uint64(11); regs.TPIDR_EL0 != need { + err = addRegisterMismatch(err, "tpdir_el0", regs.TPIDR_EL0, need) + } return } diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s index 07658144e..6caf7282d 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -52,6 +52,8 @@ start: TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS + // gc will touch fpsimd, so we should test it. + // such as in . FMOVD $(9.9), F0 MOVD $SYS_GETPID, R8 // getpid SVC @@ -102,11 +104,15 @@ isNaN: TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 TWIDDLE_REGS() + MSR R10, TPIDR_EL0 + // Trapped in el0_svc. SVC RET // never reached TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 - TWIDDLE_REGS() - // Branch to Register branches unconditionally to an address in . - JMP (R4) // <=> br x4, must fault - RET // never reached + TWIDDLE_REGS() + MSR R10, TPIDR_EL0 + // Trapped in el0_ia. + // Branch to Register branches unconditionally to an address in . + JMP (R6) // <=> br x6, must fault + RET // never reached -- cgit v1.2.3 From c43305731e04ce8d4f2c9b2949c326abc9c2b77e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 30 Jul 2020 10:21:12 -0700 Subject: Fix SETOWN_EX return value. Return on success should be 0, not size of the struct copied out. PiperOrigin-RevId: 324029193 --- pkg/sentry/syscalls/linux/sys_file.go | 8 ++++---- pkg/sentry/syscalls/linux/vfs2/fd.go | 4 ++-- test/syscalls/linux/fcntl.cc | 38 ++++++++++++++++++++--------------- 3 files changed, 28 insertions(+), 22 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2797c6a72..8cf6401e7 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1057,7 +1057,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETOWN_EX: addr := args[2].Pointer() var owner linux.FOwnerEx - n, err := t.CopyIn(addr, &owner) + _, err := t.CopyIn(addr, &owner) if err != nil { return 0, nil, err } @@ -1069,21 +1069,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ESRCH } a.SetOwnerTask(t, task) - return uintptr(n), nil, nil + return 0, nil, nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) if tg == nil { return 0, nil, syserror.ESRCH } a.SetOwnerThreadGroup(t, tg) - return uintptr(n), nil, nil + return 0, nil, nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) if pg == nil { return 0, nil, syserror.ESRCH } a.SetOwnerProcessGroup(t, pg) - return uintptr(n), nil, nil + return 0, nil, nil default: return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 517394ba9..67f191551 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -185,11 +185,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err case linux.F_SETOWN_EX: var owner linux.FOwnerEx - n, err := t.CopyIn(args[2].Pointer(), &owner) + _, err := t.CopyIn(args[2].Pointer(), &owner) if err != nil { return 0, nil, err } - return uintptr(n), nil, setAsyncOwner(t, file, owner.Type, owner.PID) + return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID) case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 5467fa2c8..34016d4bd 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -1004,7 +1004,8 @@ TEST(FcntlTest, SetOwnPid) { pid_t pid; EXPECT_THAT(pid = getpid(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(pid)); @@ -1018,7 +1019,8 @@ TEST(FcntlTest, SetOwnPgrp) { pid_t pgid; EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), + SyscallSucceedsWithValue(0)); // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the // negative return value as an error, converting the return value to -1 and @@ -1038,8 +1040,10 @@ TEST(FcntlTest, SetOwnUnset) { // Set and unset pid. pid_t pid; EXPECT_THAT(pid = getpid(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1047,8 +1051,10 @@ TEST(FcntlTest, SetOwnUnset) { // Set and unset pgid. pid_t pgid; EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds()); - ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds()); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1120,7 +1126,7 @@ TEST(FcntlTest, SetOwnExTid) { EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(owner.pid)); @@ -1136,7 +1142,7 @@ TEST(FcntlTest, SetOwnExPid) { EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(owner.pid)); @@ -1152,7 +1158,7 @@ TEST(FcntlTest, SetOwnExPgrp) { EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the // negative return value as an error, converting the return value to -1 and @@ -1176,10 +1182,10 @@ TEST(FcntlTest, SetOwnExUnset) { owner.type = F_OWNER_PID; EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); owner.pid = 0; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1188,10 +1194,10 @@ TEST(FcntlTest, SetOwnExUnset) { owner.type = F_OWNER_PGRP; EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); owner.pid = 0; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); @@ -1207,7 +1213,7 @@ TEST(FcntlTest, GetOwnExTid) { EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); f_owner_ex got_owner = {}; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), @@ -1225,7 +1231,7 @@ TEST(FcntlTest, GetOwnExPid) { EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); f_owner_ex got_owner = {}; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), @@ -1243,7 +1249,7 @@ TEST(FcntlTest, GetOwnExPgrp) { EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds()); ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner), - SyscallSucceeds()); + SyscallSucceedsWithValue(0)); f_owner_ex got_owner = {}; ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner), -- cgit v1.2.3 From 68a7da9549ac4f6290fe85d2dab550e11f16b209 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 31 Jul 2020 00:38:57 -0700 Subject: Clean up vfs2 fallocate. Move to setstat.go and add a FileDescription wrapper method. PiperOrigin-RevId: 324165277 --- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 50 ---------------------------- pkg/sentry/syscalls/linux/vfs2/setstat.go | 50 ++++++++++++++++++++++++++++ pkg/sentry/vfs/file_description.go | 7 +++- 3 files changed, 56 insertions(+), 51 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 6b14c2bef..b6d2ddd65 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -245,55 +244,6 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd }) } -// Fallocate implements linux system call fallocate(2). -func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - mode := args[1].Uint64() - offset := args[2].Int64() - length := args[3].Int64() - - file := t.GetFileVFS2(fd) - - if file == nil { - return 0, nil, syserror.EBADF - } - defer file.DecRef() - - if !file.IsWritable() { - return 0, nil, syserror.EBADF - } - - if mode != 0 { - return 0, nil, syserror.ENOTSUP - } - - if offset < 0 || length <= 0 { - return 0, nil, syserror.EINVAL - } - - size := offset + length - - if size < 0 { - return 0, nil, syserror.EFBIG - } - - limit := limits.FromContext(t).Get(limits.FileSize).Cur - - if uint64(size) >= limit { - t.SendSignal(&arch.SignalInfo{ - Signo: int32(linux.SIGXFSZ), - Code: arch.SignalInfoUser, - }) - return 0, nil, syserror.EFBIG - } - - return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length)) - - // File length modified, generate notification. - // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. - // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) -} - // Rmdir implements Linux syscall rmdir(2). func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 6daedd173..37fa56c19 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -211,6 +212,55 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, handleSetSizeError(t, err) } +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) + + // File length modified, generate notification. + // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. + // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) +} + // Utime implements Linux syscall utime(2). func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 0c42574db..93861fb4a 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -354,7 +354,7 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) - // Allocate grows file represented by FileDescription to offset + length bytes. + // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. Allocate(ctx context.Context, mode, offset, length uint64) error @@ -563,6 +563,11 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.impl.StatFS(ctx) } +// Allocate grows file represented by FileDescription to offset + length bytes. +func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.impl.Allocate(ctx, mode, offset, length) +} + // Readiness implements waiter.Waitable.Readiness. // // It returns fd's I/O readiness. -- cgit v1.2.3 From 2a7b2a61e3ea32129c26eeaa6fab3d81a5d8ad6e Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 11 Jun 2020 20:33:56 -0700 Subject: iptables: support SO_ORIGINAL_DST Envoy (#170) uses this to get the original destination of redirected packets. --- pkg/abi/linux/netfilter.go | 8 +- pkg/abi/linux/socket.go | 4 +- pkg/sentry/socket/netstack/netstack.go | 17 ++++ pkg/sentry/strace/socket.go | 1 + pkg/tcpip/stack/conntrack.go | 26 ++++++ pkg/tcpip/stack/iptables.go | 11 ++- pkg/tcpip/tcpip.go | 4 + pkg/tcpip/transport/tcp/endpoint.go | 11 +++ test/iptables/BUILD | 1 + test/iptables/iptables_test.go | 8 ++ test/iptables/iptables_unsafe.go | 63 ++++++++++++++ test/iptables/iptables_util.go | 51 ++++++++++- test/iptables/nat.go | 152 ++++++++++++++++++++++++++++++++- 13 files changed, 344 insertions(+), 13 deletions(-) create mode 100644 test/iptables/iptables_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index a91f9f018..9c27f7bb2 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -59,7 +59,7 @@ var VerdictStrings = map[int32]string{ NF_RETURN: "RETURN", } -// Socket options. These correspond to values in +// Socket options for SOL_SOCKET. These correspond to values in // include/uapi/linux/netfilter_ipv4/ip_tables.h. const ( IPT_BASE_CTL = 64 @@ -74,6 +74,12 @@ const ( IPT_SO_GET_MAX = IPT_SO_GET_REVISION_TARGET ) +// Socket option for SOL_IP. This corresponds to the value in +// include/uapi/linux/netfilter_ipv4.h. +const ( + SO_ORIGINAL_DST = 80 +) + // Name lengths. These correspond to values in // include/uapi/linux/netfilter/x_tables.h. const ( diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index c24a8216e..d6946bb82 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -239,11 +239,13 @@ const SockAddrMax = 128 type InetAddr [4]byte // SockAddrInet is struct sockaddr_in, from uapi/linux/in.h. +// +// +marshal type SockAddrInet struct { Family uint16 Port uint16 Addr InetAddr - Zero [8]uint8 // pad to sizeof(struct sockaddr). + _ [8]uint8 // pad to sizeof(struct sockaddr). } // InetMulticastRequest is struct ip_mreq, from uapi/linux/in.h. diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index f86e6cd7a..31a168f7e 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -1490,6 +1490,10 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha vP := primitive.Int32(boolToInt32(v)) return &vP, nil + case linux.SO_ORIGINAL_DST: + // TODO(gvisor.dev/issue/170): ip6tables. + return nil, syserr.ErrInvalidArgument + default: emitUnimplementedEventIPv6(t, name) } @@ -1600,6 +1604,19 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in vP := primitive.Int32(boolToInt32(v)) return &vP, nil + case linux.SO_ORIGINAL_DST: + if outLen < int(binary.Size(linux.SockAddrInet{})) { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OriginalDestinationOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) + return a.(*linux.SockAddrInet), nil + default: emitUnimplementedEventIP(t, name) } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index c0512de89..b51c4c941 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -521,6 +521,7 @@ var sockOptNames = map[uint64]abi.ValueSet{ linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT", linux.IP_PKTOPTIONS: "IP_PKTOPTIONS", linux.IP_MTU: "IP_MTU", + linux.SO_ORIGINAL_DST: "SO_ORIGINAL_DST", }, linux.SOL_SOCKET: { linux.SO_ERROR: "SO_ERROR", diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 559a1c4dd..470c265aa 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -240,7 +240,10 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { if err != nil { return nil, dirOriginal } + return ct.connForTID(tid) +} +func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) { bucket := ct.bucket(tid) now := time.Now() @@ -604,3 +607,26 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo return true } + +func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { + // Lookup the connection. The reply's original destination + // describes the original address. + tid := tupleID{ + srcAddr: epID.LocalAddress, + srcPort: epID.LocalPort, + dstAddr: epID.RemoteAddress, + dstPort: epID.RemotePort, + transProto: header.TCPProtocolNumber, + netProto: header.IPv4ProtocolNumber, + } + conn, _ := ct.connForTID(tid) + if conn == nil { + // Not a tracked connection. + return "", 0, tcpip.ErrNotConnected + } else if conn.manip == manipNone { + // Unmanipulated connection. + return "", 0, tcpip.ErrInvalidOptionValue + } + + return conn.original.dstAddr, conn.original.dstPort, nil +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index cbbae4224..110ba073d 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -218,19 +218,16 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. it.mu.RLock() + defer it.mu.RUnlock() if !it.modified { - it.mu.RUnlock() return true } - it.mu.RUnlock() // Packets are manipulated only if connection and matching // NAT rule exists. shouldTrack := it.connections.handlePacket(pkt, hook, gso, r) // Go through each table containing the hook. - it.mu.RLock() - defer it.mu.RUnlock() priorities := it.priorities[hook] for _, tableID := range priorities { // If handlePacket already NATed the packet, we don't need to @@ -418,3 +415,9 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx // All the matchers matched, so run the target. return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) } + +// OriginalDst returns the original destination of redirected connections. It +// returns an error if the connection doesn't exist or isn't redirected. +func (it *IPTables) OriginalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { + return it.connections.originalDst(epID) +} diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index a634b9b60..45f59b60f 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -954,6 +954,10 @@ type DefaultTTLOption uint8 // classic BPF filter on a given endpoint. type SocketDetachFilterOption int +// OriginalDestinationOption is used to get the original destination address +// and port of a redirected packet. +type OriginalDestinationOption FullAddress + // IPPacketInfo is the message structure for IP_PKTINFO. // // +stateify savable diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 0f7487963..682687ebe 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2017,6 +2017,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = tcpip.TCPDeferAcceptOption(e.deferAccept) e.UnlockUser() + case *tcpip.OriginalDestinationOption: + ipt := e.stack.IPTables() + addr, port, err := ipt.OriginalDst(e.ID) + if err != nil { + return err + } + *o = tcpip.OriginalDestinationOption{ + Addr: addr, + Port: port, + } + default: return tcpip.ErrUnknownProtocolOption } diff --git a/test/iptables/BUILD b/test/iptables/BUILD index 40b63ebbe..66453772a 100644 --- a/test/iptables/BUILD +++ b/test/iptables/BUILD @@ -9,6 +9,7 @@ go_library( "filter_input.go", "filter_output.go", "iptables.go", + "iptables_unsafe.go", "iptables_util.go", "nat.go", ], diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index 550b6198a..fda5f694f 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -371,3 +371,11 @@ func TestFilterAddrs(t *testing.T) { } } } + +func TestNATPreOriginalDst(t *testing.T) { + singleTest(t, NATPreOriginalDst{}) +} + +func TestNATOutOriginalDst(t *testing.T) { + singleTest(t, NATOutOriginalDst{}) +} diff --git a/test/iptables/iptables_unsafe.go b/test/iptables/iptables_unsafe.go new file mode 100644 index 000000000..bd85a8fea --- /dev/null +++ b/test/iptables/iptables_unsafe.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + "syscall" + "unsafe" +) + +type originalDstError struct { + errno syscall.Errno +} + +func (e originalDstError) Error() string { + return fmt.Sprintf("errno (%d) when calling getsockopt(SO_ORIGINAL_DST): %v", int(e.errno), e.errno.Error()) +} + +// SO_ORIGINAL_DST gets the original destination of a redirected packet via +// getsockopt. +const SO_ORIGINAL_DST = 80 + +func originalDestination4(connfd int) (syscall.RawSockaddrInet4, error) { + var addr syscall.RawSockaddrInet4 + var addrLen uint32 = syscall.SizeofSockaddrInet4 + if errno := originalDestination(connfd, syscall.SOL_IP, unsafe.Pointer(&addr), &addrLen); errno != 0 { + return syscall.RawSockaddrInet4{}, originalDstError{errno} + } + return addr, nil +} + +func originalDestination6(connfd int) (syscall.RawSockaddrInet6, error) { + var addr syscall.RawSockaddrInet6 + var addrLen uint32 = syscall.SizeofSockaddrInet6 + if errno := originalDestination(connfd, syscall.SOL_IPV6, unsafe.Pointer(&addr), &addrLen); errno != 0 { + return syscall.RawSockaddrInet6{}, originalDstError{errno} + } + return addr, nil +} + +func originalDestination(connfd int, level uintptr, optval unsafe.Pointer, optlen *uint32) syscall.Errno { + _, _, errno := syscall.Syscall6( + syscall.SYS_GETSOCKOPT, + uintptr(connfd), + level, + SO_ORIGINAL_DST, + uintptr(optval), + uintptr(unsafe.Pointer(optlen)), + 0) + return errno +} diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go index ca80a4b5f..5125fe47b 100644 --- a/test/iptables/iptables_util.go +++ b/test/iptables/iptables_util.go @@ -15,6 +15,8 @@ package iptables import ( + "encoding/binary" + "errors" "fmt" "net" "os/exec" @@ -218,17 +220,58 @@ func filterAddrs(addrs []string, ipv6 bool) []string { // getInterfaceName returns the name of the interface other than loopback. func getInterfaceName() (string, bool) { - var ifname string + iface, ok := getNonLoopbackInterface() + if !ok { + return "", false + } + return iface.Name, true +} + +func getInterfaceAddrs(ipv6 bool) ([]net.IP, error) { + iface, ok := getNonLoopbackInterface() + if !ok { + return nil, errors.New("no non-loopback interface found") + } + addrs, err := iface.Addrs() + if err != nil { + return nil, err + } + + // Get only IPv4 or IPv6 addresses. + ips := make([]net.IP, 0, len(addrs)) + for _, addr := range addrs { + parts := strings.Split(addr.String(), "/") + var ip net.IP + // To16() returns IPv4 addresses as IPv4-mapped IPv6 addresses. + // So we check whether To4() returns nil to test whether the + // address is v4 or v6. + if v4 := net.ParseIP(parts[0]).To4(); ipv6 && v4 == nil { + ip = net.ParseIP(parts[0]).To16() + } else { + ip = v4 + } + if ip != nil { + ips = append(ips, ip) + } + } + return ips, nil +} + +func getNonLoopbackInterface() (net.Interface, bool) { if interfaces, err := net.Interfaces(); err == nil { for _, intf := range interfaces { if intf.Name != "lo" { - ifname = intf.Name - break + return intf, true } } } + return net.Interface{}, false +} - return ifname, ifname != "" +func htons(x uint16) uint16 { + buf := make([]byte, 2) + binary.BigEndian.PutUint16(buf, x) + return binary.LittleEndian.Uint16(buf) } func localIP(ipv6 bool) string { diff --git a/test/iptables/nat.go b/test/iptables/nat.go index ac0d91bb2..b7fea2527 100644 --- a/test/iptables/nat.go +++ b/test/iptables/nat.go @@ -18,12 +18,11 @@ import ( "errors" "fmt" "net" + "syscall" "time" ) -const ( - redirectPort = 42 -) +const redirectPort = 42 func init() { RegisterTestCase(NATPreRedirectUDPPort{}) @@ -42,6 +41,8 @@ func init() { RegisterTestCase(NATOutRedirectInvert{}) RegisterTestCase(NATRedirectRequiresProtocol{}) RegisterTestCase(NATLoopbackSkipsPrerouting{}) + RegisterTestCase(NATPreOriginalDst{}) + RegisterTestCase(NATOutOriginalDst{}) } // NATPreRedirectUDPPort tests that packets are redirected to different port. @@ -471,6 +472,151 @@ func (NATLoopbackSkipsPrerouting) LocalAction(ip net.IP, ipv6 bool) error { return nil } +// NATPreOriginalDst tests that SO_ORIGINAL_DST returns the pre-NAT destination +// of PREROUTING NATted packets. +type NATPreOriginalDst struct{} + +// Name implements TestCase.Name. +func (NATPreOriginalDst) Name() string { + return "NATPreOriginalDst" +} + +// ContainerAction implements TestCase.ContainerAction. +func (NATPreOriginalDst) ContainerAction(ip net.IP, ipv6 bool) error { + // Redirect incoming TCP connections to acceptPort. + if err := natTable(ipv6, "-A", "PREROUTING", + "-p", "tcp", + "--destination-port", fmt.Sprintf("%d", dropPort), + "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort)); err != nil { + return err + } + + addrs, err := getInterfaceAddrs(ipv6) + if err != nil { + return err + } + return listenForRedirectedConn(ipv6, addrs) +} + +// LocalAction implements TestCase.LocalAction. +func (NATPreOriginalDst) LocalAction(ip net.IP, ipv6 bool) error { + return connectTCP(ip, dropPort, sendloopDuration) +} + +// NATOutOriginalDst tests that SO_ORIGINAL_DST returns the pre-NAT destination +// of OUTBOUND NATted packets. +type NATOutOriginalDst struct{} + +// Name implements TestCase.Name. +func (NATOutOriginalDst) Name() string { + return "NATOutOriginalDst" +} + +// ContainerAction implements TestCase.ContainerAction. +func (NATOutOriginalDst) ContainerAction(ip net.IP, ipv6 bool) error { + // Redirect incoming TCP connections to acceptPort. + if err := natTable(ipv6, "-A", "OUTPUT", "-p", "tcp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort)); err != nil { + return err + } + + connCh := make(chan error) + go func() { + connCh <- connectTCP(ip, dropPort, sendloopDuration) + }() + + if err := listenForRedirectedConn(ipv6, []net.IP{ip}); err != nil { + return err + } + return <-connCh +} + +// LocalAction implements TestCase.LocalAction. +func (NATOutOriginalDst) LocalAction(ip net.IP, ipv6 bool) error { + // No-op. + return nil +} + +func listenForRedirectedConn(ipv6 bool, originalDsts []net.IP) error { + // The net package doesn't give guarantee access to the connection's + // underlying FD, and thus we cannot call getsockopt. We have to use + // traditional syscalls for SO_ORIGINAL_DST. + + // Create the listening socket, bind, listen, and accept. + family := syscall.AF_INET + if ipv6 { + family = syscall.AF_INET6 + } + sockfd, err := syscall.Socket(family, syscall.SOCK_STREAM, 0) + if err != nil { + return err + } + defer syscall.Close(sockfd) + + var bindAddr syscall.Sockaddr + if ipv6 { + bindAddr = &syscall.SockaddrInet6{ + Port: acceptPort, + Addr: [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // in6addr_any + } + } else { + bindAddr = &syscall.SockaddrInet4{ + Port: acceptPort, + Addr: [4]byte{0, 0, 0, 0}, // INADDR_ANY + } + } + if err := syscall.Bind(sockfd, bindAddr); err != nil { + return err + } + + if err := syscall.Listen(sockfd, 1); err != nil { + return err + } + + connfd, _, err := syscall.Accept(sockfd) + if err != nil { + return err + } + defer syscall.Close(connfd) + + // Verify that, despite listening on acceptPort, SO_ORIGINAL_DST + // indicates the packet was sent to originalDst:dropPort. + if ipv6 { + got, err := originalDestination6(connfd) + if err != nil { + return err + } + // The original destination could be any of our IPs. + for _, dst := range originalDsts { + want := syscall.RawSockaddrInet6{ + Family: syscall.AF_INET6, + Port: htons(dropPort), + } + copy(want.Addr[:], dst.To16()) + if got == want { + return nil + } + } + return fmt.Errorf("SO_ORIGINAL_DST returned %+v, but wanted one of %+v (note: port numbers are in network byte order)", got, originalDsts) + } else { + got, err := originalDestination4(connfd) + if err != nil { + return err + } + // The original destination could be any of our IPs. + for _, dst := range originalDsts { + want := syscall.RawSockaddrInet4{ + Family: syscall.AF_INET, + Port: htons(dropPort), + } + copy(want.Addr[:], dst.To4()) + if got == want { + return nil + } + } + return fmt.Errorf("SO_ORIGINAL_DST returned %+v, but wanted one of %+v (note: port numbers are in network byte order)", got, originalDsts) + } +} + // loopbackTests runs an iptables rule and ensures that packets sent to // dest:dropPort are received by localhost:acceptPort. func loopbackTest(ipv6 bool, dest net.IP, args ...string) error { -- cgit v1.2.3 From 8908baaf79fdd137241596fd2444828d1a33fe27 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 31 Jul 2020 12:23:01 -0700 Subject: Internal change. PiperOrigin-RevId: 324259991 --- pkg/sentry/fdimport/fdimport.go | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index a4199f9e9..b8686adb4 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -15,6 +15,8 @@ package fdimport import ( + "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" @@ -84,6 +86,9 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("cannot find kernel from context") + } var ttyFile *vfs.FileDescription for appFD, hostFD := range stdioFDs { -- cgit v1.2.3