From e91c1675cd49254936d04f01b814a0cd802ff6de Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Tue, 26 Nov 2019 15:32:45 +0800 Subject: passed the kvm test case of "TestKernelFault" on Arm64 platform Signed-off-by: Bin Lu --- pkg/sentry/platform/ring0/entry_arm64.s | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index 29c475882..0ba4c6b73 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -385,6 +385,16 @@ TEXT ·El1_sync(SB),NOSPLIT,$0 B el1_invalid el1_da: + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + WORD $0xd538601a //MRS FAR_EL1, R26 + + MOVD R26, CPU_FAULT_ADDR(RSV_REG) + + MOVD $0, CPU_ERROR_TYPE(RSV_REG) + + MOVD $PageFault, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + B ·Halt(SB) el1_ia: -- cgit v1.2.3 From fdfa05ff2c99b4a2f7c0b22fc491a268f1f2e164 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 9 Jan 2020 09:01:17 +0000 Subject: Avoid panic when c.PCIDs is nil When PCID is disabled, there would throw a panic when dropPageTables() access to c.PCID without check. Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/kvm/machine_amd64.go | 4 +++- pkg/sentry/platform/kvm/machine_arm64.go | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index b99fe425e..873e39dc7 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -90,7 +90,9 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) { // Clear from all PCIDs. for _, c := range m.vCPUs { - c.PCIDs.Drop(pt) + if c.PCIDs != nil { + c.PCIDs.Drop(pt) + } } } diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 7ae47f291..3b1f20219 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -97,7 +97,9 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) { // Clear from all PCIDs. for _, c := range m.vCPUs { - c.PCIDs.Drop(pt) + if c.PCIDs != nil { + c.PCIDs.Drop(pt) + } } } -- cgit v1.2.3 From debd213da61cf35d7c91346820e93fc87bfa5896 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Mon, 13 Jan 2020 14:45:31 -0800 Subject: Allow dual stack sockets to operate on AF_INET Fixes #1490 Fixes #1495 PiperOrigin-RevId: 289523250 --- pkg/sentry/socket/netstack/netstack.go | 65 +++++++++--- pkg/sentry/socket/unix/unix.go | 5 +- pkg/sentry/strace/socket.go | 2 +- pkg/tcpip/stack/stack.go | 43 ++++++++ pkg/tcpip/transport/icmp/endpoint.go | 22 ++-- pkg/tcpip/transport/tcp/endpoint.go | 23 +--- pkg/tcpip/transport/udp/endpoint.go | 41 ++------ scripts/common.sh | 2 +- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/socket_inet_loopback.cc | 156 ++++++++++++++++++++++++++++ 10 files changed, 278 insertions(+), 82 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 099319327..c020c11cb 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -324,22 +324,15 @@ func bytesToIPAddress(addr []byte) tcpip.Address { // converts it to the FullAddress format. It supports AF_UNIX, AF_INET, // AF_INET6, and AF_PACKET addresses. // -// strict indicates whether addresses with the AF_UNSPEC family are accepted of not. -// // AddressAndFamily returns an address and its family. -func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) { +func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument } - family := usermem.ByteOrder.Uint16(addr) - if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) { - return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported - } - // Get the rest of the fields based on the address family. - switch family { + switch family := usermem.ByteOrder.Uint16(addr); family { case linux.AF_UNIX: path := addr[2:] if len(path) > linux.UnixPathMax { @@ -638,10 +631,40 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { return r } +func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error { + if family == uint16(s.family) { + return nil + } + if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { + v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption) + if err != nil { + return syserr.TranslateNetstackError(err) + } + if !v { + return nil + } + } + return syserr.ErrInvalidArgument +} + +// mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the +// receiver's family is AF_INET6. +// +// This is a hack to work around the fact that both IPv4 and IPv6 ANY are +// represented by the empty string. +// +// TODO(gvisor.dev/issues/1556): remove this function. +func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { + if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { + addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" + } + return addr +} + // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { - addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */) + addr, family, err := AddressAndFamily(sockaddr) if err != nil { return err } @@ -653,6 +676,12 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } return syserr.TranslateNetstackError(err) } + + if err := s.checkFamily(family, false /* exact */); err != nil { + return err + } + addr = s.mapFamily(addr, family) + // Always return right away in the non-blocking case. if !blocking { return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) @@ -681,10 +710,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */) + addr, family, err := AddressAndFamily(sockaddr) if err != nil { return err } + if err := s.checkFamily(family, true /* exact */); err != nil { + return err + } + addr = s.mapFamily(addr, family) // Issue the bind request to the endpoint. return syserr.TranslateNetstackError(s.Endpoint.Bind(addr)) @@ -2080,8 +2113,8 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) case linux.AF_INET6: var out linux.SockAddrInet6 - if len(addr.Addr) == 4 { - // Copy address is v4-mapped format. + if len(addr.Addr) == header.IPv4AddressSize { + // Copy address in v4-mapped format. copy(out.Addr[12:], addr.Addr) out.Addr[10] = 0xff out.Addr[11] = 0xff @@ -2395,10 +2428,14 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] var addr *tcpip.FullAddress if len(to) > 0 { - addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */) + addrBuf, family, err := AddressAndFamily(to) if err != nil { return 0, err } + if err := s.checkFamily(family, false /* exact */); err != nil { + return 0, err + } + addrBuf = s.mapFamily(addrBuf, family) addr = &addrBuf } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 91effe89a..7f49ba864 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -116,13 +116,16 @@ func (s *SocketOperations) Endpoint() transport.Endpoint { // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { - addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */) + addr, family, err := netstack.AddressAndFamily(sockaddr) if err != nil { if err == syserr.ErrAddressFamilyNotSupported { err = syserr.ErrInvalidArgument } return "", err } + if family != linux.AF_UNIX { + return "", syserr.ErrInvalidArgument + } // The address is trimmed by GetAddress. p := string(addr.Addr) diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 51f2efb39..b6d7177f4 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -341,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: - fa, _, err := netstack.AddressAndFamily(int(family), b, true /* strict */) + fa, _, err := netstack.AddressAndFamily(b) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index a47ceba54..113b457fb 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -547,6 +547,49 @@ type TransportEndpointInfo struct { RegisterNICID tcpip.NICID } +// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address +// and returns the network protocol number to be used to communicate with the +// specified address. It returns an error if the passed address is incompatible +// with the receiver. +func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) { + netProto := e.NetProto + switch len(addr.Addr) { + case header.IPv4AddressSize: + netProto = header.IPv4ProtocolNumber + case header.IPv6AddressSize: + if header.IsV4MappedAddress(addr.Addr) { + netProto = header.IPv4ProtocolNumber + addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] + if addr.Addr == header.IPv4Any { + addr.Addr = "" + } + } + } + + switch len(e.ID.LocalAddress) { + case header.IPv4AddressSize: + if len(addr.Addr) == header.IPv6AddressSize { + return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState + } + case header.IPv6AddressSize: + if len(addr.Addr) == header.IPv4AddressSize { + return tcpip.FullAddress{}, 0, tcpip.ErrNetworkUnreachable + } + } + + switch { + case netProto == e.NetProto: + case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber: + if v6only { + return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute + } + default: + return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState + } + + return addr, netProto, nil +} + // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo // marker interface. func (*TransportEndpointInfo) IsEndpointInfo() {} diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 330786f4c..42afb3f5b 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -288,7 +288,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c toCopy := *to to = &toCopy - netProto, err := e.checkV4Mapped(to, true) + netProto, err := e.checkV4Mapped(to) if err != nil { return 0, nil, err } @@ -475,18 +475,12 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err }) } -func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { - netProto := e.NetProto - if header.IsV4MappedAddress(addr.Addr) { - return 0, tcpip.ErrNoRoute - } - - // Fail if we're bound to an address length different from the one we're - // checking. - if l := len(e.ID.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) { - return 0, tcpip.ErrInvalidEndpointState +func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) { + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */) + if err != nil { + return 0, err } - + *addr = unwrapped return netProto, nil } @@ -518,7 +512,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - netProto, err := e.checkV4Mapped(&addr, false) + netProto, err := e.checkV4Mapped(&addr) if err != nil { return err } @@ -631,7 +625,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - netProto, err := e.checkV4Mapped(&addr, false) + netProto, err := e.checkV4Mapped(&addr) if err != nil { return err } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index cca511fb9..cc8b533c8 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1691,26 +1691,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) { - netProto := e.NetProto - if header.IsV4MappedAddress(addr.Addr) { - // Fail if using a v4 mapped address on a v6only endpoint. - if e.v6only { - return 0, tcpip.ErrNoRoute - } - - netProto = header.IPv4ProtocolNumber - addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] - if addr.Addr == header.IPv4Any { - addr.Addr = "" - } - } - - // Fail if we're bound to an address length different from the one we're - // checking. - if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) { - return 0, tcpip.ErrInvalidEndpointState + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only) + if err != nil { + return 0, err } - + *addr = unwrapped return netProto, nil } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index a4ff29a7d..13446f5d9 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -402,7 +402,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return 0, nil, tcpip.ErrBroadcastDisabled } - netProto, err := e.checkV4Mapped(to, false) + netProto, err := e.checkV4Mapped(to) if err != nil { return 0, nil, err } @@ -501,7 +501,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { defer e.mu.Unlock() fa := tcpip.FullAddress{Addr: v.InterfaceAddr} - netProto, err := e.checkV4Mapped(&fa, false) + netProto, err := e.checkV4Mapped(&fa) if err != nil { return err } @@ -839,35 +839,12 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u return nil } -func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { - netProto := e.NetProto - if len(addr.Addr) == 0 { - return netProto, nil - } - if header.IsV4MappedAddress(addr.Addr) { - // Fail if using a v4 mapped address on a v6only endpoint. - if e.v6only { - return 0, tcpip.ErrNoRoute - } - - netProto = header.IPv4ProtocolNumber - addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] - if addr.Addr == header.IPv4Any { - addr.Addr = "" - } - - // Fail if we are bound to an IPv6 address. - if !allowMismatch && len(e.ID.LocalAddress) == 16 { - return 0, tcpip.ErrNetworkUnreachable - } - } - - // Fail if we're bound to an address length different from the one we're - // checking. - if l := len(e.ID.LocalAddress); l != 0 && l != len(addr.Addr) { - return 0, tcpip.ErrInvalidEndpointState +func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) { + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only) + if err != nil { + return 0, err } - + *addr = unwrapped return netProto, nil } @@ -916,7 +893,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { - netProto, err := e.checkV4Mapped(&addr, false) + netProto, err := e.checkV4Mapped(&addr) if err != nil { return err } @@ -1074,7 +1051,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - netProto, err := e.checkV4Mapped(&addr, true) + netProto, err := e.checkV4Mapped(&addr) if err != nil { return err } diff --git a/scripts/common.sh b/scripts/common.sh index 6dabad141..fdb1aa142 100755 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -73,7 +73,7 @@ function install_runsc() { sudo "${RUNSC_BIN}" install --experimental=true --runtime="${runtime}" -- --debug-log "${RUNSC_LOGS}" "$@" # Clear old logs files that may exist. - sudo rm -f "${RUNSC_LOGS_DIR}"/* + sudo rm -f "${RUNSC_LOGS_DIR}"/'*' # Restart docker to pick up the new runtime configuration. sudo systemctl restart docker diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ce8abe217..4c7ec3f06 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2693,6 +2693,7 @@ cc_binary( srcs = ["socket_inet_loopback.cc"], linkstatic = 1, deps = [ + ":ip_socket_test_util", ":socket_test_util", "//test/util:file_descriptor", "//test/util:posix_error", diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 138024d9e..5d114d460 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -32,6 +32,7 @@ #include "absl/strings/str_cat.h" #include "absl/time/clock.h" #include "absl/time/time.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" @@ -102,6 +103,161 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) { SyscallFailsWithErrno(EAFNOSUPPORT)); } +enum class Operation { + Bind, + Connect, + SendTo, +}; + +std::string OperationToString(Operation operation) { + switch (operation) { + case Operation::Bind: + return "Bind"; + case Operation::Connect: + return "Connect"; + case Operation::SendTo: + return "SendTo"; + } +} + +using OperationSequence = std::vector; + +using DualStackSocketTest = + ::testing::TestWithParam>; + +TEST_P(DualStackSocketTest, AddressOperations) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_DGRAM, 0)); + + const TestAddress& addr = std::get<0>(GetParam()); + const OperationSequence& operations = std::get<1>(GetParam()); + + auto addr_in = reinterpret_cast(&addr.addr); + + // sockets may only be bound once. Both `connect` and `sendto` cause a socket + // to be bound. + bool bound = false; + for (const Operation& operation : operations) { + bool sockname = false; + bool peername = false; + switch (operation) { + case Operation::Bind: { + ASSERT_NO_ERRNO(SetAddrPort( + addr.family(), const_cast(&addr.addr), 0)); + + int bind_ret = bind(fd.get(), addr_in, addr.addr_len); + + // Dual stack sockets may only be bound to AF_INET6. + if (!bound && addr.family() == AF_INET6) { + EXPECT_THAT(bind_ret, SyscallSucceeds()); + bound = true; + + sockname = true; + } else { + EXPECT_THAT(bind_ret, SyscallFailsWithErrno(EINVAL)); + } + break; + } + case Operation::Connect: { + ASSERT_NO_ERRNO(SetAddrPort( + addr.family(), const_cast(&addr.addr), 1337)); + + EXPECT_THAT(connect(fd.get(), addr_in, addr.addr_len), + SyscallSucceeds()) + << GetAddrStr(addr_in); + bound = true; + + sockname = true; + peername = true; + + break; + } + case Operation::SendTo: { + const char payload[] = "hello"; + ASSERT_NO_ERRNO(SetAddrPort( + addr.family(), const_cast(&addr.addr), 1337)); + + ssize_t sendto_ret = sendto(fd.get(), &payload, sizeof(payload), 0, + addr_in, addr.addr_len); + + EXPECT_THAT(sendto_ret, SyscallSucceedsWithValue(sizeof(payload))); + sockname = !bound; + bound = true; + break; + } + } + + if (sockname) { + sockaddr_storage sock_addr; + socklen_t addrlen = sizeof(sock_addr); + ASSERT_THAT(getsockname(fd.get(), reinterpret_cast(&sock_addr), + &addrlen), + SyscallSucceeds()); + ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6)); + + auto sock_addr_in6 = reinterpret_cast(&sock_addr); + + if (operation == Operation::SendTo) { + EXPECT_EQ(sock_addr_in6->sin6_family, AF_INET6); + EXPECT_TRUE(IN6_IS_ADDR_UNSPECIFIED(sock_addr_in6->sin6_addr.s6_addr32)) + << OperationToString(operation) << " getsocknam=" + << GetAddrStr(reinterpret_cast(&sock_addr)); + + EXPECT_NE(sock_addr_in6->sin6_port, 0); + } else if (IN6_IS_ADDR_V4MAPPED( + reinterpret_cast(addr_in) + ->sin6_addr.s6_addr32)) { + EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(sock_addr_in6->sin6_addr.s6_addr32)) + << OperationToString(operation) << " getsocknam=" + << GetAddrStr(reinterpret_cast(&sock_addr)); + } + } + + if (peername) { + sockaddr_storage peer_addr; + socklen_t addrlen = sizeof(peer_addr); + ASSERT_THAT(getpeername(fd.get(), reinterpret_cast(&peer_addr), + &addrlen), + SyscallSucceeds()); + ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6)); + + if (addr.family() == AF_INET || + IN6_IS_ADDR_V4MAPPED(reinterpret_cast(addr_in) + ->sin6_addr.s6_addr32)) { + EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED( + reinterpret_cast(&peer_addr) + ->sin6_addr.s6_addr32)) + << OperationToString(operation) << " getpeername=" + << GetAddrStr(reinterpret_cast(&peer_addr)); + } + } + } +} + +// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny. +INSTANTIATE_TEST_SUITE_P( + All, DualStackSocketTest, + ::testing::Combine( + ::testing::Values(V4Any(), V4Loopback(), /*V4MappedAny(),*/ + V4MappedLoopback(), V6Any(), V6Loopback()), + ::testing::ValuesIn( + {{Operation::Bind, Operation::Connect, Operation::SendTo}, + {Operation::Bind, Operation::SendTo, Operation::Connect}, + {Operation::Connect, Operation::Bind, Operation::SendTo}, + {Operation::Connect, Operation::SendTo, Operation::Bind}, + {Operation::SendTo, Operation::Bind, Operation::Connect}, + {Operation::SendTo, Operation::Connect, Operation::Bind}})), + [](::testing::TestParamInfo< + std::tuple> const& info) { + const TestAddress& addr = std::get<0>(info.param); + const OperationSequence& operations = std::get<1>(info.param); + std::string s = addr.description; + for (const Operation& operation : operations) { + absl::StrAppend(&s, OperationToString(operation)); + } + return s; + }); + void tcpSimpleConnectTest(TestAddress const& listener, TestAddress const& connector, bool unbound) { // Create the listening socket. -- cgit v1.2.3 From 50625cee59aaff834c7968771ab385ad0e7b0e1f Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Tue, 14 Jan 2020 13:31:52 -0800 Subject: Implement {g,s}etsockopt(IP_RECVTOS) for UDP sockets PiperOrigin-RevId: 289718534 --- pkg/sentry/socket/control/control.go | 2 +- pkg/sentry/socket/netstack/netstack.go | 36 +++++++++++++-- pkg/tcpip/checker/checker.go | 16 +++++++ pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/tcpip.go | 8 +++- pkg/tcpip/transport/udp/endpoint.go | 40 +++++++++++++++-- pkg/tcpip/transport/udp/udp_test.go | 67 ++++++++++++++++++++++++---- test/syscalls/linux/socket_ip_udp_generic.cc | 40 +++++++++++++++++ test/syscalls/linux/udp_socket_test_cases.cc | 8 ++-- 10 files changed, 197 insertions(+), 24 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 4301b697c..1684dfc24 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { } // PackTOS packs an IP_TOS socket control message. -func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte { +func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index c020c11cb..d2f7e987d 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -1268,11 +1268,11 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if err != nil { return nil, syserr.TranslateNetstackError(err) } - var o uint32 + var o int32 if v { o = 1 } - return int32(o), nil + return o, nil case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) @@ -1377,6 +1377,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in } return int32(v), nil + case linux.IP_RECVTOS: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + var o int32 + if v { + o = 1 + } + return o, nil + default: emitUnimplementedEventIP(t, name) } @@ -1895,6 +1910,13 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v))) + case linux.IP_RECVTOS: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0)) + case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, @@ -1915,7 +1937,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_RECVFRAGSIZE, linux.IP_RECVOPTS, linux.IP_RECVORIGDSTADDR, - linux.IP_RECVTOS, linux.IP_RECVTTL, linux.IP_RETOPTS, linux.IP_TRANSPARENT, @@ -2335,7 +2356,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe } func (s *SocketOperations) controlMessages() socket.ControlMessages { - return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}} + return socket.ControlMessages{ + IP: tcpip.ControlMessages{ + HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, + Timestamp: s.readCM.Timestamp, + HasTOS: s.readCM.HasTOS, + TOS: s.readCM.TOS, + }, + } } // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 2f15bf1f1..542abc99d 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network) // TransportChecker is a function to check a property of a transport packet. type TransportChecker func(*testing.T, header.Transport) +// ControlMessagesChecker is a function to check a property of ancillary data. +type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages) + // IPv4 checks the validity and properties of the given IPv4 packet. It is // expected to be used in conjunction with other network checkers for specific // properties. For example, to check the source and destination address, one @@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker { } } +// ReceiveTOS creates a checker that checks the TOS field in ControlMessages. +func ReceiveTOS(want uint8) ControlMessagesChecker { + return func(t *testing.T, cm tcpip.ControlMessages) { + t.Helper() + if !cm.HasTOS { + t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want) + } + if got := cm.TOS; got != want { + t.Fatalf("got cm.TOS = %d, want %d", got, want) + } + } +} + // TOS creates a checker that checks the TOS field. func TOS(tos uint8, label uint32) NetworkChecker { return func(t *testing.T, h []header.Network) { diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index abf73fe33..071221d5a 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -763,7 +763,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) { n.mu.Unlock() } -// Subnets returns the Subnets associated with this NIC. +// AddressRanges returns the Subnets associated with this NIC. func (n *NIC) AddressRanges() []tcpip.Subnet { n.mu.RLock() defer n.mu.RUnlock() diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index f8d89248e..386eb6eec 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -912,7 +912,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool { return false } -// NICSubnets returns a map of NICIDs to their associated subnets. +// NICAddressRanges returns a map of NICIDs to their associated subnets. func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet { s.mu.RLock() defer s.mu.RUnlock() diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 4a090ac86..b7813cbc0 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -322,7 +322,7 @@ type ControlMessages struct { HasTOS bool // TOS is the IPv4 type of service of the associated packet. - TOS int8 + TOS uint8 // HasTClass indicates whether Tclass is valid/set. HasTClass bool @@ -500,9 +500,13 @@ type WriteOptions struct { type SockOptBool int const ( + // ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS + // ancillary message is passed with incoming packets. + ReceiveTOSOption SockOptBool = iota + // V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6 // socket is to be restricted to sending and receiving IPv6 packets only. - V6OnlyOption SockOptBool = iota + V6OnlyOption ) // SockOptInt represents socket options which values have the int type. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 13446f5d9..c9cbed8f4 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -31,6 +31,7 @@ type udpPacket struct { senderAddress tcpip.FullAddress data buffer.VectorisedView `state:".(buffer.VectorisedView)"` timestamp int64 + tos uint8 } // EndpointState represents the state of a UDP endpoint. @@ -113,6 +114,10 @@ type endpoint struct { // applied while sending packets. Defaults to 0 as on Linux. sendTOS uint8 + // receiveTOS determines if the incoming IPv4 TOS header field is passed + // as ancillary data to ControlMessages on Read. + receiveTOS bool + // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags @@ -243,7 +248,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess *addr = p.senderAddress } - return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil + cm := tcpip.ControlMessages{ + HasTimestamp: true, + Timestamp: p.timestamp, + } + e.mu.RLock() + receiveTOS := e.receiveTOS + e.mu.RUnlock() + if receiveTOS { + cm.HasTOS = true + cm.TOS = p.tos + } + return p.data.ToView(), cm, nil } // prepareForWrite prepares the endpoint for sending data. In particular, it @@ -458,6 +474,12 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { switch opt { + case tcpip.ReceiveTOSOption: + e.mu.Lock() + e.receiveTOS = v + e.mu.Unlock() + return nil + case tcpip.V6OnlyOption: // We only recognize this option on v6 endpoints. if e.NetProto != header.IPv6ProtocolNumber { @@ -664,15 +686,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { switch opt { + case tcpip.ReceiveTOSOption: + e.mu.RLock() + v := e.receiveTOS + e.mu.RUnlock() + return v, nil + case tcpip.V6OnlyOption: // We only recognize this option on v6 endpoints. if e.NetProto != header.IPv6ProtocolNumber { return false, tcpip.ErrUnknownProtocolOption } - e.mu.Lock() + e.mu.RLock() v := e.v6only - e.mu.Unlock() + e.mu.RUnlock() return v, nil } @@ -1215,6 +1243,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk e.rcvList.PushBack(packet) e.rcvBufSize += pkt.Data.Size() + // Save any useful information from the network header to the packet. + switch r.NetProto { + case header.IPv4ProtocolNumber: + packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS() + } + packet.timestamp = e.stack.NowNanoseconds() e.rcvMu.Unlock() diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 0a82bc4fa..ee9d10555 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -56,6 +56,7 @@ const ( multicastAddr = "\xe8\x2b\xd3\xea" multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" broadcastAddr = header.IPv4Broadcast + testTOS = 0x80 // defaultMTU is the MTU, in bytes, used throughout the tests, except // where another value is explicitly used. It is chosen to match the MTU @@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool ip := header.IPv4(buf) ip.Encode(&header.IPv4Fields{ IHL: header.IPv4MinimumSize, + TOS: testTOS, TotalLength: uint16(len(buf)), TTL: 65, Protocol: uint8(udp.ProtocolNumber), @@ -552,8 +554,8 @@ func TestBindToDeviceOption(t *testing.T) { // testReadInternal sends a packet of the given test flow into the stack by // injecting it into the link endpoint. It then attempts to read it from the // UDP endpoint and depending on if this was expected to succeed verifies its -// correctness. -func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) { +// correctness including any additional checker functions provided. +func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) { c.t.Helper() payload := newPayload() @@ -568,12 +570,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone() var addr tcpip.FullAddress - v, _, err := c.ep.Read(&addr) + v, cm, err := c.ep.Read(&addr) if err == tcpip.ErrWouldBlock { // Wait for data to become available. select { case <-ch: - v, _, err = c.ep.Read(&addr) + v, cm, err = c.ep.Read(&addr) case <-time.After(300 * time.Millisecond): if packetShouldBeDropped { @@ -606,15 +608,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe if !bytes.Equal(payload, v) { c.t.Fatalf("bad payload: got %x, want %x", v, payload) } + + // Run any checkers against the ControlMessages. + for _, f := range checkers { + f(c.t, cm) + } + c.checkEndpointReadStats(1, epstats, err) } // testRead sends a packet of the given test flow into the stack by injecting it // into the link endpoint. It then reads it from the UDP endpoint and verifies -// its correctness. -func testRead(c *testContext, flow testFlow) { +// its correctness including any additional checker functions provided. +func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) { c.t.Helper() - testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */) + testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...) } // testFailingRead sends a packet of the given test flow into the stack by @@ -1282,7 +1290,7 @@ func TestTOSV4(t *testing.T) { c.createEndpointForFlow(flow) - const tos = 0xC0 + const tos = testTOS var v tcpip.IPv4TOSOption if err := c.ep.GetSockOpt(&v); err != nil { c.t.Errorf("GetSockopt failed: %s", err) @@ -1317,7 +1325,7 @@ func TestTOSV6(t *testing.T) { c.createEndpointForFlow(flow) - const tos = 0xC0 + const tos = testTOS var v tcpip.IPv6TrafficClassOption if err := c.ep.GetSockOpt(&v); err != nil { c.t.Errorf("GetSockopt failed: %s", err) @@ -1344,6 +1352,47 @@ func TestTOSV6(t *testing.T) { } } +func TestReceiveTOSV4(t *testing.T) { + for _, flow := range []testFlow{unicastV4, broadcast} { + t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpointForFlow(flow) + + // Verify that setting and reading the option works. + v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption) + if err != nil { + c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err) + } + // Test for expected default value. + if v != false { + c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false) + } + + want := true + if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil { + c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err) + } + + got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption) + if err != nil { + c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err) + } + if got != want { + c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want) + } + + // Verify that the correct received TOS is handed through as + // ancillary data to the ControlMessages struct. + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { + c.t.Fatal("Bind failed:", err) + } + testRead(c, flow, checker.ReceiveTOS(testTOS)) + }) + } +} + func TestMulticastInterfaceOption(t *testing.T) { for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} { t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index 66eb68857..53290bed7 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) { EXPECT_EQ(get, kSockOptOn); } +// Ensure that Receiving TOS is off by default. +TEST_P(UDPSocketPairTest, RecvTosDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +// Test that setting and getting IP_RECVTOS works as expected. +TEST_P(UDPSocketPairTest, SetRecvTos) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + TEST_P(UDPSocketPairTest, ReuseAddrDefault) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc index dc35c2f50..68e0a8109 100644 --- a/test/syscalls/linux/udp_socket_test_cases.cc +++ b/test/syscalls/linux/udp_socket_test_cases.cc @@ -1349,8 +1349,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) { // outgoing packets, and that a receiving socket with IP_RECVTOS or // IPV6_RECVTCLASS will create the corresponding control message. TEST_P(UdpSocketTest, SetAndReceiveTOS) { - // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack. - SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet()); + // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack. + SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() && + !IsRunningWithHostinet()); ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); @@ -1421,7 +1422,8 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) { // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or // IPV6_RECVTCLASS will create the corresponding control message. TEST_P(UdpSocketTest, SendAndReceiveTOS) { - // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack. + // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack. + // TODO(b/146661005): Setting TOS via cmsg not supported for netstack. SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet()); ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); -- cgit v1.2.3 From ff78a721700f8b7d3c8dae14fc14c04f3a82b970 Mon Sep 17 00:00:00 2001 From: lubinszARM <34124929+lubinszARM@users.noreply.github.com> Date: Tue, 14 Jan 2020 22:22:45 -0800 Subject: enable pkg/sentry/arch to support arm64 basically Signed-off-by: Bin Lu Change-Id: I9cce23db4e5caec82ce42b4970fdb7f7e8c08f1d COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/773 from lubinszARM:pr_arch_basic 3fe2fd8e6286766bbe489ef971dce204f924feba PiperOrigin-RevId: 289795569 --- pkg/sentry/arch/BUILD | 6 + pkg/sentry/arch/arch_aarch64.go | 293 ++++++++++++++++++++++++++++++++++ pkg/sentry/arch/arch_arm64.go | 266 ++++++++++++++++++++++++++++++ pkg/sentry/arch/arch_state_aarch64.go | 38 +++++ pkg/sentry/arch/arch_state_x86.go | 2 + pkg/sentry/arch/registers.proto | 37 +++++ pkg/sentry/arch/signal.go | 250 +++++++++++++++++++++++++++++ pkg/sentry/arch/signal_amd64.go | 230 -------------------------- pkg/sentry/arch/signal_arm64.go | 126 +++++++++++++++ pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/syscalls_arm64.go | 62 +++++++ 11 files changed, 1081 insertions(+), 231 deletions(-) create mode 100644 pkg/sentry/arch/arch_aarch64.go create mode 100644 pkg/sentry/arch/arch_arm64.go create mode 100644 pkg/sentry/arch/arch_state_aarch64.go create mode 100644 pkg/sentry/arch/signal.go create mode 100644 pkg/sentry/arch/signal_arm64.go create mode 100644 pkg/sentry/arch/syscalls_arm64.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index ae3e364cd..65f22af2b 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -9,17 +9,23 @@ go_library( srcs = [ "aligned.go", "arch.go", + "arch_aarch64.go", "arch_amd64.go", "arch_amd64.s", + "arch_arm64.go", + "arch_state_aarch64.go", "arch_state_x86.go", "arch_x86.go", "auxv.go", + "signal.go", "signal_act.go", "signal_amd64.go", + "signal_arm64.go", "signal_info.go", "signal_stack.go", "stack.go", "syscalls_amd64.go", + "syscalls_arm64.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/arch", visibility = ["//:sandbox"], diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go new file mode 100644 index 000000000..ea4dedbdf --- /dev/null +++ b/pkg/sentry/arch/arch_aarch64.go @@ -0,0 +1,293 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package arch + +import ( + "fmt" + "io" + "syscall" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + // SyscallWidth is the width of insturctions. + SyscallWidth = 4 +) + +// aarch64FPState is aarch64 floating point state. +type aarch64FPState []byte + +// initAarch64FPState (defined in asm files) sets up initial state. +func initAarch64FPState(data *FloatingPointData) { + // TODO(gvisor.dev/issue/1238): floating-point is not supported. +} + +func newAarch64FPStateSlice() []byte { + return alignedBytes(4096, 32)[:4096] +} + +// newAarch64FPState returns an initialized floating point state. +// +// The returned state is large enough to store all floating point state +// supported by host, even if the app won't use much of it due to a restricted +// FeatureSet. Since they may still be able to see state not advertised by +// CPUID we must ensure it does not contain any sentry state. +func newAarch64FPState() aarch64FPState { + f := aarch64FPState(newAarch64FPStateSlice()) + initAarch64FPState(f.FloatingPointData()) + return f +} + +// fork creates and returns an identical copy of the aarch64 floating point state. +func (f aarch64FPState) fork() aarch64FPState { + n := aarch64FPState(newAarch64FPStateSlice()) + copy(n, f) + return n +} + +// FloatingPointData returns the raw data pointer. +func (f aarch64FPState) FloatingPointData() *FloatingPointData { + return (*FloatingPointData)(&f[0]) +} + +// NewFloatingPointData returns a new floating point data blob. +// +// This is primarily for use in tests. +func NewFloatingPointData() *FloatingPointData { + return (*FloatingPointData)(&(newAarch64FPState()[0])) +} + +// State contains the common architecture bits for aarch64 (the build tag of this +// file ensures it's only built on aarch64). +type State struct { + // The system registers. + Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` + + // Our floating point state. + aarch64FPState `state:"wait"` + + // FeatureSet is a pointer to the currently active feature set. + FeatureSet *cpuid.FeatureSet +} + +// Proto returns a protobuf representation of the system registers in State. +func (s State) Proto() *rpb.Registers { + regs := &rpb.ARM64Registers{ + R0: s.Regs.Regs[0], + R1: s.Regs.Regs[1], + R2: s.Regs.Regs[2], + R3: s.Regs.Regs[3], + R4: s.Regs.Regs[4], + R5: s.Regs.Regs[5], + R6: s.Regs.Regs[6], + R7: s.Regs.Regs[7], + R8: s.Regs.Regs[8], + R9: s.Regs.Regs[9], + R10: s.Regs.Regs[10], + R11: s.Regs.Regs[11], + R12: s.Regs.Regs[12], + R13: s.Regs.Regs[13], + R14: s.Regs.Regs[14], + R15: s.Regs.Regs[15], + R16: s.Regs.Regs[16], + R17: s.Regs.Regs[17], + R18: s.Regs.Regs[18], + R19: s.Regs.Regs[19], + R20: s.Regs.Regs[20], + R21: s.Regs.Regs[21], + R22: s.Regs.Regs[22], + R23: s.Regs.Regs[23], + R24: s.Regs.Regs[24], + R25: s.Regs.Regs[25], + R26: s.Regs.Regs[26], + R27: s.Regs.Regs[27], + R28: s.Regs.Regs[28], + R29: s.Regs.Regs[29], + R30: s.Regs.Regs[30], + Sp: s.Regs.Sp, + Pc: s.Regs.Pc, + Pstate: s.Regs.Pstate, + } + return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}} +} + +// Fork creates and returns an identical copy of the state. +func (s *State) Fork() State { + // TODO(gvisor.dev/issue/1238): floating-point is not supported. + return State{ + Regs: s.Regs, + FeatureSet: s.FeatureSet, + } +} + +// StateData implements Context.StateData. +func (s *State) StateData() *State { + return s +} + +// CPUIDEmulate emulates a cpuid instruction. +func (s *State) CPUIDEmulate(l log.Logger) { + // TODO(gvisor.dev/issue/1255): cpuid is not supported. +} + +// SingleStep implements Context.SingleStep. +func (s *State) SingleStep() bool { + return false +} + +// SetSingleStep enables single stepping. +func (s *State) SetSingleStep() { + // Set the trap flag. + // TODO(gvisor.dev/issue/1239): ptrace single-step is not supported. +} + +// ClearSingleStep enables single stepping. +func (s *State) ClearSingleStep() { + // Clear the trap flag. + // TODO(gvisor.dev/issue/1239): ptrace single-step is not supported. +} + +// RegisterMap returns a map of all registers. +func (s *State) RegisterMap() (map[string]uintptr, error) { + return map[string]uintptr{ + "R0": uintptr(s.Regs.Regs[0]), + "R1": uintptr(s.Regs.Regs[1]), + "R2": uintptr(s.Regs.Regs[2]), + "R3": uintptr(s.Regs.Regs[3]), + "R4": uintptr(s.Regs.Regs[4]), + "R5": uintptr(s.Regs.Regs[5]), + "R6": uintptr(s.Regs.Regs[6]), + "R7": uintptr(s.Regs.Regs[7]), + "R8": uintptr(s.Regs.Regs[8]), + "R9": uintptr(s.Regs.Regs[9]), + "R10": uintptr(s.Regs.Regs[10]), + "R11": uintptr(s.Regs.Regs[11]), + "R12": uintptr(s.Regs.Regs[12]), + "R13": uintptr(s.Regs.Regs[13]), + "R14": uintptr(s.Regs.Regs[14]), + "R15": uintptr(s.Regs.Regs[15]), + "R16": uintptr(s.Regs.Regs[16]), + "R17": uintptr(s.Regs.Regs[17]), + "R18": uintptr(s.Regs.Regs[18]), + "R19": uintptr(s.Regs.Regs[19]), + "R20": uintptr(s.Regs.Regs[20]), + "R21": uintptr(s.Regs.Regs[21]), + "R22": uintptr(s.Regs.Regs[22]), + "R23": uintptr(s.Regs.Regs[23]), + "R24": uintptr(s.Regs.Regs[24]), + "R25": uintptr(s.Regs.Regs[25]), + "R26": uintptr(s.Regs.Regs[26]), + "R27": uintptr(s.Regs.Regs[27]), + "R28": uintptr(s.Regs.Regs[28]), + "R29": uintptr(s.Regs.Regs[29]), + "R30": uintptr(s.Regs.Regs[30]), + "Sp": uintptr(s.Regs.Sp), + "Pc": uintptr(s.Regs.Pc), + "Pstate": uintptr(s.Regs.Pstate), + }, nil +} + +// PtraceGetRegs implements Context.PtraceGetRegs. +func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { + return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs())) +} + +func (s *State) ptraceGetRegs() syscall.PtraceRegs { + return s.Regs +} + +var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{})) + +// PtraceSetRegs implements Context.PtraceSetRegs. +func (s *State) PtraceSetRegs(src io.Reader) (int, error) { + var regs syscall.PtraceRegs + buf := make([]byte, ptraceRegsSize) + if _, err := io.ReadFull(src, buf); err != nil { + return 0, err + } + binary.Unmarshal(buf, usermem.ByteOrder, ®s) + s.Regs = regs + return ptraceRegsSize, nil +} + +// PtraceGetFPRegs implements Context.PtraceGetFPRegs. +func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) { + // TODO(gvisor.dev/issue/1238): floating-point is not supported. + return 0, nil +} + +// PtraceSetFPRegs implements Context.PtraceSetFPRegs. +func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) { + // TODO(gvisor.dev/issue/1238): floating-point is not supported. + return 0, nil +} + +// Register sets defined in include/uapi/linux/elf.h. +const ( + _NT_PRSTATUS = 1 + _NT_PRFPREG = 2 +) + +// PtraceGetRegSet implements Context.PtraceGetRegSet. +func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { + switch regset { + case _NT_PRSTATUS: + if maxlen < ptraceRegsSize { + return 0, syserror.EFAULT + } + return s.PtraceGetRegs(dst) + default: + return 0, syserror.EINVAL + } +} + +// PtraceSetRegSet implements Context.PtraceSetRegSet. +func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { + switch regset { + case _NT_PRSTATUS: + if maxlen < ptraceRegsSize { + return 0, syserror.EFAULT + } + return s.PtraceSetRegs(src) + default: + return 0, syserror.EINVAL + } +} + +// FullRestore indicates whether a full restore is required. +func (s *State) FullRestore() bool { + return false +} + +// New returns a new architecture context. +func New(arch Arch, fs *cpuid.FeatureSet) Context { + switch arch { + case ARM64: + return &context64{ + State{ + FeatureSet: fs, + }, + } + } + panic(fmt.Sprintf("unknown architecture %v", arch)) +} diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go new file mode 100644 index 000000000..0d5b7d317 --- /dev/null +++ b/pkg/sentry/arch/arch_arm64.go @@ -0,0 +1,266 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "fmt" + "math/rand" + "syscall" + + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// Host specifies the host architecture. +const Host = ARM64 + +// These constants come directly from Linux. +const ( + // maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux + // for a 64-bit process. + maxAddr64 usermem.Addr = (1 << 48) + + // maxStackRand64 is the maximum randomization to apply to the stack. + // It is defined by arch/arm64/mm/mmap.c:(STACK_RND_MASK << PAGE_SHIFT) in Linux. + maxStackRand64 = 0x3ffff << 12 // 16 GB + + // maxMmapRand64 is the maximum randomization to apply to the mmap + // layout. It is defined by arch/arm64/mm/mmap.c:arch_mmap_rnd in Linux. + maxMmapRand64 = (1 << 33) * usermem.PageSize + + // minGap64 is the minimum gap to leave at the top of the address space + // for the stack. It is defined by arch/arm64/mm/mmap.c:MIN_GAP in Linux. + minGap64 = (128 << 20) + maxStackRand64 + + // preferredPIELoadAddr is the standard Linux position-independent + // executable base load address. It is ELF_ET_DYN_BASE in Linux. + // + // The Platform {Min,Max}UserAddress() may preclude loading at this + // address. See other preferredFoo comments below. + preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5 +) + +// These constants are selected as heuristics to help make the Platform's +// potentially limited address space conform as closely to Linux as possible. +const ( + preferredTopDownAllocMin usermem.Addr = 0x7e8000000000 + preferredAllocationGap = 128 << 30 // 128 GB + preferredTopDownBaseMin = preferredTopDownAllocMin + preferredAllocationGap + + // minMmapRand64 is the smallest we are willing to make the + // randomization to stay above preferredTopDownBaseMin. + minMmapRand64 = (1 << 18) * usermem.PageSize +) + +// context64 represents an ARM64 context. +type context64 struct { + State +} + +// Arch implements Context.Arch. +func (c *context64) Arch() Arch { + return ARM64 +} + +// Fork returns an exact copy of this context. +func (c *context64) Fork() Context { + return &context64{ + State: c.State.Fork(), + } +} + +// General purpose registers usage on Arm64: +// R0...R7: parameter/result registers. +// R8: indirect result location register. +// R9...R15: temporary rgisters. +// R16: the first intra-procedure-call scratch register. +// R17: the second intra-procedure-call scratch register. +// R18: the platform register. +// R19...R28: callee-saved registers. +// R29: the frame pointer. +// R30: the link register. + +// Return returns the current syscall return value. +func (c *context64) Return() uintptr { + return uintptr(c.Regs.Regs[0]) +} + +// SetReturn sets the syscall return value. +func (c *context64) SetReturn(value uintptr) { + c.Regs.Regs[0] = uint64(value) +} + +// IP returns the current instruction pointer. +func (c *context64) IP() uintptr { + return uintptr(c.Regs.Pc) +} + +// SetIP sets the current instruction pointer. +func (c *context64) SetIP(value uintptr) { + c.Regs.Pc = uint64(value) +} + +// Stack returns the current stack pointer. +func (c *context64) Stack() uintptr { + return uintptr(c.Regs.Sp) +} + +// SetStack sets the current stack pointer. +func (c *context64) SetStack(value uintptr) { + c.Regs.Sp = uint64(value) +} + +// TLS returns the current TLS pointer. +func (c *context64) TLS() uintptr { + // TODO(gvisor.dev/issue/1238): TLS is not supported. + // MRS_TPIDR_EL0 + return 0 +} + +// SetTLS sets the current TLS pointer. Returns false if value is invalid. +func (c *context64) SetTLS(value uintptr) bool { + // TODO(gvisor.dev/issue/1238): TLS is not supported. + // MSR_TPIDR_EL0 + return false +} + +// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. +func (c *context64) SetRSEQInterruptedIP(value uintptr) { + c.Regs.Regs[3] = uint64(value) +} + +// Native returns the native type for the given val. +func (c *context64) Native(val uintptr) interface{} { + v := uint64(val) + return &v +} + +// Value returns the generic val for the given native type. +func (c *context64) Value(val interface{}) uintptr { + return uintptr(*val.(*uint64)) +} + +// Width returns the byte width of this architecture. +func (c *context64) Width() uint { + return 8 +} + +// FeatureSet returns the FeatureSet in use. +func (c *context64) FeatureSet() *cpuid.FeatureSet { + return c.State.FeatureSet +} + +// mmapRand returns a random adjustment for randomizing an mmap layout. +func mmapRand(max uint64) usermem.Addr { + return usermem.Addr(rand.Int63n(int64(max))).RoundDown() +} + +// NewMmapLayout implements Context.NewMmapLayout consistently with Linux. +func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) { + min, ok := min.RoundUp() + if !ok { + return MmapLayout{}, syscall.EINVAL + } + if max > maxAddr64 { + max = maxAddr64 + } + max = max.RoundDown() + + if min > max { + return MmapLayout{}, syscall.EINVAL + } + + stackSize := r.Get(limits.Stack) + + // MAX_GAP in Linux. + maxGap := (max / 6) * 5 + gap := usermem.Addr(stackSize.Cur) + if gap < minGap64 { + gap = minGap64 + } + if gap > maxGap { + gap = maxGap + } + defaultDir := MmapTopDown + if stackSize.Cur == limits.Infinity { + defaultDir = MmapBottomUp + } + + topDownMin := max - gap - maxMmapRand64 + maxRand := usermem.Addr(maxMmapRand64) + if topDownMin < preferredTopDownBaseMin { + // Try to keep TopDownBase above preferredTopDownBaseMin by + // shrinking maxRand. + maxAdjust := maxRand - minMmapRand64 + needAdjust := preferredTopDownBaseMin - topDownMin + if needAdjust <= maxAdjust { + maxRand -= needAdjust + } + } + + rnd := mmapRand(uint64(maxRand)) + l := MmapLayout{ + MinAddr: min, + MaxAddr: max, + // TASK_UNMAPPED_BASE in Linux. + BottomUpBase: (max/3 + rnd).RoundDown(), + TopDownBase: (max - gap - rnd).RoundDown(), + DefaultDirection: defaultDir, + // We may have reduced the maximum randomization to keep + // TopDownBase above preferredTopDownBaseMin while maintaining + // our stack gap. Stack allocations must use that max + // randomization to avoiding eating into the gap. + MaxStackRand: uint64(maxRand), + } + + // Final sanity check on the layout. + if !l.Valid() { + panic(fmt.Sprintf("Invalid MmapLayout: %+v", l)) + } + + return l, nil +} + +// PIELoadAddress implements Context.PIELoadAddress. +func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr { + base := preferredPIELoadAddr + max, ok := base.AddLength(maxMmapRand64) + if !ok { + panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base)) + } + + if max > l.MaxAddr { + // preferredPIELoadAddr won't fit; fall back to the standard + // Linux behavior of 2/3 of TopDownBase. TSAN won't like this. + // + // Don't bother trying to shrink the randomization for now. + base = l.TopDownBase / 3 * 2 + } + + return base + mmapRand(maxMmapRand64) +} + +// PtracePeekUser implements Context.PtracePeekUser. +func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { + // TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64. + return c.Native(0), nil +} + +// PtracePokeUser implements Context.PtracePokeUser. +func (c *context64) PtracePokeUser(addr, data uintptr) error { + // TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64. + return nil +} diff --git a/pkg/sentry/arch/arch_state_aarch64.go b/pkg/sentry/arch/arch_state_aarch64.go new file mode 100644 index 000000000..0136a85ad --- /dev/null +++ b/pkg/sentry/arch/arch_state_aarch64.go @@ -0,0 +1,38 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package arch + +import ( + "syscall" +) + +type syscallPtraceRegs struct { + Regs [31]uint64 + Sp uint64 + Pc uint64 + Pstate uint64 +} + +// saveRegs is invoked by stateify. +func (s *State) saveRegs() syscallPtraceRegs { + return syscallPtraceRegs(s.Regs) +} + +// loadRegs is invoked by stateify. +func (s *State) loadRegs(r syscallPtraceRegs) { + s.Regs = syscall.PtraceRegs(r) +} diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 9061fcc86..84f11b0d1 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 i386 + package arch import ( diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index 9dc83e241..60c027aab 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -48,8 +48,45 @@ message AMD64Registers { uint64 gs_base = 27; } +message ARM64Registers { + uint64 r0 = 1; + uint64 r1 = 2; + uint64 r2 = 3; + uint64 r3 = 4; + uint64 r4 = 5; + uint64 r5 = 6; + uint64 r6 = 7; + uint64 r7 = 8; + uint64 r8 = 9; + uint64 r9 = 10; + uint64 r10 = 11; + uint64 r11 = 12; + uint64 r12 = 13; + uint64 r13 = 14; + uint64 r14 = 15; + uint64 r15 = 16; + uint64 r16 = 17; + uint64 r17 = 18; + uint64 r18 = 19; + uint64 r19 = 20; + uint64 r20 = 21; + uint64 r21 = 22; + uint64 r22 = 23; + uint64 r23 = 24; + uint64 r24 = 25; + uint64 r25 = 26; + uint64 r26 = 27; + uint64 r27 = 28; + uint64 r28 = 29; + uint64 r29 = 30; + uint64 r30 = 31; + uint64 sp = 32; + uint64 pc = 33; + uint64 pstate = 34; +} message Registers { oneof arch { AMD64Registers amd64 = 1; + ARM64Registers arm64 = 2; } } diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go new file mode 100644 index 000000000..402e46025 --- /dev/null +++ b/pkg/sentry/arch/signal.go @@ -0,0 +1,250 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// SignalAct represents the action that should be taken when a signal is +// delivered, and is equivalent to struct sigaction. +// +// +stateify savable +type SignalAct struct { + Handler uint64 + Flags uint64 + Restorer uint64 // Only used on amd64. + Mask linux.SignalSet +} + +// SerializeFrom implements NativeSignalAct.SerializeFrom. +func (s *SignalAct) SerializeFrom(other *SignalAct) { + *s = *other +} + +// DeserializeTo implements NativeSignalAct.DeserializeTo. +func (s *SignalAct) DeserializeTo(other *SignalAct) { + *other = *s +} + +// SignalStack represents information about a user stack, and is equivalent to +// stack_t. +// +// +stateify savable +type SignalStack struct { + Addr uint64 + Flags uint32 + _ uint32 + Size uint64 +} + +// SerializeFrom implements NativeSignalStack.SerializeFrom. +func (s *SignalStack) SerializeFrom(other *SignalStack) { + *s = *other +} + +// DeserializeTo implements NativeSignalStack.DeserializeTo. +func (s *SignalStack) DeserializeTo(other *SignalStack) { + *other = *s +} + +// SignalInfo represents information about a signal being delivered, and is +// equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h). +// +// +stateify savable +type SignalInfo struct { + Signo int32 // Signal number + Errno int32 // Errno value + Code int32 // Signal code + _ uint32 + + // struct siginfo::_sifields is a union. In SignalInfo, fields in the union + // are accessed through methods. + // + // For reference, here is the definition of _sifields: (_sigfault._trapno, + // which does not exist on x86, omitted for clarity) + // + // union { + // int _pad[SI_PAD_SIZE]; + // + // /* kill() */ + // struct { + // __kernel_pid_t _pid; /* sender's pid */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // } _kill; + // + // /* POSIX.1b timers */ + // struct { + // __kernel_timer_t _tid; /* timer id */ + // int _overrun; /* overrun count */ + // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; + // sigval_t _sigval; /* same as below */ + // int _sys_private; /* not to be passed to user */ + // } _timer; + // + // /* POSIX.1b signals */ + // struct { + // __kernel_pid_t _pid; /* sender's pid */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // sigval_t _sigval; + // } _rt; + // + // /* SIGCHLD */ + // struct { + // __kernel_pid_t _pid; /* which child */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // int _status; /* exit code */ + // __ARCH_SI_CLOCK_T _utime; + // __ARCH_SI_CLOCK_T _stime; + // } _sigchld; + // + // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + // struct { + // void *_addr; /* faulting insn/memory ref. */ + // short _addr_lsb; /* LSB of the reported address */ + // } _sigfault; + // + // /* SIGPOLL */ + // struct { + // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + // int _fd; + // } _sigpoll; + // + // /* SIGSYS */ + // struct { + // void *_call_addr; /* calling user insn */ + // int _syscall; /* triggering system call number */ + // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + // } _sigsys; + // } _sifields; + // + // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 + // bytes. + Fields [128 - 16]byte +} + +// FixSignalCodeForUser fixes up si_code. +// +// The si_code we get from Linux may contain the kernel-specific code in the +// top 16 bits if it's positive (e.g., from ptrace). Linux's +// copy_siginfo_to_user does +// err |= __put_user((short)from->si_code, &to->si_code); +// to mask out those bits and we need to do the same. +func (s *SignalInfo) FixSignalCodeForUser() { + if s.Code > 0 { + s.Code &= 0x0000ffff + } +} + +// Pid returns the si_pid field. +func (s *SignalInfo) Pid() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[0:4])) +} + +// SetPid mutates the si_pid field. +func (s *SignalInfo) SetPid(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) +} + +// Uid returns the si_uid field. +func (s *SignalInfo) Uid() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) +} + +// SetUid mutates the si_uid field. +func (s *SignalInfo) SetUid(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) +} + +// Sigval returns the sigval field, which is aliased to both si_int and si_ptr. +func (s *SignalInfo) Sigval() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[8:16]) +} + +// SetSigval mutates the sigval field. +func (s *SignalInfo) SetSigval(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[8:16], val) +} + +// TimerID returns the si_timerid field. +func (s *SignalInfo) TimerID() linux.TimerID { + return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4])) +} + +// SetTimerID sets the si_timerid field. +func (s *SignalInfo) SetTimerID(val linux.TimerID) { + usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) +} + +// Overrun returns the si_overrun field. +func (s *SignalInfo) Overrun() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) +} + +// SetOverrun sets the si_overrun field. +func (s *SignalInfo) SetOverrun(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) +} + +// Addr returns the si_addr field. +func (s *SignalInfo) Addr() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[0:8]) +} + +// SetAddr sets the si_addr field. +func (s *SignalInfo) SetAddr(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[0:8], val) +} + +// Status returns the si_status field. +func (s *SignalInfo) Status() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) +} + +// SetStatus mutates the si_status field. +func (s *SignalInfo) SetStatus(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) +} + +// CallAddr returns the si_call_addr field. +func (s *SignalInfo) CallAddr() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[0:8]) +} + +// SetCallAddr mutates the si_call_addr field. +func (s *SignalInfo) SetCallAddr(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[0:8], val) +} + +// Syscall returns the si_syscall field. +func (s *SignalInfo) Syscall() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) +} + +// SetSyscall mutates the si_syscall field. +func (s *SignalInfo) SetSyscall(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) +} + +// Arch returns the si_arch field. +func (s *SignalInfo) Arch() uint32 { + return usermem.ByteOrder.Uint32(s.Fields[12:16]) +} + +// SetArch mutates the si_arch field. +func (s *SignalInfo) SetArch(val uint32) { + usermem.ByteOrder.PutUint32(s.Fields[12:16], val) +} diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index febd6f9b9..1e4f9c3c2 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -26,236 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usermem" ) -// SignalAct represents the action that should be taken when a signal is -// delivered, and is equivalent to struct sigaction on 64-bit x86. -// -// +stateify savable -type SignalAct struct { - Handler uint64 - Flags uint64 - Restorer uint64 - Mask linux.SignalSet -} - -// SerializeFrom implements NativeSignalAct.SerializeFrom. -func (s *SignalAct) SerializeFrom(other *SignalAct) { - *s = *other -} - -// DeserializeTo implements NativeSignalAct.DeserializeTo. -func (s *SignalAct) DeserializeTo(other *SignalAct) { - *other = *s -} - -// SignalStack represents information about a user stack, and is equivalent to -// stack_t on 64-bit x86. -// -// +stateify savable -type SignalStack struct { - Addr uint64 - Flags uint32 - _ uint32 - Size uint64 -} - -// SerializeFrom implements NativeSignalStack.SerializeFrom. -func (s *SignalStack) SerializeFrom(other *SignalStack) { - *s = *other -} - -// DeserializeTo implements NativeSignalStack.DeserializeTo. -func (s *SignalStack) DeserializeTo(other *SignalStack) { - *other = *s -} - -// SignalInfo represents information about a signal being delivered, and is -// equivalent to struct siginfo on 64-bit x86. -// -// +stateify savable -type SignalInfo struct { - Signo int32 // Signal number - Errno int32 // Errno value - Code int32 // Signal code - _ uint32 - - // struct siginfo::_sifields is a union. In SignalInfo, fields in the union - // are accessed through methods. - // - // For reference, here is the definition of _sifields: (_sigfault._trapno, - // which does not exist on x86, omitted for clarity) - // - // union { - // int _pad[SI_PAD_SIZE]; - // - // /* kill() */ - // struct { - // __kernel_pid_t _pid; /* sender's pid */ - // __ARCH_SI_UID_T _uid; /* sender's uid */ - // } _kill; - // - // /* POSIX.1b timers */ - // struct { - // __kernel_timer_t _tid; /* timer id */ - // int _overrun; /* overrun count */ - // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; - // sigval_t _sigval; /* same as below */ - // int _sys_private; /* not to be passed to user */ - // } _timer; - // - // /* POSIX.1b signals */ - // struct { - // __kernel_pid_t _pid; /* sender's pid */ - // __ARCH_SI_UID_T _uid; /* sender's uid */ - // sigval_t _sigval; - // } _rt; - // - // /* SIGCHLD */ - // struct { - // __kernel_pid_t _pid; /* which child */ - // __ARCH_SI_UID_T _uid; /* sender's uid */ - // int _status; /* exit code */ - // __ARCH_SI_CLOCK_T _utime; - // __ARCH_SI_CLOCK_T _stime; - // } _sigchld; - // - // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ - // struct { - // void *_addr; /* faulting insn/memory ref. */ - // short _addr_lsb; /* LSB of the reported address */ - // } _sigfault; - // - // /* SIGPOLL */ - // struct { - // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ - // int _fd; - // } _sigpoll; - // - // /* SIGSYS */ - // struct { - // void *_call_addr; /* calling user insn */ - // int _syscall; /* triggering system call number */ - // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ - // } _sigsys; - // } _sifields; - // - // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 - // bytes. - Fields [128 - 16]byte -} - -// FixSignalCodeForUser fixes up si_code. -// -// The si_code we get from Linux may contain the kernel-specific code in the -// top 16 bits if it's positive (e.g., from ptrace). Linux's -// copy_siginfo_to_user does -// err |= __put_user((short)from->si_code, &to->si_code); -// to mask out those bits and we need to do the same. -func (s *SignalInfo) FixSignalCodeForUser() { - if s.Code > 0 { - s.Code &= 0x0000ffff - } -} - -// Pid returns the si_pid field. -func (s *SignalInfo) Pid() int32 { - return int32(usermem.ByteOrder.Uint32(s.Fields[0:4])) -} - -// SetPid mutates the si_pid field. -func (s *SignalInfo) SetPid(val int32) { - usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) -} - -// Uid returns the si_uid field. -func (s *SignalInfo) Uid() int32 { - return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) -} - -// SetUid mutates the si_uid field. -func (s *SignalInfo) SetUid(val int32) { - usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) -} - -// Sigval returns the sigval field, which is aliased to both si_int and si_ptr. -func (s *SignalInfo) Sigval() uint64 { - return usermem.ByteOrder.Uint64(s.Fields[8:16]) -} - -// SetSigval mutates the sigval field. -func (s *SignalInfo) SetSigval(val uint64) { - usermem.ByteOrder.PutUint64(s.Fields[8:16], val) -} - -// TimerID returns the si_timerid field. -func (s *SignalInfo) TimerID() linux.TimerID { - return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4])) -} - -// SetTimerID sets the si_timerid field. -func (s *SignalInfo) SetTimerID(val linux.TimerID) { - usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) -} - -// Overrun returns the si_overrun field. -func (s *SignalInfo) Overrun() int32 { - return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) -} - -// SetOverrun sets the si_overrun field. -func (s *SignalInfo) SetOverrun(val int32) { - usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) -} - -// Addr returns the si_addr field. -func (s *SignalInfo) Addr() uint64 { - return usermem.ByteOrder.Uint64(s.Fields[0:8]) -} - -// SetAddr sets the si_addr field. -func (s *SignalInfo) SetAddr(val uint64) { - usermem.ByteOrder.PutUint64(s.Fields[0:8], val) -} - -// Status returns the si_status field. -func (s *SignalInfo) Status() int32 { - return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) -} - -// SetStatus mutates the si_status field. -func (s *SignalInfo) SetStatus(val int32) { - usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) -} - -// CallAddr returns the si_call_addr field. -func (s *SignalInfo) CallAddr() uint64 { - return usermem.ByteOrder.Uint64(s.Fields[0:8]) -} - -// SetCallAddr mutates the si_call_addr field. -func (s *SignalInfo) SetCallAddr(val uint64) { - usermem.ByteOrder.PutUint64(s.Fields[0:8], val) -} - -// Syscall returns the si_syscall field. -func (s *SignalInfo) Syscall() int32 { - return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) -} - -// SetSyscall mutates the si_syscall field. -func (s *SignalInfo) SetSyscall(val int32) { - usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) -} - -// Arch returns the si_arch field. -func (s *SignalInfo) Arch() uint32 { - return usermem.ByteOrder.Uint32(s.Fields[12:16]) -} - -// SetArch mutates the si_arch field. -func (s *SignalInfo) SetArch(val uint32) { - usermem.ByteOrder.PutUint32(s.Fields[12:16], val) -} - // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). type SignalContext64 struct { diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go new file mode 100644 index 000000000..7d0e98935 --- /dev/null +++ b/pkg/sentry/arch/signal_arm64.go @@ -0,0 +1,126 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "encoding/binary" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// SignalContext64 is equivalent to struct sigcontext, the type passed as the +// second argument to signal handlers set by signal(2). +type SignalContext64 struct { + FaultAddr uint64 + Regs [31]uint64 + Sp uint64 + Pc uint64 + Pstate uint64 + _pad [8]byte // __attribute__((__aligned__(16))) + Reserved [4096]uint8 +} + +// UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h). +type UContext64 struct { + Flags uint64 + Link *UContext64 + Stack SignalStack + Sigset linux.SignalSet + // glibc uses a 1024-bit sigset_t + _pad [(1024 - 64) / 8]byte + // sigcontext must be aligned to 16-byte + _pad2 [8]byte + // last for future expansion + MContext SignalContext64 +} + +// NewSignalAct implements Context.NewSignalAct. +func (c *context64) NewSignalAct() NativeSignalAct { + return &SignalAct{} +} + +// NewSignalStack implements Context.NewSignalStack. +func (c *context64) NewSignalStack() NativeSignalStack { + return &SignalStack{} +} + +// SignalSetup implements Context.SignalSetup. +func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error { + sp := st.Bottom + + if !(alt.IsEnabled() && sp == alt.Top()) { + sp -= 128 + } + + // Construct the UContext64 now since we need its size. + uc := &UContext64{ + Flags: 0, + Stack: *alt, + MContext: SignalContext64{ + Regs: c.Regs.Regs, + Sp: c.Regs.Sp, + Pc: c.Regs.Pc, + Pstate: c.Regs.Pstate, + }, + Sigset: sigset, + } + + ucSize := binary.Size(uc) + if ucSize < 0 { + panic("can't get size of UContext64") + } + // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. + frameSize := int(st.Arch.Width()) + ucSize + 128 + frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8 + sp = frameBottom + usermem.Addr(frameSize) + st.Bottom = sp + + // Prior to proceeding, figure out if the frame will exhaust the range + // for the signal stack. This is not allowed, and should immediately + // force signal delivery (reverting to the default handler). + if act.IsOnStack() && alt.IsEnabled() && !alt.Contains(frameBottom) { + return syscall.EFAULT + } + + // Adjust the code. + info.FixSignalCodeForUser() + + // Set up the stack frame. + infoAddr, err := st.Push(info) + if err != nil { + return err + } + ucAddr, err := st.Push(uc) + if err != nil { + return err + } + + // Set up registers. + c.Regs.Sp = uint64(st.Bottom) + c.Regs.Pc = act.Handler + c.Regs.Regs[0] = uint64(info.Signo) + c.Regs.Regs[1] = uint64(infoAddr) + c.Regs.Regs[2] = uint64(ucAddr) + + return nil +} + +// SignalRestore implements Context.SignalRestore. +// Only used on intel. +func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { + return 0, SignalStack{}, nil +} diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index 5a3228113..d324da705 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build i386 amd64 +// +build i386 amd64 arm64 package arch diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go new file mode 100644 index 000000000..00d5ef461 --- /dev/null +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -0,0 +1,62 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package arch + +const restartSyscallNr = uintptr(128) + +// SyscallNo returns the syscall number according to the 64-bit convention. +func (c *context64) SyscallNo() uintptr { + return uintptr(c.Regs.Regs[8]) +} + +// SyscallArgs provides syscall arguments according to the 64-bit convention. +// +// Due to the way addresses are mapped for the sentry this binary *must* be +// built in 64-bit mode. So we can just assume the syscall numbers that come +// back match the expected host system call numbers. +// General purpose registers usage on Arm64: +// R0...R7: parameter/result registers. +// R8: indirect result location register. +// R9...R15: temporary registers. +// R16: the first intra-procedure-call scratch register. +// R17: the second intra-procedure-call scratch register. +// R18: the platform register. +// R19...R28: callee-saved registers. +// R29: the frame pointer. +// R30: the link register. +func (c *context64) SyscallArgs() SyscallArguments { + return SyscallArguments{ + SyscallArgument{Value: uintptr(c.Regs.Regs[0])}, + SyscallArgument{Value: uintptr(c.Regs.Regs[1])}, + SyscallArgument{Value: uintptr(c.Regs.Regs[2])}, + SyscallArgument{Value: uintptr(c.Regs.Regs[3])}, + SyscallArgument{Value: uintptr(c.Regs.Regs[4])}, + SyscallArgument{Value: uintptr(c.Regs.Regs[5])}, + } +} + +// RestartSyscall implements Context.RestartSyscall. +func (c *context64) RestartSyscall() { + c.Regs.Pc -= SyscallWidth + c.Regs.Regs[8] = uint64(restartSyscallNr) +} + +// RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock. +func (c *context64) RestartSyscallWithRestartBlock() { + c.Regs.Pc -= SyscallWidth + c.Regs.Regs[8] = uint64(restartSyscallNr) +} -- cgit v1.2.3 From d6fb1ec6c7c76040dd20e915b32f9ed795ae7077 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 15 Jan 2020 16:31:24 -0800 Subject: Add timestamps to VFS2 tmpfs, and implement some of SetStat. PiperOrigin-RevId: 289962040 --- pkg/abi/linux/time.go | 13 ++ pkg/sentry/fsimpl/tmpfs/BUILD | 2 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 16 +- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 129 ++++++++++++--- pkg/sentry/fsimpl/tmpfs/stat_test.go | 232 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 72 +++++++-- 6 files changed, 425 insertions(+), 39 deletions(-) create mode 100644 pkg/sentry/fsimpl/tmpfs/stat_test.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 546668bca..5c5a58cd4 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -234,6 +234,19 @@ type StatxTimestamp struct { _ int32 } +// ToNsec returns the nanosecond representation. +func (sxts StatxTimestamp) ToNsec() int64 { + return int64(sxts.Sec)*1e9 + int64(sxts.Nsec) +} + +// ToNsecCapped returns the safe nanosecond representation. +func (sxts StatxTimestamp) ToNsecCapped() int64 { + if sxts.Sec > maxSecInDuration { + return math.MaxInt64 + } + return sxts.ToNsec() +} + // NsecToStatxTimestamp translates nanoseconds to StatxTimestamp. func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { return StatxTimestamp{ diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 82f5c2f41..7601c7c04 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -40,6 +40,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", @@ -77,6 +78,7 @@ go_test( srcs = [ "pipe_test.go", "regular_file_test.go", + "stat_test.go", ], embed = [":tmpfs"], deps = [ diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 26979729e..4cd7e9aea 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -56,7 +56,8 @@ afterSymlink: } next := nextVFSD.Impl().(*dentry) if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO: symlink traversals update access time + // TODO(gvisor.dev/issues/1197): Symlink traversals updates + // access time. if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -501,7 +502,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.inode.decLinksLocked() newParent.inode.incLinksLocked() } - // TODO: update timestamps and parent directory sizes + // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory + // sizes. vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) return nil } @@ -555,15 +557,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM + return d.inode.setStat(opts.Stat) } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -587,7 +585,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO: actually implement statfs + // TODO(gvisor.dev/issues/1197): Actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 3731c5b6f..7b0a962f0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "io" + "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/abi/linux" @@ -29,10 +30,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If -// the returned err is not nil, then cleanup should be called when the FD is no -// longer needed. -func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) { +// nextFileID is used to generate unique file names. +var nextFileID int64 + +// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error +// is not nil, then cleanup should be called when the root is no longer needed. +func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { creds := auth.CredentialsFromContext(ctx) vfsObj := vfs.New() @@ -41,36 +44,124 @@ func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { - return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + return vfsObj, root, func() { + root.DecRef() + mntns.DecRef(vfsObj) + }, nil +} + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) // Create the file that will be write/read. fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - FollowFinalSymlink: true, + Root: root, + Start: root, + Path: fspath.Parse(filename), }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: 0644, + Mode: linux.ModeRegular | mode, }) if err != nil { - root.DecRef() - mntns.DecRef(vfsObj) + cleanup() return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) } - return fd, func() { - root.DecRef() - mntns.DecRef(vfsObj) - }, nil + return fd, cleanup, nil +} + +// newDirFD is like newFileFD, but for directories. +func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the dir. + if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.MkdirOptions{ + Mode: linux.ModeDirectory | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) + } + + // Open the dir and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) + } + + return fd, cleanup, nil +} + +// newPipeFD is like newFileFD, but for pipes. +func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the pipe. + if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(pipename), + }, &vfs.MknodOptions{ + Mode: linux.ModeNamedPipe | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err) + } + + // Open the pipe and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(pipename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err) + } + + return fd, cleanup, nil } // Test that we can write some data to a file and read it back.` func TestSimpleWriteRead(t *testing.T) { ctx := contexttest.Context(t) - fd, cleanup, err := newFileFD(ctx, "simpleReadWrite") + fd, cleanup, err := newFileFD(ctx, 0644) if err != nil { t.Fatal(err) } @@ -116,7 +207,7 @@ func TestSimpleWriteRead(t *testing.T) { func TestPWrite(t *testing.T) { ctx := contexttest.Context(t) - fd, cleanup, err := newFileFD(ctx, "PRead") + fd, cleanup, err := newFileFD(ctx, 0644) if err != nil { t.Fatal(err) } @@ -171,7 +262,7 @@ func TestPWrite(t *testing.T) { func TestPRead(t *testing.T) { ctx := contexttest.Context(t) - fd, cleanup, err := newFileFD(ctx, "PRead") + fd, cleanup, err := newFileFD(ctx, 0644) if err != nil { t.Fatal(err) } diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go new file mode 100644 index 000000000..ebe035dee --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -0,0 +1,232 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func TestStatAfterCreate(t *testing.T) { + ctx := contexttest.Context(t) + mode := linux.FileMode(0644) + + // Run with different file types. + // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + for _, typ := range []string{"file", "dir", "pipe"} { + t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { + var ( + fd *vfs.FileDescription + cleanup func() + err error + ) + switch typ { + case "file": + fd, cleanup, err = newFileFD(ctx, mode) + case "dir": + fd, cleanup, err = newDirFD(ctx, mode) + case "pipe": + fd, cleanup, err = newPipeFD(ctx, mode) + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + if err != nil { + t.Fatal(err) + } + defer cleanup() + + got, err := fd.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Atime, Ctime, Mtime should all be current time (non-zero). + atime, ctime, mtime := got.Atime.ToNsec(), got.Ctime.ToNsec(), got.Mtime.ToNsec() + if atime != ctime || ctime != mtime { + t.Errorf("got atime=%d ctime=%d mtime=%d, wanted equal values", atime, ctime, mtime) + } + if atime == 0 { + t.Errorf("got atime=%d, want non-zero", atime) + } + + // Btime should be 0, as it is not set by tmpfs. + if btime := got.Btime.ToNsec(); btime != 0 { + t.Errorf("got btime %d, want 0", got.Btime.ToNsec()) + } + + // Size should be 0. + if got.Size != 0 { + t.Errorf("got size %d, want 0", got.Size) + } + + // Nlink should be 1 for files, 2 for dirs. + wantNlink := uint32(1) + if typ == "dir" { + wantNlink = 2 + } + if got.Nlink != wantNlink { + t.Errorf("got nlink %d, want %d", got.Nlink, wantNlink) + } + + // UID and GID are set from context creds. + creds := auth.CredentialsFromContext(ctx) + if got.UID != uint32(creds.EffectiveKUID) { + t.Errorf("got uid %d, want %d", got.UID, uint32(creds.EffectiveKUID)) + } + if got.GID != uint32(creds.EffectiveKGID) { + t.Errorf("got gid %d, want %d", got.GID, uint32(creds.EffectiveKGID)) + } + + // Mode. + wantMode := uint16(mode) + switch typ { + case "file": + wantMode |= linux.S_IFREG + case "dir": + wantMode |= linux.S_IFDIR + case "pipe": + wantMode |= linux.S_IFIFO + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + + if got.Mode != wantMode { + t.Errorf("got mode %x, want %x", got.Mode, wantMode) + } + + // Ino. + if got.Ino == 0 { + t.Errorf("got ino %d, want not 0", got.Ino) + } + }) + } +} + +func TestSetStatAtime(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL} + + // Get initial stat. + initialStat, err := fd.Stat(ctx, allStatOptions) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Set atime, but without the mask. + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{ + Mask: 0, + Atime: linux.NsecToStatxTimestamp(100), + }}); err != nil { + t.Errorf("SetStat atime without mask failed: %v") + } + // Atime should be unchanged. + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != initialStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime) + } + + // Set atime, this time included in the mask. + setStat := linux.Statx{ + Mask: linux.STATX_ATIME, + Atime: linux.NsecToStatxTimestamp(100), + } + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { + t.Errorf("SetStat atime with mask failed: %v") + } + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != setStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime) + } +} + +func TestSetStat(t *testing.T) { + ctx := contexttest.Context(t) + mode := linux.FileMode(0644) + + // Run with different file types. + // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + for _, typ := range []string{"file", "dir", "pipe"} { + t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { + var ( + fd *vfs.FileDescription + cleanup func() + err error + ) + switch typ { + case "file": + fd, cleanup, err = newFileFD(ctx, mode) + case "dir": + fd, cleanup, err = newDirFD(ctx, mode) + case "pipe": + fd, cleanup, err = newPipeFD(ctx, mode) + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + if err != nil { + t.Fatal(err) + } + defer cleanup() + + allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL} + + // Get initial stat. + initialStat, err := fd.Stat(ctx, allStatOptions) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Set atime, but without the mask. + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{ + Mask: 0, + Atime: linux.NsecToStatxTimestamp(100), + }}); err != nil { + t.Errorf("SetStat atime without mask failed: %v") + } + // Atime should be unchanged. + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != initialStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime) + } + + // Set atime, this time included in the mask. + setStat := linux.Statx{ + Mask: linux.STATX_ATIME, + Atime: linux.NsecToStatxTimestamp(100), + } + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { + t.Errorf("SetStat atime with mask failed: %v") + } + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != setStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 701826f90..d6960ee47 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -31,10 +31,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // FilesystemType implements vfs.FilesystemType. @@ -47,6 +47,9 @@ type filesystem struct { // memFile is used to allocate pages to for regular files. memFile *pgalloc.MemoryFile + // clock is a realtime clock used to set timestamps in file operations. + clock time.Clock + // mu serializes changes to the Dentry tree. mu sync.RWMutex @@ -59,8 +62,10 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } + clock := time.RealtimeClockFromContext(ctx) fs := filesystem{ memFile: memFileProvider.MemoryFile(), + clock: clock, } fs.vfsfs.Init(vfsObj, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) @@ -126,26 +131,36 @@ type inode struct { // filesystem.RmdirAt() drops the reference. refs int64 - // Inode metadata; protected by mu and accessed using atomic memory - // operations unless otherwise specified. - mu sync.RWMutex + // Inode metadata. Writing multiple fields atomically requires holding + // mu, othewise atomic operations can be used. + mu sync.Mutex mode uint32 // excluding file type bits, which are based on impl nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... ino uint64 // immutable + // Linux's tmpfs has no concept of btime. + atime int64 // nanoseconds + ctime int64 // nanoseconds + mtime int64 // nanoseconds + impl interface{} // immutable } const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { + now := fs.clock.Now().Nanoseconds() i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) + // Tmpfs creation sets atime, ctime, and mtime to current time. + i.atime = now + i.ctime = now + i.mtime = now // i.nlink initialized by caller i.impl = impl } @@ -213,15 +228,24 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, i // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. +// +// Note that Linux does not guarantee to return consistent data (in the case of +// a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | + linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = 1 // usermem.PageSize in tmpfs stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino - // TODO: device number + // Linux's tmpfs has no concept of btime, so zero-value is returned. + stat.Atime = linux.NsecToStatxTimestamp(i.atime) + stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) + stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) + // TODO(gvisor.dev/issues/1197): Device number. switch impl := i.impl.(type) { case *regularFile: stat.Mode |= linux.S_IFREG @@ -245,6 +269,36 @@ func (i *inode) statTo(stat *linux.Statx) { } } +func (i *inode) setStat(stat linux.Statx) error { + // TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking + // the file. + if stat.Mask == 0 { + return nil + } + i.mu.Lock() + mask := stat.Mask + if mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&i.mode, uint32(stat.Mode)) + } + if mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&i.uid, stat.UID) + } + if mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&i.gid, stat.GID) + } + if mask&linux.STATX_ATIME != 0 { + atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + } + if mask&linux.STATX_CTIME != 0 { + atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + } + if mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + } + i.mu.Unlock() + return nil +} + // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block @@ -291,9 +345,5 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM + return fd.inode().setStat(opts.Stat) } -- cgit v1.2.3 From 420d335fc9495ec18a20f710869770d0708d9a49 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Thu, 16 Jan 2020 10:26:23 -0800 Subject: Enable clone syscall support on arm64. sys_clone has many flavors in Linux, and amd64 chose a different one from x86(different arguments order). Ref kernel/fork.c for more info. Signed-off-by: Haibo Xu Change-Id: I6c8cbc685f4a6e786b171715ab68292fc95cbf48 COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1545 from xiaobo55x:clone 156bd2dfbc63ef5291627b0578ddea77997393b2 PiperOrigin-RevId: 290093953 --- pkg/sentry/syscalls/linux/BUILD | 2 ++ pkg/sentry/syscalls/linux/sys_clone_amd64.go | 35 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_clone_arm64.go | 35 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_thread.go | 13 ----------- 4 files changed, 72 insertions(+), 13 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/sys_clone_amd64.go create mode 100644 pkg/sentry/syscalls/linux/sys_clone_arm64.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index aa05e208a..430d796ba 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -13,6 +13,8 @@ go_library( "sigset.go", "sys_aio.go", "sys_capability.go", + "sys_clone_amd64.go", + "sys_clone_arm64.go", "sys_epoll.go", "sys_eventfd.go", "sys_file.go", diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go new file mode 100644 index 000000000..dd43cf18d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// Clone implements linux syscall clone(2). +// sys_clone has so many flavors. We implement the default one in linux 3.11 +// x86_64: +// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) +func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + stack := args[1].Pointer() + parentTID := args[2].Pointer() + childTID := args[3].Pointer() + tls := args[4].Pointer() + return clone(t, flags, stack, parentTID, childTID, tls) +} diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go new file mode 100644 index 000000000..cf68a8949 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// Clone implements linux syscall clone(2). +// sys_clone has so many flavors, and we implement the default one in linux 3.11 +// arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file): +// sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr) +func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + stack := args[1].Pointer() + parentTID := args[2].Pointer() + tls := args[3].Pointer() + childTID := args[4].Pointer() + return clone(t, flags, stack, parentTID, childTID, tls) +} diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 4115116ff..b47c3b5c4 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -220,19 +220,6 @@ func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr return uintptr(ntid), ctrl, err } -// Clone implements linux syscall clone(2). -// sys_clone has so many flavors. We implement the default one in linux 3.11 -// x86_64: -// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) -func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - flags := int(args[0].Int()) - stack := args[1].Pointer() - parentTID := args[2].Pointer() - childTID := args[3].Pointer() - tls := args[4].Pointer() - return clone(t, flags, stack, parentTID, childTID, tls) -} - // Fork implements Linux syscall fork(2). func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // "A call to fork() is equivalent to a call to clone(2) specifying flags -- cgit v1.2.3 From 7b7c31820b83abcfe43f7170eff1f7953f3f27e2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Jan 2020 12:28:44 -0800 Subject: Add remaining /proc/* and /proc/sys/* files Except for one under /proc/sys/net/ipv4/tcp_sack. /proc/pid/* is still incomplete. Updates #1195 PiperOrigin-RevId: 290120438 --- pkg/cpuid/cpuid.go | 44 ++-- pkg/sentry/fs/proc/cpuinfo.go | 8 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 40 ++++ pkg/sentry/fsimpl/proc/BUILD | 11 +- pkg/sentry/fsimpl/proc/filesystem.go | 11 + pkg/sentry/fsimpl/proc/loadavg.go | 42 ---- pkg/sentry/fsimpl/proc/meminfo.go | 79 ------- pkg/sentry/fsimpl/proc/net.go | 338 ---------------------------- pkg/sentry/fsimpl/proc/net_test.go | 78 ------- pkg/sentry/fsimpl/proc/stat.go | 129 ----------- pkg/sentry/fsimpl/proc/sys.go | 51 ----- pkg/sentry/fsimpl/proc/task.go | 12 +- pkg/sentry/fsimpl/proc/tasks.go | 41 +++- pkg/sentry/fsimpl/proc/tasks_files.go | 245 ++++++++++++++++++++ pkg/sentry/fsimpl/proc/tasks_net.go | 337 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/tasks_sys.go | 143 ++++++++++++ pkg/sentry/fsimpl/proc/tasks_sys_test.go | 78 +++++++ pkg/sentry/fsimpl/proc/tasks_test.go | 3 + pkg/sentry/fsimpl/proc/version.go | 70 ------ 19 files changed, 922 insertions(+), 838 deletions(-) delete mode 100644 pkg/sentry/fsimpl/proc/loadavg.go delete mode 100644 pkg/sentry/fsimpl/proc/meminfo.go delete mode 100644 pkg/sentry/fsimpl/proc/net.go delete mode 100644 pkg/sentry/fsimpl/proc/net_test.go delete mode 100644 pkg/sentry/fsimpl/proc/stat.go delete mode 100644 pkg/sentry/fsimpl/proc/sys.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_net.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_sys.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_sys_test.go delete mode 100644 pkg/sentry/fsimpl/proc/version.go (limited to 'pkg/sentry') diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index d37047368..cf50ee53f 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -657,30 +657,28 @@ func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string { return strings.Join(s, " ") } -// CPUInfo is to generate a section of one cpu in /proc/cpuinfo. This is a -// minimal /proc/cpuinfo, it is missing some fields like "microcode" that are +// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is +// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are // not always printed in Linux. The bogomips field is simply made up. -func (fs FeatureSet) CPUInfo(cpu uint) string { - var b bytes.Buffer - fmt.Fprintf(&b, "processor\t: %d\n", cpu) - fmt.Fprintf(&b, "vendor_id\t: %s\n", fs.VendorID) - fmt.Fprintf(&b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family) - fmt.Fprintf(&b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model) - fmt.Fprintf(&b, "model name\t: %s\n", "unknown") // Unknown for now. - fmt.Fprintf(&b, "stepping\t: %s\n", "unknown") // Unknown for now. - fmt.Fprintf(&b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz) - fmt.Fprintln(&b, "fpu\t\t: yes") - fmt.Fprintln(&b, "fpu_exception\t: yes") - fmt.Fprintf(&b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID. - fmt.Fprintln(&b, "wp\t\t: yes") - fmt.Fprintf(&b, "flags\t\t: %s\n", fs.FlagsString(true)) - fmt.Fprintf(&b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. - fmt.Fprintf(&b, "clflush size\t: %d\n", fs.CacheLine) - fmt.Fprintf(&b, "cache_alignment\t: %d\n", fs.CacheLine) - fmt.Fprintf(&b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48) - fmt.Fprintln(&b, "power management:") // This is always here, but can be blank. - fmt.Fprintln(&b, "") // The /proc/cpuinfo file ends with an extra newline. - return b.String() +func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) { + fmt.Fprintf(b, "processor\t: %d\n", cpu) + fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID) + fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family) + fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model) + fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now. + fmt.Fprintf(b, "stepping\t: %s\n", "unknown") // Unknown for now. + fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz) + fmt.Fprintln(b, "fpu\t\t: yes") + fmt.Fprintln(b, "fpu_exception\t: yes") + fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID. + fmt.Fprintln(b, "wp\t\t: yes") + fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true)) + fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. + fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine) + fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine) + fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48) + fmt.Fprintln(b, "power management:") // This is always here, but can be blank. + fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline. } const ( diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index 3edf36780..6330337eb 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -15,6 +15,8 @@ package proc import ( + "bytes" + "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,9 +29,9 @@ func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // Kernel is always initialized with a FeatureSet. panic("cpuinfo read with nil FeatureSet") } - contents := make([]byte, 0, 1024) + var buf bytes.Buffer for i, max := uint(0), k.ApplicationCores(); i < max; i++ { - contents = append(contents, []byte(features.CPUInfo(i))...) + features.WriteCPUInfoTo(i, &buf) } - return newStaticProcInode(ctx, msrc, contents) + return newStaticProcInode(ctx, msrc, buf.Bytes()) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 1d469a0db..6aff3d39a 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -510,3 +510,43 @@ type InodeSymlink struct { func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } + +// StaticDirectory is a standard implementation of a directory with static +// contents. +// +// +stateify savable +type StaticDirectory struct { + InodeNotSymlink + InodeDirectoryNoNewChildren + InodeAttrs + InodeNoDynamicLookup + OrderedChildren +} + +var _ Inode = (*StaticDirectory)(nil) + +// NewStaticDir creates a new static directory and returns its dentry. +func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + + inode := &StaticDirectory{} + inode.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + + dentry := &Dentry{} + dentry.Init(inode) + + inode.OrderedChildren.Init(OrderedChildrenOptions{}) + links := inode.OrderedChildren.Populate(dentry, children) + inode.IncLinks(links) + + return dentry +} + +// Open implements kernfs.Inode. +func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 1f44b3217..6cd18cec8 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -7,17 +7,13 @@ go_library( name = "proc", srcs = [ "filesystem.go", - "loadavg.go", - "meminfo.go", "mounts.go", - "net.go", - "stat.go", - "sys.go", "task.go", "task_files.go", "tasks.go", "tasks_files.go", - "version.go", + "tasks_net.go", + "tasks_sys.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", deps = [ @@ -30,6 +26,7 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket", @@ -47,7 +44,7 @@ go_test( size = "small", srcs = [ "boot_test.go", - "net_test.go", + "tasks_sys_test.go", "tasks_test.go", ], embed = [":proc"], diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index d09182c77..e9cb7895f 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -67,3 +67,14 @@ func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode d d.Init(inode) return d } + +type staticFile struct { + kernfs.DynamicBytesFile + vfs.StaticData +} + +var _ dynamicInode = (*staticFile)(nil) + +func newStaticFile(data string) *staticFile { + return &staticFile{StaticData: vfs.StaticData{Data: data}} +} diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go deleted file mode 100644 index 5351d86e8..000000000 --- a/pkg/sentry/fsimpl/proc/loadavg.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" -) - -// loadavgData backs /proc/loadavg. -// -// +stateify savable -type loadavgData struct { - kernfs.DynamicBytesFile -} - -var _ dynamicInode = (*loadavgData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { - // TODO(b/62345059): Include real data in fields. - // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. - // Column 4-5: currently running processes and the total number of processes. - // Column 6: the last process ID used. - fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) - return nil -} diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go deleted file mode 100644 index cbdd4f3fc..000000000 --- a/pkg/sentry/fsimpl/proc/meminfo.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. -// -// +stateify savable -type meminfoData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel -} - -var _ dynamicInode = (*meminfoData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - mf := d.k.MemoryFile() - mf.UpdateUsage() - snapshot, totalUsage := usage.MemoryAccounting.Copy() - totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) - anon := snapshot.Anonymous + snapshot.Tmpfs - file := snapshot.PageCache + snapshot.Mapped - // We don't actually have active/inactive LRUs, so just make up numbers. - activeFile := (file / 2) &^ (usermem.PageSize - 1) - inactiveFile := file - activeFile - - fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 - // We use MemFree as MemAvailable because we don't swap. - // TODO(rahat): When reclaim is implemented the value of MemAvailable - // should change. - fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) - fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices - fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) - // Emulate a system with no swap, which disables inactivation of anon pages. - fmt.Fprintf(buf, "SwapCache: 0 kB\n") - fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) - fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) - fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) - fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") - fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) - fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) - fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) - fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) - fmt.Fprintf(buf, "SwapTotal: 0 kB\n") - fmt.Fprintf(buf, "SwapFree: 0 kB\n") - fmt.Fprintf(buf, "Dirty: 0 kB\n") - fmt.Fprintf(buf, "Writeback: 0 kB\n") - fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) - fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know - fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) - return nil -} diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go deleted file mode 100644 index fd46eebf8..000000000 --- a/pkg/sentry/fsimpl/proc/net.go +++ /dev/null @@ -1,338 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/socket/unix" - "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. -// -// +stateify savable -type ifinet6 struct { - s inet.Stack -} - -var _ vfs.DynamicBytesSource = (*ifinet6)(nil) - -func (n *ifinet6) contents() []string { - var lines []string - nics := n.s.Interfaces() - for id, naddrs := range n.s.InterfaceAddrs() { - nic, ok := nics[id] - if !ok { - // NIC was added after NICNames was called. We'll just - // ignore it. - continue - } - - for _, a := range naddrs { - // IPv6 only. - if a.Family != linux.AF_INET6 { - continue - } - - // Fields: - // IPv6 address displayed in 32 hexadecimal chars without colons - // Netlink device number (interface index) in hexadecimal (use nic id) - // Prefix length in hexadecimal - // Scope value (use 0) - // Interface flags - // Device name - lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) - } - } - return lines -} - -// Generate implements vfs.DynamicBytesSource.Generate. -func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { - for _, l := range n.contents() { - buf.WriteString(l) - } - return nil -} - -// netDev implements vfs.DynamicBytesSource for /proc/net/dev. -// -// +stateify savable -type netDev struct { - s inet.Stack -} - -var _ vfs.DynamicBytesSource = (*netDev)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { - interfaces := n.s.Interfaces() - buf.WriteString("Inter-| Receive | Transmit\n") - buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") - - for _, i := range interfaces { - // Implements the same format as - // net/core/net-procfs.c:dev_seq_printf_stats. - var stats inet.StatDev - if err := n.s.Statistics(&stats, i.Name); err != nil { - log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) - continue - } - fmt.Fprintf( - buf, - "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", - i.Name, - // Received - stats[0], // bytes - stats[1], // packets - stats[2], // errors - stats[3], // dropped - stats[4], // fifo - stats[5], // frame - stats[6], // compressed - stats[7], // multicast - // Transmitted - stats[8], // bytes - stats[9], // packets - stats[10], // errors - stats[11], // dropped - stats[12], // fifo - stats[13], // frame - stats[14], // compressed - stats[15], // multicast - ) - } - - return nil -} - -// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. -// -// +stateify savable -type netUnix struct { - k *kernel.Kernel -} - -var _ vfs.DynamicBytesSource = (*netUnix)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { - buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") - for _, se := range n.k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) - continue - } - sfile := s.(*fs.File) - if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { - s.DecRef() - // Not a unix socket. - continue - } - sops := sfile.FileOperations.(*unix.SocketOperations) - - addr, err := sops.Endpoint().GetLocalAddress() - if err != nil { - log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) - addr.Addr = "" - } - - sockFlags := 0 - if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { - if ce.Listening() { - // For unix domain sockets, linux reports a single flag - // value if the socket is listening, of __SO_ACCEPTCON. - sockFlags = linux.SO_ACCEPTCON - } - } - - // In the socket entry below, the value for the 'Num' field requires - // some consideration. Linux prints the address to the struct - // unix_sock representing a socket in the kernel, but may redact the - // value for unprivileged users depending on the kptr_restrict - // sysctl. - // - // One use for this field is to allow a privileged user to - // introspect into the kernel memory to determine information about - // a socket not available through procfs, such as the socket's peer. - // - // In gvisor, returning a pointer to our internal structures would - // be pointless, as it wouldn't match the memory layout for struct - // unix_sock, making introspection difficult. We could populate a - // struct unix_sock with the appropriate data, but even that - // requires consideration for which kernel version to emulate, as - // the definition of this struct changes over time. - // - // For now, we always redact this pointer. - fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", - (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - sfile.ReadRefs()-1, // RefCount, don't count our own ref. - 0, // Protocol, always 0 for UDS. - sockFlags, // Flags. - sops.Endpoint().Type(), // Type. - sops.State(), // State. - sfile.InodeID(), // Inode. - ) - - // Path - if len(addr.Addr) != 0 { - if addr.Addr[0] == 0 { - // Abstract path. - fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) - } else { - fmt.Fprintf(buf, " %s", string(addr.Addr)) - } - } - fmt.Fprintf(buf, "\n") - - s.DecRef() - } - return nil -} - -// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. -// -// +stateify savable -type netTCP struct { - k *kernel.Kernel -} - -var _ vfs.DynamicBytesSource = (*netTCP)(nil) - -func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { - t := kernel.TaskFromContext(ctx) - buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") - for _, se := range n.k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) - continue - } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) - if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) - } - if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { - s.DecRef() - // Not tcp4 sockets. - continue - } - - // Linux's documentation for the fields below can be found at - // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. - // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). - // Note that the header doesn't contain labels for all the fields. - - // Field: sl; entry number. - fmt.Fprintf(buf, "%4d: ", se.ID) - - portBuf := make([]byte, 2) - - // Field: local_adddress. - var localAddr linux.SockAddrInet - if local, _, err := sops.GetSockName(t); err == nil { - localAddr = *local.(*linux.SockAddrInet) - } - binary.LittleEndian.PutUint16(portBuf, localAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(localAddr.Addr[:]), - portBuf) - - // Field: rem_address. - var remoteAddr linux.SockAddrInet - if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = *remote.(*linux.SockAddrInet) - } - binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(remoteAddr.Addr[:]), - portBuf) - - // Field: state; socket state. - fmt.Fprintf(buf, "%02X ", sops.State()) - - // Field: tx_queue, rx_queue; number of packets in the transmit and - // receive queue. Unimplemented. - fmt.Fprintf(buf, "%08X:%08X ", 0, 0) - - // Field: tr, tm->when; timer active state and number of jiffies - // until timer expires. Unimplemented. - fmt.Fprintf(buf, "%02X:%08X ", 0, 0) - - // Field: retrnsmt; number of unrecovered RTO timeouts. - // Unimplemented. - fmt.Fprintf(buf, "%08X ", 0) - - // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) - fmt.Fprintf(buf, "%5d ", 0) - } else { - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - } - - // Field: timeout; number of unanswered 0-window probes. - // Unimplemented. - fmt.Fprintf(buf, "%8d ", 0) - - // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) - - // Field: refcount. Don't count the ref we obtain while deferencing - // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) - - // Field: Socket struct address. Redacted due to the same reason as - // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. - fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) - - // Field: retransmit timeout. Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: predicted tick of soft clock (delayed ACK control data). - // Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: sending congestion window, Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. - // Unimplemented, report as large threshold. - fmt.Fprintf(buf, "%d", -1) - - fmt.Fprintf(buf, "\n") - - s.DecRef() - } - - return nil -} diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/net_test.go deleted file mode 100644 index 20a77a8ca..000000000 --- a/pkg/sentry/fsimpl/proc/net_test.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "reflect" - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/inet" -) - -func newIPv6TestStack() *inet.TestStack { - s := inet.NewTestStack() - s.SupportsIPv6Flag = true - return s -} - -func TestIfinet6NoAddresses(t *testing.T) { - n := &ifinet6{s: newIPv6TestStack()} - var buf bytes.Buffer - n.Generate(contexttest.Context(t), &buf) - if buf.Len() > 0 { - t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{}) - } -} - -func TestIfinet6(t *testing.T) { - s := newIPv6TestStack() - s.InterfacesMap[1] = inet.Interface{Name: "eth0"} - s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ - { - Family: linux.AF_INET6, - PrefixLen: 128, - Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), - }, - } - s.InterfacesMap[2] = inet.Interface{Name: "eth1"} - s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ - { - Family: linux.AF_INET6, - PrefixLen: 128, - Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), - }, - } - want := map[string]struct{}{ - "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, - "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, - } - - n := &ifinet6{s: s} - contents := n.contents() - if len(contents) != len(want) { - t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) - } - got := map[string]struct{}{} - for _, l := range contents { - got[l] = struct{}{} - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("Got n.contents() = %v, want = %v", got, want) - } -} diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go deleted file mode 100644 index 50894a534..000000000 --- a/pkg/sentry/fsimpl/proc/stat.go +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" -) - -// cpuStats contains the breakdown of CPU time for /proc/stat. -type cpuStats struct { - // user is time spent in userspace tasks with non-positive niceness. - user uint64 - - // nice is time spent in userspace tasks with positive niceness. - nice uint64 - - // system is time spent in non-interrupt kernel context. - system uint64 - - // idle is time spent idle. - idle uint64 - - // ioWait is time spent waiting for IO. - ioWait uint64 - - // irq is time spent in interrupt context. - irq uint64 - - // softirq is time spent in software interrupt context. - softirq uint64 - - // steal is involuntary wait time. - steal uint64 - - // guest is time spent in guests with non-positive niceness. - guest uint64 - - // guestNice is time spent in guests with positive niceness. - guestNice uint64 -} - -// String implements fmt.Stringer. -func (c cpuStats) String() string { - return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) -} - -// statData implements vfs.DynamicBytesSource for /proc/stat. -// -// +stateify savable -type statData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel -} - -var _ dynamicInode = (*statData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { - // TODO(b/37226836): We currently export only zero CPU stats. We could - // at least provide some aggregate stats. - var cpu cpuStats - fmt.Fprintf(buf, "cpu %s\n", cpu) - - for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { - fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) - } - - // The total number of interrupts is dependent on the CPUs and PCI - // devices on the system. See arch_probe_nr_irqs. - // - // Since we don't report real interrupt stats, just choose an arbitrary - // value from a representative VM. - const numInterrupts = 256 - - // The Kernel doesn't handle real interrupts, so report all zeroes. - // TODO(b/37226836): We could count page faults as #PF. - fmt.Fprintf(buf, "intr 0") // total - for i := 0; i < numInterrupts; i++ { - fmt.Fprintf(buf, " 0") - } - fmt.Fprintf(buf, "\n") - - // Total number of context switches. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "ctxt 0\n") - - // CLOCK_REALTIME timestamp from boot, in seconds. - fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) - - // Total number of clones. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "processes 0\n") - - // Number of runnable tasks. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "procs_running 0\n") - - // Number of tasks waiting on IO. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "procs_blocked 0\n") - - // Number of each softirq handled. - fmt.Fprintf(buf, "softirq 0") // total - for i := 0; i < linux.NumSoftIRQ; i++ { - fmt.Fprintf(buf, " 0") - } - fmt.Fprintf(buf, "\n") - return nil -} diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go deleted file mode 100644 index b88256e12..000000000 --- a/pkg/sentry/fsimpl/proc/sys.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// mmapMinAddrData implements vfs.DynamicBytesSource for -// /proc/sys/vm/mmap_min_addr. -// -// +stateify savable -type mmapMinAddrData struct { - k *kernel.Kernel -} - -var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) - return nil -} - -// +stateify savable -type overcommitMemory struct{} - -var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "0\n") - return nil -} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 11a64c777..5a384817f 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -50,15 +50,15 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames //"fd": newFdDir(t, msrc), //"fdinfo": newFdInfoDir(t, msrc), //"gid_map": newGIDMap(t, msrc), - "io": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), //"ns": newNamespaceDir(t, msrc), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{t: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{t: task, pidns: pidns}), //"uid_map": newUIDMap(t, msrc), } if isThreadGroup { diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index d8f92d52f..72315d25c 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "sort" "strconv" @@ -28,9 +29,8 @@ import ( ) const ( - defaultPermission = 0444 - selfName = "self" - threadSelfName = "thread-self" + selfName = "self" + threadSelfName = "thread-self" ) // InoGenerator generates unique inode numbers for a given filesystem. @@ -61,15 +61,15 @@ var _ kernfs.Inode = (*tasksInode)(nil) func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - //"cpuinfo": newCPUInfo(ctx, msrc), - //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), - "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), - "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), - "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), - //"uptime": newUptime(ctx, msrc), - //"version": newVersionData(root, inoGen.NextIno(), k), - "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}), + "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), + //"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), + "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), + "sys": newSysDir(root, inoGen), + "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), 0777, "self/mounts"), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ @@ -216,3 +216,20 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { return stat } + +func cpuInfoData(k *kernel.Kernel) string { + features := k.FeatureSet() + if features == nil { + // Kernel is always initialized with a FeatureSet. + panic("cpuinfo read with nil FeatureSet") + } + var buf bytes.Buffer + for i, max := uint(0), k.ApplicationCores(); i < max; i++ { + features.WriteCPUInfoTo(i, &buf) + } + return buf.String() +} + +func shmData(v uint64) dynamicInode { + return newStaticFile(strconv.FormatUint(v, 10)) +} diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 91f30a798..ad3760e39 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "fmt" "strconv" @@ -23,6 +24,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -90,3 +94,244 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } + +// cpuStats contains the breakdown of CPU time for /proc/stat. +type cpuStats struct { + // user is time spent in userspace tasks with non-positive niceness. + user uint64 + + // nice is time spent in userspace tasks with positive niceness. + nice uint64 + + // system is time spent in non-interrupt kernel context. + system uint64 + + // idle is time spent idle. + idle uint64 + + // ioWait is time spent waiting for IO. + ioWait uint64 + + // irq is time spent in interrupt context. + irq uint64 + + // softirq is time spent in software interrupt context. + softirq uint64 + + // steal is involuntary wait time. + steal uint64 + + // guest is time spent in guests with non-positive niceness. + guest uint64 + + // guestNice is time spent in guests with positive niceness. + guestNice uint64 +} + +// String implements fmt.Stringer. +func (c cpuStats) String() string { + return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) +} + +// statData implements vfs.DynamicBytesSource for /proc/stat. +// +// +stateify savable +type statData struct { + kernfs.DynamicBytesFile + + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ dynamicInode = (*statData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/37226836): We currently export only zero CPU stats. We could + // at least provide some aggregate stats. + var cpu cpuStats + fmt.Fprintf(buf, "cpu %s\n", cpu) + + for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) + } + + // The total number of interrupts is dependent on the CPUs and PCI + // devices on the system. See arch_probe_nr_irqs. + // + // Since we don't report real interrupt stats, just choose an arbitrary + // value from a representative VM. + const numInterrupts = 256 + + // The Kernel doesn't handle real interrupts, so report all zeroes. + // TODO(b/37226836): We could count page faults as #PF. + fmt.Fprintf(buf, "intr 0") // total + for i := 0; i < numInterrupts; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + + // Total number of context switches. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "ctxt 0\n") + + // CLOCK_REALTIME timestamp from boot, in seconds. + fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + + // Total number of clones. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "processes 0\n") + + // Number of runnable tasks. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_running 0\n") + + // Number of tasks waiting on IO. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_blocked 0\n") + + // Number of each softirq handled. + fmt.Fprintf(buf, "softirq 0") // total + for i := 0; i < linux.NumSoftIRQ; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + return nil +} + +// loadavgData backs /proc/loadavg. +// +// +stateify savable +type loadavgData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*loadavgData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/62345059): Include real data in fields. + // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. + // Column 4-5: currently running processes and the total number of processes. + // Column 6: the last process ID used. + fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) + return nil +} + +// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. +// +// +stateify savable +type meminfoData struct { + kernfs.DynamicBytesFile + + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ dynamicInode = (*meminfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + mf := d.k.MemoryFile() + mf.UpdateUsage() + snapshot, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) + anon := snapshot.Anonymous + snapshot.Tmpfs + file := snapshot.PageCache + snapshot.Mapped + // We don't actually have active/inactive LRUs, so just make up numbers. + activeFile := (file / 2) &^ (usermem.PageSize - 1) + inactiveFile := file - activeFile + + fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) + memFree := (totalSize - totalUsage) / 1024 + // We use MemFree as MemAvailable because we don't swap. + // TODO(rahat): When reclaim is implemented the value of MemAvailable + // should change. + fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) + fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices + fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) + // Emulate a system with no swap, which disables inactivation of anon pages. + fmt.Fprintf(buf, "SwapCache: 0 kB\n") + fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) + fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) + fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) + fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") + fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) + fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) + fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(buf, "SwapTotal: 0 kB\n") + fmt.Fprintf(buf, "SwapFree: 0 kB\n") + fmt.Fprintf(buf, "Dirty: 0 kB\n") + fmt.Fprintf(buf, "Writeback: 0 kB\n") + fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) + fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know + fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) + return nil +} + +// uptimeData implements vfs.DynamicBytesSource for /proc/uptime. +// +// +stateify savable +type uptimeData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*uptimeData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + now := time.NowFromContext(ctx) + + // Pretend that we've spent zero time sleeping (second number). + fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds()) + return nil +} + +// versionData implements vfs.DynamicBytesSource for /proc/version. +// +// +stateify savable +type versionData struct { + kernfs.DynamicBytesFile + + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ dynamicInode = (*versionData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { + init := v.k.GlobalInit() + if init == nil { + // Attempted to read before the init Task is created. This can + // only occur during startup, which should never need to read + // this file. + panic("Attempted to read version before initial Task is available") + } + + // /proc/version takes the form: + // + // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) + // (COMPILER_VERSION) VERSION" + // + // where: + // - SYSNAME, RELEASE, and VERSION are the same as returned by + // sys_utsname + // - COMPILE_USER is the user that build the kernel + // - COMPILE_HOST is the hostname of the machine on which the kernel + // was built + // - COMPILER_VERSION is the version reported by the building compiler + // + // Since we don't really want to expose build information to + // applications, those fields are omitted. + // + // FIXME(mpratt): Using Version from the init task SyscallTable + // disregards the different version a task may have (e.g., in a uts + // namespace). + ver := init.Leader().SyscallTable().Version + fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go new file mode 100644 index 000000000..06dc43c26 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -0,0 +1,337 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. +// +// +stateify savable +type ifinet6 struct { + s inet.Stack +} + +var _ vfs.DynamicBytesSource = (*ifinet6)(nil) + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.s.Interfaces() + for id, naddrs := range n.s.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { + for _, l := range n.contents() { + buf.WriteString(l) + } + return nil +} + +// netDev implements vfs.DynamicBytesSource for /proc/net/dev. +// +// +stateify savable +type netDev struct { + s inet.Stack +} + +var _ vfs.DynamicBytesSource = (*netDev)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.s.Interfaces() + buf.WriteString("Inter-| Receive | Transmit\n") + buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") + + for _, i := range interfaces { + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. + var stats inet.StatDev + if err := n.s.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + fmt.Fprintf( + buf, + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast + // Transmitted + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15], // multicast + ) + } + + return nil +} + +// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. +// +// +stateify savable +type netUnix struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*netUnix)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + s.DecRef() + // Not a unix socket. + continue + } + sops := sfile.FileOperations.(*unix.SocketOperations) + + addr, err := sops.Endpoint().GetLocalAddress() + if err != nil { + log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + addr.Addr = "" + } + + sockFlags := 0 + if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { + if ce.Listening() { + // For unix domain sockets, linux reports a single flag + // value if the socket is listening, of __SO_ACCEPTCON. + sockFlags = linux.SO_ACCEPTCON + } + } + + // In the socket entry below, the value for the 'Num' field requires + // some consideration. Linux prints the address to the struct + // unix_sock representing a socket in the kernel, but may redact the + // value for unprivileged users depending on the kptr_restrict + // sysctl. + // + // One use for this field is to allow a privileged user to + // introspect into the kernel memory to determine information about + // a socket not available through procfs, such as the socket's peer. + // + // In gvisor, returning a pointer to our internal structures would + // be pointless, as it wouldn't match the memory layout for struct + // unix_sock, making introspection difficult. We could populate a + // struct unix_sock with the appropriate data, but even that + // requires consideration for which kernel version to emulate, as + // the definition of this struct changes over time. + // + // For now, we always redact this pointer. + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. + sfile.ReadRefs()-1, // RefCount, don't count our own ref. + 0, // Protocol, always 0 for UDS. + sockFlags, // Flags. + sops.Endpoint().Type(), // Type. + sops.State(), // State. + sfile.InodeID(), // Inode. + ) + + // Path + if len(addr.Addr) != 0 { + if addr.Addr[0] == 0 { + // Abstract path. + fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) + } else { + fmt.Fprintf(buf, " %s", string(addr.Addr)) + } + } + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. +// +// +stateify savable +type netTCP struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*netTCP)(nil) + +func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { + t := kernel.TaskFromContext(ctx) + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + s.DecRef() + // Not tcp4 sockets. + continue + } + + // Linux's documentation for the fields below can be found at + // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. + // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). + // Note that the header doesn't contain labels for all the fields. + + // Field: sl; entry number. + fmt.Fprintf(buf, "%4d: ", se.ID) + + portBuf := make([]byte, 2) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, localAddr.Port) + fmt.Fprintf(buf, "%08X:%04X ", + binary.LittleEndian.Uint32(localAddr.Addr[:]), + portBuf) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) + fmt.Fprintf(buf, "%08X:%04X ", + binary.LittleEndian.Uint32(remoteAddr.Addr[:]), + portBuf) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when; timer active state and number of jiffies + // until timer expires. Unimplemented. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt; number of unrecovered RTO timeouts. + // Unimplemented. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + } + + // Field: timeout; number of unanswered 0-window probes. + // Unimplemented. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: refcount. Don't count the ref we obtain while deferencing + // the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: retransmit timeout. Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: predicted tick of soft clock (delayed ACK control data). + // Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: sending congestion window, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. + // Unimplemented, report as large threshold. + fmt.Fprintf(buf, "%d", -1) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go new file mode 100644 index 000000000..aabf2bf0c --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -0,0 +1,143 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// newSysDir returns the dentry corresponding to /proc/sys directory. +func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}), + "shmall": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)), + "shmmax": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)), + "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), + }), + "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}), + "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), + }), + "net": newSysNetDir(root, inoGen), + }) +} + +// newSysNetDir returns the dentry corresponding to /proc/sys/net directory. +func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + // Add tcp_sack. + // TODO(gvisor.dev/issue/1195): tcp_sack allows write(2) + // "tcp_sack": newTCPSackInode(ctx, msrc, s), + + // The following files are simple stubs until they are implemented in + // netstack, most of these files are configuration related. We use the + // value closest to the actual netstack behavior or any empty file, all + // of these files will have mode 0444 (read-only for all users). + "ip_local_port_range": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), + "ipfrag_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")), + "ip_nonlocal_bind": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "ip_no_pmtu_disc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + + // tcp_allowed_congestion_control tell the user what they are able to + // do as an unprivledged process so we leave it empty. + "tcp_allowed_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), + "tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + "tcp_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + + // Many of the following stub files are features netstack doesn't + // support. The unsupported features return "0" to indicate they are + // disabled. + "tcp_base_mss": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")), + "tcp_dsack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_early_retrans": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_fack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen_key": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), + "tcp_invalid_ratelimit": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_intvl": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_probes": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")), + "tcp_mtu_probing": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_no_metrics_save": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_probe_interval": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_probe_threshold": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_retries1": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), + "tcp_retries2": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")), + "tcp_rfc1337": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_synack_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), + "tcp_syn_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), + "tcp_timestamps": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + }), + "core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")), + "message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")), + "message_cost": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), + "optmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "rmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "rmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "somaxconn": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")), + "wmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "wmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + }), + }), + }) +} + +// mmapMinAddrData implements vfs.DynamicBytesSource for +// /proc/sys/vm/mmap_min_addr. +// +// +stateify savable +type mmapMinAddrData struct { + kernfs.DynamicBytesFile + + k *kernel.Kernel +} + +var _ dynamicInode = (*mmapMinAddrData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) + return nil +} + +// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname. +// +// +stateify savable +type hostnameData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*hostnameData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { + utsns := kernel.UTSNamespaceFromContext(ctx) + buf.WriteString(utsns.HostName()) + buf.WriteString("\n") + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go new file mode 100644 index 000000000..20a77a8ca --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -0,0 +1,78 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/inet" +) + +func newIPv6TestStack() *inet.TestStack { + s := inet.NewTestStack() + s.SupportsIPv6Flag = true + return s +} + +func TestIfinet6NoAddresses(t *testing.T) { + n := &ifinet6{s: newIPv6TestStack()} + var buf bytes.Buffer + n.Generate(contexttest.Context(t), &buf) + if buf.Len() > 0 { + t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{}) + } +} + +func TestIfinet6(t *testing.T) { + s := newIPv6TestStack() + s.InterfacesMap[1] = inet.Interface{Name: "eth0"} + s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + }, + } + s.InterfacesMap[2] = inet.Interface{Name: "eth1"} + s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), + }, + } + want := map[string]struct{}{ + "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, + "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, + } + + n := &ifinet6{s: s} + contents := n.contents() + if len(contents) != len(want) { + t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) + } + got := map[string]struct{}{} + for _, l := range contents { + got[l] = struct{}{} + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Got n.contents() = %v, want = %v", got, want) + } +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index ca8c87ec2..76eafe593 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -69,12 +69,15 @@ func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ + "cpuinfo": {Type: linux.DT_REG}, "loadavg": {Type: linux.DT_REG}, "meminfo": {Type: linux.DT_REG}, "mounts": {Type: linux.DT_LNK}, "self": selfLink, "stat": {Type: linux.DT_REG}, + "sys": {Type: linux.DT_DIR}, "thread-self": threadSelfLink, + "uptime": {Type: linux.DT_REG}, "version": {Type: linux.DT_REG}, } return checkFiles(gots, wants) diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go deleted file mode 100644 index 367f2396b..000000000 --- a/pkg/sentry/fsimpl/proc/version.go +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" -) - -// versionData implements vfs.DynamicBytesSource for /proc/version. -// -// +stateify savable -type versionData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel -} - -var _ dynamicInode = (*versionData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { - init := v.k.GlobalInit() - if init == nil { - // Attempted to read before the init Task is created. This can - // only occur during startup, which should never need to read - // this file. - panic("Attempted to read version before initial Task is available") - } - - // /proc/version takes the form: - // - // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) - // (COMPILER_VERSION) VERSION" - // - // where: - // - SYSNAME, RELEASE, and VERSION are the same as returned by - // sys_utsname - // - COMPILE_USER is the user that build the kernel - // - COMPILE_HOST is the hostname of the machine on which the kernel - // was built - // - COMPILER_VERSION is the version reported by the building compiler - // - // Since we don't really want to expose build information to - // applications, those fields are omitted. - // - // FIXME(mpratt): Using Version from the init task SyscallTable - // disregards the different version a task may have (e.g., in a uts - // namespace). - ver := init.Leader().SyscallTable().Version - fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) - return nil -} -- cgit v1.2.3 From 07f258497932e53f4651b80a086117ffda843fe3 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 16 Jan 2020 12:33:07 -0800 Subject: Plumb getting/setting xattrs through InodeOperations and 9p gofer interfaces. There was a very bare get/setxattr in the InodeOperations interface. Add context.Context to both, size to getxattr, and flags to setxattr. Note that extended attributes are passed around as strings in this implementation, so size is automatically encoded into the value. Size is added in getxattr so that implementations can return ERANGE if a value is larger than can fit in the user-allocated buffer. This prevents us from unnecessarily passing around an arbitrarily large xattr when the user buffer is actually too small. Don't use the existing xattrwalk and xattrcreate messages and define our own, mainly for the sake of simplicity. Extended attributes will be implemented in future commits. PiperOrigin-RevId: 290121300 --- pkg/p9/client_file.go | 23 +++++ pkg/p9/file.go | 16 ++++ pkg/p9/handlers.go | 29 +++++++ pkg/p9/messages.go | 129 +++++++++++++++++++++++++++++ pkg/p9/messages_test.go | 15 ++++ pkg/p9/p9.go | 4 + pkg/sentry/fs/copy_up.go | 9 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 41 +++++---- pkg/sentry/fs/gofer/context_file.go | 14 ++++ pkg/sentry/fs/gofer/inode.go | 18 +++- pkg/sentry/fs/inode.go | 24 +++--- pkg/sentry/fs/inode_operations.go | 25 ++++-- pkg/sentry/fs/inode_overlay.go | 30 +++---- pkg/sentry/fs/inode_overlay_test.go | 4 +- pkg/sentry/fs/tmpfs/tmpfs.go | 18 ++-- pkg/sentry/syscalls/linux/linux64_amd64.go | 4 +- pkg/sentry/syscalls/linux/linux64_arm64.go | 4 +- pkg/sentry/syscalls/linux/sys_xattr.go | 48 ++++++----- runsc/fsgofer/fsgofer.go | 10 +++ 20 files changed, 374 insertions(+), 93 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index de9357389..04b584383 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -165,6 +165,29 @@ func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { return c.client.sendRecv(&Tsetattr{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattr{}) } +// GetXattr implements File.GetXattr. +func (c *clientFile) GetXattr(name string, size uint64) (string, error) { + if atomic.LoadUint32(&c.closed) != 0 { + return "", syscall.EBADF + } + + rgetxattr := Rgetxattr{} + if err := c.client.sendRecv(&Tgetxattr{FID: c.fid, Name: name, Size: size}, &rgetxattr); err != nil { + return "", err + } + + return rgetxattr.Value, nil +} + +// SetXattr implements File.SetXattr. +func (c *clientFile) SetXattr(name, value string, flags uint32) error { + if atomic.LoadUint32(&c.closed) != 0 { + return syscall.EBADF + } + + return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{}) +} + // Allocate implements File.Allocate. func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error { if atomic.LoadUint32(&c.closed) != 0 { diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 96d1f2a8e..4607cfcdf 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -89,6 +89,22 @@ type File interface { // On the server, SetAttr has a write concurrency guarantee. SetAttr(valid SetAttrMask, attr SetAttr) error + // GetXattr returns extended attributes of this node. + // + // Size indicates the size of the buffer that has been allocated to hold the + // attribute value. If the value is larger than size, implementations may + // return ERANGE to indicate that the buffer is too small, but they are also + // free to ignore the hint entirely (i.e. the value returned may be larger + // than size). All size checking is done independently at the syscall layer. + // + // TODO(b/127675828): Determine concurrency guarantees once implemented. + GetXattr(name string, size uint64) (string, error) + + // SetXattr sets extended attributes on this node. + // + // TODO(b/127675828): Determine concurrency guarantees once implemented. + SetXattr(name, value string, flags uint32) error + // Allocate allows the caller to directly manipulate the allocated disk space // for the file. See fallocate(2) for more details. Allocate(mode AllocateMode, offset, length uint64) error diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index b9582c07f..7d6653a07 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -912,6 +912,35 @@ func (t *Txattrcreate) handle(cs *connState) message { return newErr(syscall.ENOSYS) } +// handle implements handler.handle. +func (t *Tgetxattr) handle(cs *connState) message { + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) + } + defer ref.DecRef() + + val, err := ref.file.GetXattr(t.Name, t.Size) + if err != nil { + return newErr(err) + } + return &Rgetxattr{Value: val} +} + +// handle implements handler.handle. +func (t *Tsetxattr) handle(cs *connState) message { + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) + } + defer ref.DecRef() + + if err := ref.file.SetXattr(t.Name, t.Value, t.Flags); err != nil { + return newErr(err) + } + return &Rsetxattr{} +} + // handle implements handler.handle. func (t *Treaddir) handle(cs *connState) message { ref, ok := cs.LookupFID(t.Directory) diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index ffdd7e8c6..ceb723d86 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1611,6 +1611,131 @@ func (r *Rxattrcreate) String() string { return fmt.Sprintf("Rxattrcreate{}") } +// Tgetxattr is a getxattr request. +type Tgetxattr struct { + // FID refers to the file for which to get xattrs. + FID FID + + // Name is the xattr to get. + Name string + + // Size is the buffer size for the xattr to get. + Size uint64 +} + +// Decode implements encoder.Decode. +func (t *Tgetxattr) Decode(b *buffer) { + t.FID = b.ReadFID() + t.Name = b.ReadString() + t.Size = b.Read64() +} + +// Encode implements encoder.Encode. +func (t *Tgetxattr) Encode(b *buffer) { + b.WriteFID(t.FID) + b.WriteString(t.Name) + b.Write64(t.Size) +} + +// Type implements message.Type. +func (*Tgetxattr) Type() MsgType { + return MsgTgetxattr +} + +// String implements fmt.Stringer. +func (t *Tgetxattr) String() string { + return fmt.Sprintf("Tgetxattr{FID: %d, Name: %s, Size: %d}", t.FID, t.Name, t.Size) +} + +// Rgetxattr is a getxattr response. +type Rgetxattr struct { + // Value is the extended attribute value. + Value string +} + +// Decode implements encoder.Decode. +func (r *Rgetxattr) Decode(b *buffer) { + r.Value = b.ReadString() +} + +// Encode implements encoder.Encode. +func (r *Rgetxattr) Encode(b *buffer) { + b.WriteString(r.Value) +} + +// Type implements message.Type. +func (*Rgetxattr) Type() MsgType { + return MsgRgetxattr +} + +// String implements fmt.Stringer. +func (r *Rgetxattr) String() string { + return fmt.Sprintf("Rgetxattr{Value: %s}", r.Value) +} + +// Tsetxattr sets extended attributes. +type Tsetxattr struct { + // FID refers to the file on which to set xattrs. + FID FID + + // Name is the attribute name. + Name string + + // Value is the attribute value. + Value string + + // Linux setxattr(2) flags. + Flags uint32 +} + +// Decode implements encoder.Decode. +func (t *Tsetxattr) Decode(b *buffer) { + t.FID = b.ReadFID() + t.Name = b.ReadString() + t.Value = b.ReadString() + t.Flags = b.Read32() +} + +// Encode implements encoder.Encode. +func (t *Tsetxattr) Encode(b *buffer) { + b.WriteFID(t.FID) + b.WriteString(t.Name) + b.WriteString(t.Value) + b.Write32(t.Flags) +} + +// Type implements message.Type. +func (*Tsetxattr) Type() MsgType { + return MsgTsetxattr +} + +// String implements fmt.Stringer. +func (t *Tsetxattr) String() string { + return fmt.Sprintf("Tsetxattr{FID: %d, Name: %s, Value: %s, Flags: %d}", t.FID, t.Name, t.Value, t.Flags) +} + +// Rsetxattr is a setxattr response. +type Rsetxattr struct { +} + +// Decode implements encoder.Decode. +func (r *Rsetxattr) Decode(b *buffer) { +} + +// Encode implements encoder.Encode. +func (r *Rsetxattr) Encode(b *buffer) { +} + +// Type implements message.Type. +func (*Rsetxattr) Type() MsgType { + return MsgRsetxattr +} + +// String implements fmt.Stringer. +func (r *Rsetxattr) String() string { + return fmt.Sprintf("Rsetxattr{}") +} + // Treaddir is a readdir request. type Treaddir struct { // Directory is the directory FID to read. @@ -2363,6 +2488,10 @@ func init() { msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} }) msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} }) msgRegistry.register(MsgRxattrcreate, func() message { return &Rxattrcreate{} }) + msgRegistry.register(MsgTgetxattr, func() message { return &Tgetxattr{} }) + msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} }) + msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} }) + msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} }) msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} }) msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} }) msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} }) diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index 6ba6a1654..825c939da 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -194,6 +194,21 @@ func TestEncodeDecode(t *testing.T) { Flags: 3, }, &Rxattrcreate{}, + &Tgetxattr{ + FID: 1, + Name: "abc", + Size: 2, + }, + &Rgetxattr{ + Value: "xyz", + }, + &Tsetxattr{ + FID: 1, + Name: "abc", + Value: "xyz", + Flags: 2, + }, + &Rsetxattr{}, &Treaddir{ Directory: 1, Offset: 2, diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index d3090535a..5ab00d625 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -339,6 +339,10 @@ const ( MsgRxattrwalk = 31 MsgTxattrcreate = 32 MsgRxattrcreate = 33 + MsgTgetxattr = 34 + MsgRgetxattr = 35 + MsgTsetxattr = 36 + MsgRsetxattr = 37 MsgTreaddir = 40 MsgRreaddir = 41 MsgTfsync = 50 diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 734177e90..e03e3e417 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -18,6 +18,7 @@ import ( "fmt" "io" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -395,12 +396,12 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in // Size and permissions are set on upper when the file content is copied // and when the file is created respectively. func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error { - // Extract attributes fro the lower filesystem. + // Extract attributes from the lower filesystem. lowerAttr, err := lower.UnstableAttr(ctx) if err != nil { return err } - lowerXattr, err := lower.Listxattr() + lowerXattr, err := lower.ListXattr(ctx) if err != nil && err != syserror.EOPNOTSUPP { return err } @@ -421,11 +422,11 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error if isXattrOverlay(name) { continue } - value, err := lower.Getxattr(name) + value, err := lower.GetXattr(ctx, name, linux.XATTR_SIZE_MAX) if err != nil { return err } - if err := upper.InodeOperations.Setxattr(upper, name, value); err != nil { + if err := upper.InodeOperations.SetXattr(ctx, upper, name, value, 0 /* flags */); err != nil { return err } } diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 8a633b1ba..8991207b4 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -475,7 +475,7 @@ func readdirEntries(ctx context.Context, o *overlayEntry) (*SortedDentryMap, err // Skip this name if it is a negative entry in the // upper or there exists a whiteout for it. if o.upper != nil { - if overlayHasWhiteout(o.upper, name) { + if overlayHasWhiteout(ctx, o.upper, name) { continue } } diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index adf5ec69c..df7b74855 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -15,6 +15,7 @@ package fsutil import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -202,7 +203,7 @@ func (i *InodeSimpleAttributes) NotifyModificationAndStatusChange(ctx context.Co } // InodeSimpleExtendedAttributes implements -// fs.InodeOperations.{Get,Set,List}xattr. +// fs.InodeOperations.{Get,Set,List}Xattr. // // +stateify savable type InodeSimpleExtendedAttributes struct { @@ -211,8 +212,8 @@ type InodeSimpleExtendedAttributes struct { xattrs map[string]string } -// Getxattr implements fs.InodeOperations.Getxattr. -func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) (string, error) { +// GetXattr implements fs.InodeOperations.GetXattr. +func (i *InodeSimpleExtendedAttributes) GetXattr(_ context.Context, _ *fs.Inode, name string, _ uint64) (string, error) { i.mu.RLock() value, ok := i.xattrs[name] i.mu.RUnlock() @@ -222,19 +223,31 @@ func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) (stri return value, nil } -// Setxattr implements fs.InodeOperations.Setxattr. -func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name, value string) error { +// SetXattr implements fs.InodeOperations.SetXattr. +func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode, name, value string, flags uint32) error { i.mu.Lock() + defer i.mu.Unlock() if i.xattrs == nil { + if flags&linux.XATTR_REPLACE != 0 { + return syserror.ENODATA + } i.xattrs = make(map[string]string) } + + _, ok := i.xattrs[name] + if ok && flags&linux.XATTR_CREATE != 0 { + return syserror.EEXIST + } + if !ok && flags&linux.XATTR_REPLACE != 0 { + return syserror.ENODATA + } + i.xattrs[name] = value - i.mu.Unlock() return nil } -// Listxattr implements fs.InodeOperations.Listxattr. -func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struct{}, error) { +// ListXattr implements fs.InodeOperations.ListXattr. +func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { i.mu.RLock() names := make(map[string]struct{}, len(i.xattrs)) for name := range i.xattrs { @@ -436,18 +449,18 @@ func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { // extended attributes. type InodeNoExtendedAttributes struct{} -// Getxattr implements fs.InodeOperations.Getxattr. -func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) (string, error) { +// GetXattr implements fs.InodeOperations.GetXattr. +func (InodeNoExtendedAttributes) GetXattr(context.Context, *fs.Inode, string, uint64) (string, error) { return "", syserror.EOPNOTSUPP } -// Setxattr implements fs.InodeOperations.Setxattr. -func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, string) error { +// SetXattr implements fs.InodeOperations.SetXattr. +func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, string, uint32) error { return syserror.EOPNOTSUPP } -// Listxattr implements fs.InodeOperations.Listxattr. -func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) { +// ListXattr implements fs.InodeOperations.ListXattr. +func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { return nil, syserror.EOPNOTSUPP } diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 44b72582a..2125dafef 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -59,6 +59,20 @@ func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9 return err } +func (c *contextFile) getXattr(ctx context.Context, name string, size uint64) (string, error) { + ctx.UninterruptibleSleepStart(false) + val, err := c.file.GetXattr(name, size) + ctx.UninterruptibleSleepFinish(false) + return val, err +} + +func (c *contextFile) setXattr(ctx context.Context, name, value string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := c.file.SetXattr(name, value, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := c.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 245fe2ef1..98d1a8a48 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -38,8 +38,7 @@ import ( // // +stateify savable type inodeOperations struct { - fsutil.InodeNotVirtual `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` // fileState implements fs.CachedFileObject. It exists // to break a circular load dependency between inodeOperations @@ -604,6 +603,21 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) } +// GetXattr implements fs.InodeOperations.GetXattr. +func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) { + return i.fileState.file.getXattr(ctx, name, size) +} + +// SetXattr implements fs.InodeOperations.SetXattr. +func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error { + return i.fileState.file.setXattr(ctx, name, value, flags) +} + +// ListXattr implements fs.InodeOperations.ListXattr. +func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { + return nil, syscall.EOPNOTSUPP +} + // Allocate implements fs.InodeOperations.Allocate. func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { // This can only be called for files anyway. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 468043df0..ee9d301ef 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -261,28 +261,28 @@ func (i *Inode) UnstableAttr(ctx context.Context) (UnstableAttr, error) { return i.InodeOperations.UnstableAttr(ctx, i) } -// Getxattr calls i.InodeOperations.Getxattr with i as the Inode. -func (i *Inode) Getxattr(name string) (string, error) { +// GetXattr calls i.InodeOperations.GetXattr with i as the Inode. +func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string, error) { if i.overlay != nil { - return overlayGetxattr(i.overlay, name) + return overlayGetXattr(ctx, i.overlay, name, size) } - return i.InodeOperations.Getxattr(i, name) + return i.InodeOperations.GetXattr(ctx, i, name, size) } -// Setxattr calls i.InodeOperations.Setxattr with i as the Inode. -func (i *Inode) Setxattr(name, value string) error { +// SetXattr calls i.InodeOperations.SetXattr with i as the Inode. +func (i *Inode) SetXattr(ctx context.Context, name, value string, flags uint32) error { if i.overlay != nil { - return overlaySetxattr(i.overlay, name, value) + return overlaySetxattr(ctx, i.overlay, name, value, flags) } - return i.InodeOperations.Setxattr(i, name, value) + return i.InodeOperations.SetXattr(ctx, i, name, value, flags) } -// Listxattr calls i.InodeOperations.Listxattr with i as the Inode. -func (i *Inode) Listxattr() (map[string]struct{}, error) { +// ListXattr calls i.InodeOperations.ListXattr with i as the Inode. +func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) { if i.overlay != nil { - return overlayListxattr(i.overlay) + return overlayListXattr(ctx, i.overlay) } - return i.InodeOperations.Listxattr(i) + return i.InodeOperations.ListXattr(ctx, i) } // CheckPermission will check if the caller may access this file in the diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 5cde9d215..13261cb81 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -170,20 +170,27 @@ type InodeOperations interface { // file system events. UnstableAttr(ctx context.Context, inode *Inode) (UnstableAttr, error) - // Getxattr retrieves the value of extended attribute name. Inodes that - // do not support extended attributes return EOPNOTSUPP. Inodes that - // support extended attributes but don't have a value at name return + // GetXattr retrieves the value of extended attribute specified by name. + // Inodes that do not support extended attributes return EOPNOTSUPP. Inodes + // that support extended attributes but don't have a value at name return // ENODATA. - Getxattr(inode *Inode, name string) (string, error) + // + // If this is called through the getxattr(2) syscall, size indicates the + // size of the buffer that the application has allocated to hold the + // attribute value. If the value is larger than size, implementations may + // return ERANGE to indicate that the buffer is too small, but they are also + // free to ignore the hint entirely (i.e. the value returned may be larger + // than size). All size checking is done independently at the syscall layer. + GetXattr(ctx context.Context, inode *Inode, name string, size uint64) (string, error) - // Setxattr sets the value of extended attribute name. Inodes that - // do not support extended attributes return EOPNOTSUPP. - Setxattr(inode *Inode, name, value string) error + // SetXattr sets the value of extended attribute specified by name. Inodes + // that do not support extended attributes return EOPNOTSUPP. + SetXattr(ctx context.Context, inode *Inode, name, value string, flags uint32) error - // Listxattr returns the set of all extended attributes names that + // ListXattr returns the set of all extended attributes names that // have values. Inodes that do not support extended attributes return // EOPNOTSUPP. - Listxattr(inode *Inode) (map[string]struct{}, error) + ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error) // Check determines whether an Inode can be accessed with the // requested permission mask using the context (which gives access diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 13d11e001..b90da20d0 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -25,13 +25,13 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func overlayHasWhiteout(parent *Inode, name string) bool { - s, err := parent.Getxattr(XattrOverlayWhiteout(name)) +func overlayHasWhiteout(ctx context.Context, parent *Inode, name string) bool { + s, err := parent.GetXattr(ctx, XattrOverlayWhiteout(name), 1) return err == nil && s == "y" } -func overlayCreateWhiteout(parent *Inode, name string) error { - return parent.InodeOperations.Setxattr(parent, XattrOverlayWhiteout(name), "y") +func overlayCreateWhiteout(ctx context.Context, parent *Inode, name string) error { + return parent.InodeOperations.SetXattr(ctx, parent, XattrOverlayWhiteout(name), "y", 0 /* flags */) } func overlayWriteOut(ctx context.Context, o *overlayEntry) error { @@ -89,7 +89,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } // Are we done? - if overlayHasWhiteout(parent.upper, name) { + if overlayHasWhiteout(ctx, parent.upper, name) { if upperInode == nil { parent.copyMu.RUnlock() if negativeUpperChild { @@ -345,7 +345,7 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child * } } if child.Inode.overlay.lowerExists { - if err := overlayCreateWhiteout(o.upper, child.name); err != nil { + if err := overlayCreateWhiteout(ctx, o.upper, child.name); err != nil { return err } } @@ -426,7 +426,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return err } if renamed.Inode.overlay.lowerExists { - if err := overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName); err != nil { + if err := overlayCreateWhiteout(ctx, oldParent.Inode.overlay.upper, oldName); err != nil { return err } } @@ -528,7 +528,7 @@ func overlayUnstableAttr(ctx context.Context, o *overlayEntry) (UnstableAttr, er return attr, err } -func overlayGetxattr(o *overlayEntry, name string) (string, error) { +func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uint64) (string, error) { // Hot path. This is how the overlay checks for whiteout files. // Avoid defers. var ( @@ -544,31 +544,31 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) { o.copyMu.RLock() if o.upper != nil { - s, err = o.upper.Getxattr(name) + s, err = o.upper.GetXattr(ctx, name, size) } else { - s, err = o.lower.Getxattr(name) + s, err = o.lower.GetXattr(ctx, name, size) } o.copyMu.RUnlock() return s, err } // TODO(b/146028302): Support setxattr for overlayfs. -func overlaySetxattr(o *overlayEntry, name, value string) error { +func overlaySetxattr(ctx context.Context, o *overlayEntry, name, value string, flags uint32) error { return syserror.EOPNOTSUPP } -func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { +func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() var names map[string]struct{} var err error if o.upper != nil { - names, err = o.upper.Listxattr() + names, err = o.upper.ListXattr(ctx) } else { - names, err = o.lower.Listxattr() + names, err = o.lower.ListXattr(ctx) } for name := range names { - // Same as overlayGetxattr, we shouldn't forward along + // Same as overlayGetXattr, we shouldn't forward along // overlay attributes. if strings.HasPrefix(XattrOverlayPrefix, name) { delete(names, name) diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 8935aad65..493d98c36 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -382,8 +382,8 @@ type dir struct { ReaddirCalled bool } -// Getxattr implements InodeOperations.Getxattr. -func (d *dir) Getxattr(inode *fs.Inode, name string) (string, error) { +// GetXattr implements InodeOperations.GetXattr. +func (d *dir) GetXattr(_ context.Context, _ *fs.Inode, name string, _ uint64) (string, error) { for _, n := range d.negative { if name == fs.XattrOverlayWhiteout(n) { return "y", nil diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 69089c8a8..0f718e236 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -148,19 +148,19 @@ func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms return d.ramfsDir.CreateFifo(ctx, dir, name, perms) } -// Getxattr implements fs.InodeOperations.Getxattr. -func (d *Dir) Getxattr(i *fs.Inode, name string) (string, error) { - return d.ramfsDir.Getxattr(i, name) +// GetXattr implements fs.InodeOperations.GetXattr. +func (d *Dir) GetXattr(ctx context.Context, i *fs.Inode, name string, size uint64) (string, error) { + return d.ramfsDir.GetXattr(ctx, i, name, size) } -// Setxattr implements fs.InodeOperations.Setxattr. -func (d *Dir) Setxattr(i *fs.Inode, name, value string) error { - return d.ramfsDir.Setxattr(i, name, value) +// SetXattr implements fs.InodeOperations.SetXattr. +func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, flags uint32) error { + return d.ramfsDir.SetXattr(ctx, i, name, value, flags) } -// Listxattr implements fs.InodeOperations.Listxattr. -func (d *Dir) Listxattr(i *fs.Inode) (map[string]struct{}, error) { - return d.ramfsDir.Listxattr(i) +// ListXattr implements fs.InodeOperations.ListXattr. +func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) { + return d.ramfsDir.ListXattr(ctx, i) } // Lookup implements fs.InodeOperations.Lookup. diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 479c5f6ff..6b2920900 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), - 188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil), + 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil), + 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index d3f61f5e8..8c1b20911 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{ 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 5: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil), + 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), 6: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 7: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 8: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil), + 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 97d9a65ea..816352218 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -25,12 +25,12 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// Getxattr implements linux syscall getxattr(2). -func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// GetXattr implements linux syscall getxattr(2). +func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() - size := args[3].SizeT() + size := uint64(args[3].SizeT()) path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) if err != nil { @@ -39,22 +39,28 @@ func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc valueLen := 0 err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { - value, err := getxattr(t, d, dirPath, nameAddr) + // If getxattr(2) is called with size 0, the size of the value will be + // returned successfully even if it is nonzero. In that case, we need to + // retrieve the entire attribute value so we can return the correct size. + requestedSize := size + if size == 0 || size > linux.XATTR_SIZE_MAX { + requestedSize = linux.XATTR_SIZE_MAX + } + + value, err := getXattr(t, d, dirPath, nameAddr, uint64(requestedSize)) if err != nil { return err } valueLen = len(value) - if size == 0 { - return nil - } - if size > linux.XATTR_SIZE_MAX { - size = linux.XATTR_SIZE_MAX - } - if valueLen > int(size) { + if uint64(valueLen) > requestedSize { return syserror.ERANGE } + // Skip copying out the attribute value if size is 0. + if size == 0 { + return nil + } _, err = t.CopyOutBytes(valueAddr, []byte(value)) return err }) @@ -64,8 +70,8 @@ func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return uintptr(valueLen), nil, nil } -// getxattr implements getxattr from the given *fs.Dirent. -func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) { +// getXattr implements getxattr(2) from the given *fs.Dirent. +func getXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr, size uint64) (string, error) { if dirPath && !fs.IsDir(d.Inode.StableAttr) { return "", syserror.ENOTDIR } @@ -83,15 +89,15 @@ func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) return "", syserror.EOPNOTSUPP } - return d.Inode.Getxattr(name) + return d.Inode.GetXattr(t, name, size) } -// Setxattr implements linux syscall setxattr(2). -func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// SetXattr implements linux syscall setxattr(2). +func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() - size := args[3].SizeT() + size := uint64(args[3].SizeT()) flags := args[4].Uint() path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) @@ -104,12 +110,12 @@ func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { - return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags) + return setXattr(t, d, dirPath, nameAddr, valueAddr, uint64(size), flags) }) } -// setxattr implements setxattr from the given *fs.Dirent. -func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error { +// setXattr implements setxattr(2) from the given *fs.Dirent. +func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } @@ -136,7 +142,7 @@ func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us return syserror.EOPNOTSUPP } - return d.Inode.Setxattr(name, value) + return d.Inode.SetXattr(t, name, value, flags) } func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 93606d051..4d84ad999 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -767,6 +767,16 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { return err } +// TODO(b/127675828): support getxattr. +func (l *localFile) GetXattr(name string, size uint64) (string, error) { + return "", syscall.EOPNOTSUPP +} + +// TODO(b/127675828): support setxattr. +func (l *localFile) SetXattr(name, value string, flags uint32) error { + return syscall.EOPNOTSUPP +} + // Allocate implements p9.File. func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error { if !l.isOpen() { -- cgit v1.2.3 From 3dd3275da7b665cf2ca297e4bf566fcc77025af8 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Jan 2020 13:13:22 -0800 Subject: Add more files to /proc/[pid]/* Files not implemented require VFSv2 plumbing into the kernel. Also, cgroup is not implemented yet. Updates #1195 PiperOrigin-RevId: 290129176 --- pkg/sentry/fsimpl/kernfs/filesystem.go | 63 ++---- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 16 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/symlink.go | 21 +- pkg/sentry/fsimpl/proc/BUILD | 3 +- pkg/sentry/fsimpl/proc/mounts.go | 33 --- pkg/sentry/fsimpl/proc/subtasks.go | 126 +++++++++++ pkg/sentry/fsimpl/proc/task.go | 69 ++++-- pkg/sentry/fsimpl/proc/task_files.go | 315 +++++++++++++++++++++++++--- pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/proc/tasks_test.go | 20 +- pkg/sentry/vfs/permissions.go | 24 ++- 12 files changed, 549 insertions(+), 145 deletions(-) delete mode 100644 pkg/sentry/fsimpl/proc/mounts.go create mode 100644 pkg/sentry/fsimpl/proc/subtasks.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 79759e0fc..a4600ad47 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -40,7 +39,7 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP return nil, syserror.ENOTDIR } // Directory searchable? - if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -182,8 +181,8 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // // Preconditions: Filesystem.mu must be locked for at least reading. parentInode // == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. -func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { - if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { +func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return "", err } pc := rp.Component() @@ -206,7 +205,7 @@ func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInod // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. -func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { parentVFSD := vfsd.Parent() if parentVFSD == nil { return syserror.EBUSY @@ -214,36 +213,12 @@ func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { if parentVFSD.IsDisowned() { return syserror.ENOENT } - if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return nil } -// checkRenameLocked checks that a rename operation may be performed on the -// target dentry across the given set of parent directories. The target dentry -// may be nil. -// -// Precondition: isDir(dstInode) == true. -func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error { - srcDir := src.Parent() - if srcDir == nil { - return syserror.EBUSY - } - if srcDir.IsDisowned() { - return syserror.ENOENT - } - if dstDir.IsDisowned() { - return syserror.ENOENT - } - // Check for creation permissions on dst dir. - if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { - return err - } - - return nil -} - // Release implements vfs.FilesystemImpl.Release. func (fs *Filesystem) Release() { } @@ -269,7 +244,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.isDir() { return nil, syserror.ENOTDIR } - if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -302,7 +277,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -339,7 +314,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -367,7 +342,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -401,7 +376,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return nil, err } - if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } return inode.Open(rp, vfsd, opts.Flags) @@ -420,7 +395,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } return inode.Open(rp, vfsd, opts.Flags) @@ -432,7 +407,7 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -450,7 +425,7 @@ afterTrailingSymlink: } if childVFSD == nil { // Already checked for searchability above; now check for writability. - if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -485,7 +460,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } } - if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil { + if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } return childInode.Open(rp, childVFSD, opts.Flags) @@ -545,13 +520,13 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa srcVFSD := &src.vfsd // Can we remove the src dentry? - if err := checkDeleteLocked(rp, srcVFSD); err != nil { + if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { return err } // Can we create the dst dentry? var dstVFSD *vfs.Dentry - pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode) + pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) switch err { case nil: // Ok, continue with rename as replacement. @@ -607,7 +582,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } if !vfsd.Impl().(*Dentry).isDir() { @@ -683,7 +658,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -712,7 +687,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } if vfsd.Impl().(*Dentry).isDir() { diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 6aff3d39a..1700fffd9 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -262,7 +262,7 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { } // CheckPermissions implements Inode.CheckPermissions. -func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { +func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := a.Mode() return vfs.GenericCheckPermissions( creds, @@ -527,12 +527,8 @@ var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { - if perm&^linux.PermissionsMask != 0 { - panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) - } - inode := &StaticDirectory{} - inode.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + inode.Init(creds, ino, perm) dentry := &Dentry{} dentry.Init(inode) @@ -544,6 +540,14 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil return dentry } +// Init initializes StaticDirectory. +func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) +} + // Open implements kernfs.Inode. func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &GenericDirectoryFD{} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index bb12f39a2..85bcdcc57 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -320,7 +320,7 @@ type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the the rules of // fs/namei.c:generic_permission(). - CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error + CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 068063f4e..f19f12854 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -20,7 +20,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) -type staticSymlink struct { +// StaticSymlink provides an Inode implementation for symlinks that point to +// a immutable target. +type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeSymlink @@ -28,18 +30,25 @@ type staticSymlink struct { target string } -var _ Inode = (*staticSymlink)(nil) +var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry { - inode := &staticSymlink{target: target} - inode.Init(creds, ino, linux.ModeSymlink|perm) +func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry { + inode := &StaticSymlink{} + inode.Init(creds, ino, target) d := &Dentry{} d.Init(inode) return d } -func (s *staticSymlink) Readlink(_ context.Context) (string, error) { +// Init initializes the instance. +func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) { + s.target = target + s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777) +} + +// Readlink implements Inode. +func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 6cd18cec8..e92564b5d 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -7,7 +7,7 @@ go_library( name = "proc", srcs = [ "filesystem.go", - "mounts.go", + "subtasks.go", "task.go", "task_files.go", "tasks.go", @@ -29,6 +29,7 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", + "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go deleted file mode 100644 index 8683cf677..000000000 --- a/pkg/sentry/fsimpl/proc/mounts.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import "gvisor.dev/gvisor/pkg/sentry/kernel" - -// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile. - -// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo. -// -// +stateify savable -type mountInfoFile struct { - t *kernel.Task -} - -// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts. -// -// +stateify savable -type mountsFile struct { - t *kernel.Task -} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go new file mode 100644 index 000000000..8892c5a11 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -0,0 +1,126 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// subtasksInode represents the inode for /proc/[pid]/task/ directory. +// +// +stateify savable +type subtasksInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + + task *kernel.Task + pidns *kernel.PIDNamespace + inoGen InoGenerator +} + +var _ kernfs.Inode = (*subtasksInode)(nil) + +func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator) *kernfs.Dentry { + subInode := &subtasksInode{ + task: task, + pidns: pidns, + inoGen: inoGen, + } + // Note: credentials are overridden by taskOwnedInode. + subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + inode := &taskOwnedInode{Inode: subInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + return dentry +} + +// Valid implements kernfs.inodeDynamicLookup. +func (i *subtasksInode) Valid(ctx context.Context) bool { + return true +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + tid, err := strconv.ParseUint(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if subTask == nil { + return nil, syserror.ENOENT + } + if subTask.ThreadGroup() != i.task.ThreadGroup() { + return nil, syserror.ENOENT + } + + subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false) + return subTaskDentry.VFSDentry(), nil +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + tasks := i.task.ThreadGroup().MemberIDs(i.pidns) + if len(tasks) == 0 { + return offset, syserror.ENOENT + } + + tids := make([]int, 0, len(tasks)) + for _, tid := range tasks { + tids = append(tids, int(tid)) + } + + sort.Ints(tids) + for _, tid := range tids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + return offset, nil +} + +// Open implements kernfs.Inode. +func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +// Stat implements kernfs.Inode. +func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { + stat := i.InodeAttrs.Stat(vsfs) + stat.Nlink += uint32(i.task.ThreadGroup().Count()) + return stat +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 5a384817f..621c17cfe 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,6 +15,8 @@ package proc import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -42,27 +44,31 @@ var _ kernfs.Inode = (*taskInode)(nil) func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { contents := map[string]*kernfs.Dentry{ - //"auxv": newAuxvec(t, msrc), - //"cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - //"comm": newComm(t, msrc), - //"environ": newExecArgInode(t, msrc, environExecArg), + "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), + "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": newComm(task, inoGen.NextIno(), 0444), + "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), //"exe": newExe(t, msrc), //"fd": newFdDir(t, msrc), //"fdinfo": newFdInfoDir(t, msrc), - //"gid_map": newGIDMap(t, msrc), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), + "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - //"ns": newNamespaceDir(t, msrc), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{t: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{t: task, pidns: pidns}), - //"uid_map": newUIDMap(t, msrc), + "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ + "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), + "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), + "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), + }), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - //contents["task"] = p.newSubtasks(t, msrc) + contents["task"] = newSubtasks(task, pidns, inoGen) } //if len(p.cgroupControllers) > 0 { // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) @@ -127,6 +133,23 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode return d } +func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &kernfs.StaticDirectory{} + + // Note: credentials are overridden by taskOwnedInode. + dir.Init(task.Credentials(), ino, perm) + + inode := &taskOwnedInode{Inode: dir, owner: task} + d := &kernfs.Dentry{} + d.Init(inode) + + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := dir.OrderedChildren.Populate(d, children) + dir.IncLinks(links) + + return d +} + // Stat implements kernfs.Inode. func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { stat := i.Inode.Stat(fs) @@ -137,7 +160,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { } // CheckPermissions implements kernfs.Inode. -func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { +func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) return vfs.GenericCheckPermissions( @@ -188,3 +211,19 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData { } return &ioData{ioUsage: t} } + +func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { + // Namespace symlinks should contain the namespace name and the inode number + // for the namespace instance, so for example user:[123456]. We currently fake + // the inode number by sticking the symlink inode in its place. + target := fmt.Sprintf("%s:[%d]", ns, ino) + + inode := &kernfs.StaticSymlink{} + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, target) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 93f0e1aa8..7bc352ae9 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -17,15 +17,20 @@ package proc import ( "bytes" "fmt" + "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) // mm gets the kernel task's MemoryManager. No additional reference is taken on @@ -41,6 +46,256 @@ func getMM(task *kernel.Task) *mm.MemoryManager { return tmm } +// getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the +// MemoryManager's users count is incremented, and must be decremented by the +// caller when it is no longer in use. +func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { + if task.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + var m *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + m = t.MemoryManager() + }) + if m == nil || !m.IncUsers() { + return nil, io.EOF + } + return m, nil +} + +type bufferWriter struct { + buf *bytes.Buffer +} + +// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns +// the number of bytes written. It may return a partial write without an +// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not +// return a full write with an error (i.e. srcs.NumBytes(), err) where err +// != nil). +func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + written := srcs.NumBytes() + for !srcs.IsEmpty() { + w.buf.Write(srcs.Head().ToSlice()) + srcs = srcs.Tail() + } + return written, nil +} + +// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. +// +// +stateify savable +type auxvData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*auxvData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { + m, err := getMMIncRef(d.task) + if err != nil { + return err + } + defer m.DecUsers(ctx) + + // Space for buffer with AT_NULL (0) terminator at the end. + auxv := m.Auxv() + buf.Grow((len(auxv) + 1) * 16) + for _, e := range auxv { + var tmp [8]byte + usermem.ByteOrder.PutUint64(tmp[:], e.Key) + buf.Write(tmp[:]) + + usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value)) + buf.Write(tmp[:]) + } + return nil +} + +// execArgType enumerates the types of exec arguments that are exposed through +// proc. +type execArgType int + +const ( + cmdlineDataArg execArgType = iota + environDataArg +) + +// cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline. +// +// +stateify savable +type cmdlineData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + + // arg is the type of exec argument this file contains. + arg execArgType +} + +var _ dynamicInode = (*cmdlineData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { + m, err := getMMIncRef(d.task) + if err != nil { + return err + } + defer m.DecUsers(ctx) + + // Figure out the bounds of the exec arg we are trying to read. + var ar usermem.AddrRange + switch d.arg { + case cmdlineDataArg: + ar = usermem.AddrRange{ + Start: m.ArgvStart(), + End: m.ArgvEnd(), + } + case environDataArg: + ar = usermem.AddrRange{ + Start: m.EnvvStart(), + End: m.EnvvEnd(), + } + default: + panic(fmt.Sprintf("unknown exec arg type %v", d.arg)) + } + if ar.Start == 0 || ar.End == 0 { + // Don't attempt to read before the start/end are set up. + return io.EOF + } + + // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true + // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading + // cmdline and environment"). + writer := &bufferWriter{buf: buf} + if n, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { + // Nothing to copy or something went wrong. + return err + } + + // On Linux, if the NULL byte at the end of the argument vector has been + // overwritten, it continues reading the environment vector as part of + // the argument vector. + if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 { + if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { + // If we found a NULL character somewhere else in argv, truncate the + // return up to the NULL terminator (including it). + buf.Truncate(end) + return nil + } + + // There is no NULL terminator in the string, return into envp. + arEnvv := usermem.AddrRange{ + Start: m.EnvvStart(), + End: m.EnvvEnd(), + } + + // Upstream limits the returned amount to one page of slop. + // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 + // we'll return one page total between argv and envp because of the + // above page restrictions. + if buf.Len() >= usermem.PageSize { + // Returned at least one page already, nothing else to add. + return nil + } + remaining := usermem.PageSize - buf.Len() + if int(arEnvv.Length()) > remaining { + end, ok := arEnvv.Start.AddLength(uint64(remaining)) + if !ok { + return syserror.EFAULT + } + arEnvv.End = end + } + if _, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { + return err + } + + // Linux will return envp up to and including the first NULL character, + // so find it. + if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 { + buf.Truncate(end) + } + } + + return nil +} + +// +stateify savable +type commInode struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { + inode := &commInode{task: task} + inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + // This file can always be read or written by members of the same thread + // group. See fs/proc/base.c:proc_tid_comm_permission. + // + // N.B. This check is currently a no-op as we don't yet support writing and + // this file is world-readable anyways. + t := kernel.TaskFromContext(ctx) + if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { + return nil + } + + return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) +} + +// commData implements vfs.DynamicBytesSource for /proc/[pid]/comm. +// +// +stateify savable +type commData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*commData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(d.task.Name()) + buf.WriteString("\n") + return nil +} + +// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}. +// +// +stateify savable +type idMapData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + gids bool +} + +var _ dynamicInode = (*idMapData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var entries []auth.IDMapEntry + if d.gids { + entries = d.task.UserNamespace().GIDMap() + } else { + entries = d.task.UserNamespace().UIDMap() + } + for _, e := range entries { + fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) + } + return nil +} + // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable @@ -83,7 +338,7 @@ func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { type taskStatData struct { kernfs.DynamicBytesFile - t *kernel.Task + task *kernel.Task // If tgstats is true, accumulate fault stats (not implemented) and CPU // time across all tasks in t's thread group. @@ -98,40 +353,40 @@ var _ dynamicInode = (*taskStatData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) - fmt.Fprintf(buf, "(%s) ", s.t.Name()) - fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) + fmt.Fprintf(buf, "(%s) ", s.task.Name()) + fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { + if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "%d ", ppid) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) fmt.Fprintf(buf, "0 " /* flags */) fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) var cputime usage.CPUStats if s.tgstats { - cputime = s.t.ThreadGroup().CPUStats() + cputime = s.task.ThreadGroup().CPUStats() } else { - cputime = s.t.CPUStats() + cputime = s.task.CPUStats() } fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - cputime = s.t.ThreadGroup().JoinedChildCPUStats() + cputime = s.task.ThreadGroup().JoinedChildCPUStats() fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) + fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) // itrealvalue. Since kernel 2.6.17, this field is no longer // maintained, and is hard coded as 0. fmt.Fprintf(buf, "0 ") // Start time is relative to boot time, expressed in clock ticks. - fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { + s.task.WithMuLocked(func(t *kernel.Task) { if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() @@ -140,14 +395,14 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) // rsslim. - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) terminationSignal := linux.Signal(0) - if s.t == s.t.ThreadGroup().Leader() { - terminationSignal = s.t.ThreadGroup().TerminationSignal() + if s.task == s.task.ThreadGroup().Leader() { + terminationSignal = s.task.ThreadGroup().TerminationSignal() } fmt.Fprintf(buf, "%d ", terminationSignal) fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) @@ -164,7 +419,7 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { type statmData struct { kernfs.DynamicBytesFile - t *kernel.Task + task *kernel.Task } var _ dynamicInode = (*statmData)(nil) @@ -172,7 +427,7 @@ var _ dynamicInode = (*statmData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { + s.task.WithMuLocked(func(t *kernel.Task) { if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() @@ -189,7 +444,7 @@ func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { type statusData struct { kernfs.DynamicBytesFile - t *kernel.Task + task *kernel.Task pidns *kernel.PIDNamespace } @@ -197,23 +452,23 @@ var _ dynamicInode = (*statusData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) - fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) - fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) - fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { + if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "PPid:\t%d\n", ppid) tpid := kernel.ThreadID(0) - if tracer := s.t.Tracer(); tracer != nil { + if tracer := s.task.Tracer(); tracer != nil { tpid = s.pidns.IDOfTask(tracer) } fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) var fds int var vss, rss, data uint64 - s.t.WithMuLocked(func(t *kernel.Task) { + s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.Size() } @@ -227,13 +482,13 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) - fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) - creds := s.t.Credentials() + fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) + creds := s.task.Credentials() fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) - fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) // We unconditionally report a single NUMA node. See // pkg/sentry/syscalls/linux/sys_mempolicy.go. fmt.Fprintf(buf, "Mems_allowed:\t1\n") diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 72315d25c..a97b1753a 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -66,7 +66,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), "sys": newSysDir(root, inoGen), "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), 0777, "self/mounts"), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 76eafe593..6b58c16b9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -85,12 +85,20 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ - "io": {Type: linux.DT_REG}, - "maps": {Type: linux.DT_REG}, - "smaps": {Type: linux.DT_REG}, - "stat": {Type: linux.DT_REG}, - "statm": {Type: linux.DT_REG}, - "status": {Type: linux.DT_REG}, + "auxv": {Type: linux.DT_REG}, + "cmdline": {Type: linux.DT_REG}, + "comm": {Type: linux.DT_REG}, + "environ": {Type: linux.DT_REG}, + "gid_map": {Type: linux.DT_REG}, + "io": {Type: linux.DT_REG}, + "maps": {Type: linux.DT_REG}, + "ns": {Type: linux.DT_DIR}, + "smaps": {Type: linux.DT_REG}, + "stat": {Type: linux.DT_REG}, + "statm": {Type: linux.DT_REG}, + "status": {Type: linux.DT_REG}, + "task": {Type: linux.DT_DIR}, + "uid_map": {Type: linux.DT_REG}, } return checkFiles(gots, wants) } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f1edb0680..d279d05ca 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -30,6 +30,26 @@ const ( MayExec = 1 ) +// OnlyRead returns true if access _only_ allows read. +func (a AccessTypes) OnlyRead() bool { + return a == MayRead +} + +// MayRead returns true if access allows read. +func (a AccessTypes) MayRead() bool { + return a&MayRead != 0 +} + +// MayWrite returns true if access allows write. +func (a AccessTypes) MayWrite() bool { + return a&MayWrite != 0 +} + +// MayExec returns true if access allows exec. +func (a AccessTypes) MayExec() bool { + return a&MayExec != 0 +} + // GenericCheckPermissions checks that creds has the given access rights on a // file with the given permissions, UID, and GID, subject to the rules of // fs/namei.c:generic_permission(). isDir is true if the file is a directory. @@ -53,7 +73,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. - if (isDir && (ats&MayWrite == 0)) || ats == MayRead { + if (isDir && !ats.MayWrite()) || ats.OnlyRead() { if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { return nil } @@ -61,7 +81,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write // access to non-directory files, and execute access to non-directory files // for which at least one execute bit is set. - if isDir || (ats&MayExec == 0) || (mode&0111 != 0) { + if isDir || !ats.MayExec() || (mode&0111 != 0) { if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { return nil } -- cgit v1.2.3 From 70d7c52bd7583393d39177a7935cca57372d67f1 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 16 Jan 2020 13:58:25 -0800 Subject: Implement tmpfs.SetStat with a size argument. This is similar to 'Truncate' in vfs1. Updates https://github.com/google/gvisor/issues/1197 PiperOrigin-RevId: 290139140 --- pkg/sentry/fsimpl/tmpfs/regular_file.go | 35 ++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 121 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 54 ++++++++++-- 3 files changed, 205 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index f200e767d..5fa70cc6d 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -63,6 +63,41 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod return &file.inode } +// truncate grows or shrinks the file to the given size. It returns true if the +// file size was updated. +func (rf *regularFile) truncate(size uint64) (bool, error) { + rf.mu.Lock() + defer rf.mu.Unlock() + + if size == rf.size { + // Nothing to do. + return false, nil + } + + if size > rf.size { + // Growing the file. + if rf.seals&linux.F_SEAL_GROW != 0 { + // Seal does not allow growth. + return false, syserror.EPERM + } + rf.size = size + return true, nil + } + + // Shrinking the file + if rf.seals&linux.F_SEAL_SHRINK != 0 { + // Seal does not allow shrink. + return false, syserror.EPERM + } + + // TODO(gvisor.dev/issues/1197): Invalidate mappings once we have + // mappings. + + rf.data.Truncate(size, rf.memFile) + rf.size = size + return true, nil +} + type regularFileFD struct { fileDescription diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 7b0a962f0..034a29fdb 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -313,3 +313,124 @@ func TestPRead(t *testing.T) { } } } + +func TestTruncate(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Fill the file with some data. + data := bytes.Repeat([]byte("gVisor is awsome"), 100) + written, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + + // Size should be same as written. + sizeStatOpts := vfs.StatOptions{Mask: linux.STATX_SIZE} + stat, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := int64(stat.Size), written; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + + // Truncate down. + newSize := uint64(10) + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + // Size should be updated. + statAfterTruncateDown, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := statAfterTruncateDown.Size, newSize; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + // We should only read newSize worth of data. + buf := make([]byte, 1000) + if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } else if uint64(n) != newSize { + t.Errorf("fd.PRead got size %d, want %d", n, newSize) + } + // Mtime and Ctime should be bumped. + if got := statAfterTruncateDown.Mtime.ToNsec(); got <= stat.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want > %v", got, stat.Mtime) + } + if got := statAfterTruncateDown.Ctime.ToNsec(); got <= stat.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime) + } + + // Truncate up. + newSize = 100 + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + // Size should be updated. + statAfterTruncateUp, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := statAfterTruncateUp.Size, newSize; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + // We should read newSize worth of data. + buf = make([]byte, 1000) + if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } else if uint64(n) != newSize { + t.Errorf("fd.PRead got size %d, want %d", n, newSize) + } + // Bytes should be null after 10, since we previously truncated to 10. + for i := uint64(10); i < newSize; i++ { + if buf[i] != 0 { + t.Errorf("fd.PRead got byte %d=%x, want 0", i, buf[i]) + break + } + } + // Mtime and Ctime should be bumped. + if got := statAfterTruncateUp.Mtime.ToNsec(); got <= statAfterTruncateDown.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want > %v", got, statAfterTruncateDown.Mtime) + } + if got := statAfterTruncateUp.Ctime.ToNsec(); got <= statAfterTruncateDown.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime) + } + + // Truncate to the current size. + newSize = statAfterTruncateUp.Size + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + statAfterTruncateNoop, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + // Mtime and Ctime should not be bumped, since operation is a noop. + if got := statAfterTruncateNoop.Mtime.ToNsec(); got != statAfterTruncateUp.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want %v", got, statAfterTruncateUp.Mtime) + } + if got := statAfterTruncateNoop.Ctime.ToNsec(); got != statAfterTruncateUp.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want %v", got, statAfterTruncateUp.Ctime) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index d6960ee47..1d4889c89 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // FilesystemType implements vfs.FilesystemType. @@ -121,6 +122,9 @@ func (d *dentry) DecRef() { // inode represents a filesystem object. type inode struct { + // clock is a realtime clock used to set timestamps in file operations. + clock time.Clock + // refs is a reference count. refs is accessed using atomic memory // operations. // @@ -151,13 +155,14 @@ type inode struct { const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { - now := fs.clock.Now().Nanoseconds() + i.clock = fs.clock i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. + now := i.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now @@ -270,30 +275,69 @@ func (i *inode) statTo(stat *linux.Statx) { } func (i *inode) setStat(stat linux.Statx) error { - // TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking - // the file. if stat.Mask == 0 { return nil } i.mu.Lock() + var ( + needsMtimeBump bool + needsCtimeBump bool + ) mask := stat.Mask if mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&i.mode, uint32(stat.Mode)) + needsCtimeBump = true } if mask&linux.STATX_UID != 0 { atomic.StoreUint32(&i.uid, stat.UID) + needsCtimeBump = true } if mask&linux.STATX_GID != 0 { atomic.StoreUint32(&i.gid, stat.GID) + needsCtimeBump = true + } + if mask&linux.STATX_SIZE != 0 { + switch impl := i.impl.(type) { + case *regularFile: + updated, err := impl.truncate(stat.Size) + if err != nil { + return err + } + if updated { + needsMtimeBump = true + needsCtimeBump = true + } + case *directory: + return syserror.EISDIR + case *symlink: + return syserror.EINVAL + case *namedPipe: + // Nothing. + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } } if mask&linux.STATX_ATIME != 0 { atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + needsCtimeBump = true + } + if mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + needsCtimeBump = true + // Ignore the mtime bump, since we just set it ourselves. + needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + // Ignore the ctime bump, since we just set it ourselves. + needsCtimeBump = false } - if mask&linux.STATX_MTIME != 0 { - atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + now := i.clock.Now().Nanoseconds() + if needsMtimeBump { + atomic.StoreInt64(&i.mtime, now) + } + if needsCtimeBump { + atomic.StoreInt64(&i.ctime, now) } i.mu.Unlock() return nil -- cgit v1.2.3 From ab48112e41427579ecf585f6280be1e2d58acf06 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Jan 2020 13:58:50 -0800 Subject: Add IfChange/ThenChange reminders in fs/proc There is a lot of code duplication for VFSv2 and this serves as remind to keep the copies in sync. Updates #1195 PiperOrigin-RevId: 290139234 --- pkg/sentry/fs/proc/cgroup.go | 4 ++++ pkg/sentry/fs/proc/cpuinfo.go | 4 ++++ pkg/sentry/fs/proc/exec_args.go | 4 ++++ pkg/sentry/fs/proc/fds.go | 4 ++++ pkg/sentry/fs/proc/filesystems.go | 4 ++++ pkg/sentry/fs/proc/fs.go | 4 ++++ pkg/sentry/fs/proc/inode.go | 4 ++++ pkg/sentry/fs/proc/loadavg.go | 4 ++++ pkg/sentry/fs/proc/meminfo.go | 4 ++++ pkg/sentry/fs/proc/mounts.go | 4 ++++ pkg/sentry/fs/proc/net.go | 4 ++++ pkg/sentry/fs/proc/proc.go | 4 ++++ pkg/sentry/fs/proc/stat.go | 4 ++++ pkg/sentry/fs/proc/sys.go | 4 ++++ pkg/sentry/fs/proc/sys_net.go | 4 ++++ pkg/sentry/fs/proc/task.go | 4 ++++ pkg/sentry/fs/proc/uid_gid_map.go | 4 ++++ pkg/sentry/fs/proc/uptime.go | 4 ++++ pkg/sentry/fs/proc/version.go | 4 ++++ 19 files changed, 76 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/cgroup.go b/pkg/sentry/fs/proc/cgroup.go index 05e31c55d..c4abe319d 100644 --- a/pkg/sentry/fs/proc/cgroup.go +++ b/pkg/sentry/fs/proc/cgroup.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" ) +// LINT.IfChange + func newCGroupInode(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string]string) *fs.Inode { // From man 7 cgroups: "For each cgroup hierarchy of which the process // is a member, there is one entry containing three colon-separated @@ -39,3 +41,5 @@ func newCGroupInode(ctx context.Context, msrc *fs.MountSource, cgroupControllers return newStaticProcInode(ctx, msrc, []byte(data)) } + +// LINT.ThenChange(../../fsimpl/proc/tasks_files.go) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index 6330337eb..df0c4e3a7 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ) +// LINT.IfChange + func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { k := kernel.KernelFromContext(ctx) features := k.FeatureSet() @@ -35,3 +37,5 @@ func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { } return newStaticProcInode(ctx, msrc, buf.Bytes()) } + +// LINT.ThenChange(../../fsimpl/proc/tasks_files.go) diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 1d3a2d426..9aaeb780b 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -29,6 +29,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // execArgType enumerates the types of exec arguments that are exposed through // proc. type execArgType int @@ -201,3 +203,5 @@ func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen } return int64(n), err } + +// LINT.ThenChange(../../fsimpl/proc/task.go) diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index bee421d76..2fa3cfa7d 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // walkDescriptors finds the descriptor (file-flag pair) for the fd identified // by p, and calls the toInodeOperations callback with that descriptor. This is a helper // method for implementing fs.InodeOperations.Lookup. @@ -277,3 +279,5 @@ func (fdid *fdInfoDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs. } return fs.NewFile(ctx, dirent, flags, fops), nil } + +// LINT.ThenChange(../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index e9250c51c..7b3b974ab 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" ) +// LINT.IfChange + // filesystemsData backs /proc/filesystems. // // +stateify savable @@ -59,3 +61,5 @@ func (*filesystemsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle // Return the SeqData and advance the generation counter. return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*filesystemsData)(nil)}}, 1 } + +// LINT.ThenChange(../../fsimpl/proc/filesystem.go) diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index f14833805..761d24462 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" ) +// LINT.IfChange + // filesystem is a procfs. // // +stateify savable @@ -79,3 +81,5 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou // never want them cached. return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags), cgroups) } + +// LINT.ThenChange(../../fsimpl/proc/filesystem.go) diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 0c04f81fa..723f6b661 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -26,6 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usermem" ) +// LINT.IfChange + // taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr // method to return either the task or root as the owner, depending on the // task's dumpability. @@ -131,3 +133,5 @@ func newProcInode(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo } return fs.NewInode(ctx, iops, msrc, sattr) } + +// LINT.ThenChange(../../fsimpl/proc/tasks.go) diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 8602b7426..d7d2afcb7 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" ) +// LINT.IfChange + // loadavgData backs /proc/loadavg. // // +stateify savable @@ -53,3 +55,5 @@ func (d *loadavgData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) }, }, 0 } + +// LINT.ThenChange(../../fsimpl/proc/tasks_files.go) diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 495f3e3ba..313c6a32b 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usermem" ) +// LINT.IfChange + // meminfoData backs /proc/meminfo. // // +stateify savable @@ -83,3 +85,5 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*meminfoData)(nil)}}, 0 } + +// LINT.ThenChange(../../fsimpl/proc/tasks_files.go) diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index e33c4a460..5aedae799 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ) +// LINT.IfChange + // forEachMountSource runs f for the process root mount and each mount that is a // descendant of the root. func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { @@ -195,3 +197,5 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 } + +// LINT.ThenChange(../../fsimpl/proc/tasks_files.go) diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 402919924..3f17e98ea 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -38,6 +38,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/header" ) +// LINT.IfChange + // newNet creates a new proc net entry. func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode @@ -831,3 +833,5 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se } return data, 0 } + +// LINT.ThenChange(../../fsimpl/proc/tasks_net.go) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 56e92721e..da9341e4e 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -31,6 +31,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // proc is a root proc node. // // +stateify savable @@ -249,3 +251,5 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent } return offset, nil } + +// LINT.ThenChange(../../fsimpl/proc/tasks.go) diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index b641effbb..bc5b2bc7b 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -24,6 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ) +// LINT.IfChange + // statData backs /proc/stat. // // +stateify savable @@ -140,3 +142,5 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] }, }, 0 } + +// LINT.ThenChange(../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index cd37776c8..1062bd852 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -31,6 +31,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. // // +stateify savable @@ -160,3 +162,5 @@ func (hf *hostnameFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequ } var _ fs.FileOperations = (*hostnameFile)(nil) + +// LINT.ThenChange(../../fsimpl/proc/tasks_sys.go) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index a37e1fa06..b9e8ef35f 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -30,6 +30,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + type tcpMemDir int const ( @@ -364,3 +366,5 @@ func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } + +// LINT.ThenChange(../../fsimpl/proc/tasks_sys.go) diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 9bf4b4527..7358d6ef9 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -37,6 +37,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's // users count is incremented, and must be decremented by the caller when it is // no longer in use. @@ -800,3 +802,5 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc n, err := dst.CopyOut(ctx, buf[offset:]) return int64(n), err } + +// LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index eea37d15c..3eacc9265 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -30,6 +30,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // idMapInodeOperations implements fs.InodeOperations for // /proc/[pid]/{uid,gid}_map. // @@ -177,3 +179,5 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u // count, even if fewer bytes were used. return int64(srclen), nil } + +// LINT.ThenChange(../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 4e903917a..adfe58adb 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // uptime is a file containing the system uptime. // // +stateify savable @@ -85,3 +87,5 @@ func (f *uptimeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc n, err := dst.CopyOut(ctx, s[offset:]) return int64(n), err } + +// LINT.ThenChange(../../fsimpl/proc/tasks_files.go) diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index a6d2c3cd3..27fd5b1cb 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ) +// LINT.IfChange + // versionData backs /proc/version. // // +stateify savable @@ -76,3 +78,5 @@ func (v *versionData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) }, }, 0 } + +// LINT.ThenChange(../../fsimpl/proc/task_files.go) -- cgit v1.2.3 From 7a45ae7e67438697296fc12345202e3c76304096 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 16 Jan 2020 18:13:27 -0800 Subject: Implement setxattr for overlays. PiperOrigin-RevId: 290186303 --- pkg/sentry/fs/inode.go | 4 ++-- pkg/sentry/fs/inode_overlay.go | 13 ++++++++++--- pkg/sentry/syscalls/linux/sys_xattr.go | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index ee9d301ef..e4cf5a570 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -270,9 +270,9 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string, } // SetXattr calls i.InodeOperations.SetXattr with i as the Inode. -func (i *Inode) SetXattr(ctx context.Context, name, value string, flags uint32) error { +func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error { if i.overlay != nil { - return overlaySetxattr(ctx, i.overlay, name, value, flags) + return overlaySetxattr(ctx, i.overlay, d, name, value, flags) } return i.InodeOperations.SetXattr(ctx, i, name, value, flags) } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index b90da20d0..c477de837 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -552,9 +552,16 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin return s, err } -// TODO(b/146028302): Support setxattr for overlayfs. -func overlaySetxattr(ctx context.Context, o *overlayEntry, name, value string, flags uint32) error { - return syserror.EOPNOTSUPP +func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { + // Don't allow changes to overlay xattrs through a setxattr syscall. + if strings.HasPrefix(XattrOverlayPrefix, name) { + return syserror.EPERM + } + + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.SetXattr(ctx, d, name, value, flags) } func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) { diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 816352218..23d20da6f 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -142,7 +142,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us return syserror.EOPNOTSUPP } - return d.Inode.SetXattr(t, name, value, flags) + return d.Inode.SetXattr(t, d, name, value, flags) } func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { -- cgit v1.2.3 From 19b4653147c8ec1cd2cf6e2a8f9bfc7865a5f850 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 16 Jan 2020 20:19:40 -0800 Subject: Remove unused rpcinet. PiperOrigin-RevId: 290198756 --- pkg/sentry/fs/proc/BUILD | 2 - pkg/sentry/fs/proc/proc.go | 9 +- pkg/sentry/fs/proc/rpcinet_proc.go | 217 ------ pkg/sentry/fs/proc/sys.go | 9 +- pkg/sentry/socket/rpcinet/BUILD | 69 -- pkg/sentry/socket/rpcinet/conn/BUILD | 18 - pkg/sentry/socket/rpcinet/conn/conn.go | 187 ----- pkg/sentry/socket/rpcinet/device.go | 19 - pkg/sentry/socket/rpcinet/notifier/BUILD | 17 - pkg/sentry/socket/rpcinet/notifier/notifier.go | 231 ------- pkg/sentry/socket/rpcinet/rpcinet.go | 16 - pkg/sentry/socket/rpcinet/socket.go | 909 ------------------------- pkg/sentry/socket/rpcinet/stack.go | 177 ----- pkg/sentry/socket/rpcinet/stack_unsafe.go | 193 ------ pkg/sentry/socket/rpcinet/syscall_rpc.proto | 352 ---------- 15 files changed, 2 insertions(+), 2423 deletions(-) delete mode 100644 pkg/sentry/fs/proc/rpcinet_proc.go delete mode 100644 pkg/sentry/socket/rpcinet/BUILD delete mode 100644 pkg/sentry/socket/rpcinet/conn/BUILD delete mode 100644 pkg/sentry/socket/rpcinet/conn/conn.go delete mode 100644 pkg/sentry/socket/rpcinet/device.go delete mode 100644 pkg/sentry/socket/rpcinet/notifier/BUILD delete mode 100644 pkg/sentry/socket/rpcinet/notifier/notifier.go delete mode 100644 pkg/sentry/socket/rpcinet/rpcinet.go delete mode 100644 pkg/sentry/socket/rpcinet/socket.go delete mode 100644 pkg/sentry/socket/rpcinet/stack.go delete mode 100644 pkg/sentry/socket/rpcinet/stack_unsafe.go delete mode 100644 pkg/sentry/socket/rpcinet/syscall_rpc.proto (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 94d46ab1b..cb37c6c6b 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -18,7 +18,6 @@ go_library( "mounts.go", "net.go", "proc.go", - "rpcinet_proc.go", "stat.go", "sys.go", "sys_net.go", @@ -46,7 +45,6 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket", - "//pkg/sentry/socket/rpcinet", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index da9341e4e..29867dc3a 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet" "gvisor.dev/gvisor/pkg/syserror" ) @@ -87,15 +86,9 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string } // Add more contents that need proc to be initialized. + p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc)) p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) - // If we're using rpcinet we will let it manage /proc/net. - if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) - } else { - p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc)) - } - return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil } diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go deleted file mode 100644 index 01ac97530..000000000 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" -) - -// rpcInetInode implements fs.InodeOperations. -type rpcInetInode struct { - fsutil.SimpleFileInode - - // filepath is the full path of this rpcInetInode. - filepath string - - k *kernel.Kernel -} - -func newRPCInetInode(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode { - f := &rpcInetInode{ - SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(mode), linux.PROC_SUPER_MAGIC), - filepath: filepath, - k: kernel.KernelFromContext(ctx), - } - return newProcInode(ctx, f, msrc, fs.SpecialFile, nil) -} - -// GetFile implements fs.InodeOperations.GetFile. -func (i *rpcInetInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - flags.Pread = true - flags.Pwrite = true - fops := &rpcInetFile{ - inode: i, - } - return fs.NewFile(ctx, dirent, flags, fops), nil -} - -// rpcInetFile implements fs.FileOperations as RPCs. -type rpcInetFile struct { - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - - inode *rpcInetInode -} - -// Read implements fs.FileOperations.Read. -// -// This method can panic if an rpcInetInode was created without an rpcinet -// stack. -func (f *rpcInetFile) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack) - if !ok { - panic("Network stack is not a rpcinet.") - } - - contents, se := s.RPCReadFile(f.inode.filepath) - if se != nil || offset >= int64(len(contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, contents[offset:]) - return int64(n), err -} - -// Write implements fs.FileOperations.Write. -// -// This method can panic if an rpcInetInode was created without an rpcInet -// stack. -func (f *rpcInetFile) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack) - if !ok { - panic("Network stack is not a rpcinet.") - } - - if src.NumBytes() == 0 { - return 0, nil - } - - b := make([]byte, src.NumBytes(), src.NumBytes()) - n, err := src.CopyIn(ctx, b) - if err != nil { - return int64(n), err - } - - written, se := s.RPCWriteFile(f.inode.filepath, b) - return int64(written), se.ToError() -} - -// newRPCInetProcNet will build an inode for /proc/net. -func newRPCInetProcNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - contents := map[string]*fs.Inode{ - "arp": newRPCInetInode(ctx, msrc, "/proc/net/arp", 0444), - "dev": newRPCInetInode(ctx, msrc, "/proc/net/dev", 0444), - "if_inet6": newRPCInetInode(ctx, msrc, "/proc/net/if_inet6", 0444), - "ipv6_route": newRPCInetInode(ctx, msrc, "/proc/net/ipv6_route", 0444), - "netlink": newRPCInetInode(ctx, msrc, "/proc/net/netlink", 0444), - "netstat": newRPCInetInode(ctx, msrc, "/proc/net/netstat", 0444), - "packet": newRPCInetInode(ctx, msrc, "/proc/net/packet", 0444), - "protocols": newRPCInetInode(ctx, msrc, "/proc/net/protocols", 0444), - "psched": newRPCInetInode(ctx, msrc, "/proc/net/psched", 0444), - "ptype": newRPCInetInode(ctx, msrc, "/proc/net/ptype", 0444), - "route": newRPCInetInode(ctx, msrc, "/proc/net/route", 0444), - "tcp": newRPCInetInode(ctx, msrc, "/proc/net/tcp", 0444), - "tcp6": newRPCInetInode(ctx, msrc, "/proc/net/tcp6", 0444), - "udp": newRPCInetInode(ctx, msrc, "/proc/net/udp", 0444), - "udp6": newRPCInetInode(ctx, msrc, "/proc/net/udp6", 0444), - } - - d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) -} - -// newRPCInetProcSysNet will build an inode for /proc/sys/net. -func newRPCInetProcSysNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - contents := map[string]*fs.Inode{ - "ipv4": newRPCInetSysNetIPv4Dir(ctx, msrc), - "core": newRPCInetSysNetCore(ctx, msrc), - } - - d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) -} - -// newRPCInetSysNetCore builds the /proc/sys/net/core directory. -func newRPCInetSysNetCore(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - contents := map[string]*fs.Inode{ - "default_qdisc": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444), - "message_burst": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_burst", 0444), - "message_cost": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_cost", 0444), - "optmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444), - "rmem_default": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444), - "rmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444), - "somaxconn": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444), - "wmem_default": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444), - "wmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_max", 0444), - } - - d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) -} - -// newRPCInetSysNetIPv4Dir builds the /proc/sys/net/ipv4 directory. -func newRPCInetSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - contents := map[string]*fs.Inode{ - "ip_local_port_range": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444), - "ip_local_reserved_ports": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444), - "ipfrag_time": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444), - "ip_nonlocal_bind": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444), - "ip_no_pmtu_disc": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444), - "tcp_allowed_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444), - "tcp_available_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444), - "tcp_base_mss": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444), - "tcp_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644), - "tcp_dsack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644), - "tcp_early_retrans": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644), - "tcp_fack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644), - "tcp_fastopen": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644), - "tcp_fastopen_key": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444), - "tcp_fin_timeout": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644), - "tcp_invalid_ratelimit": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444), - "tcp_keepalive_intvl": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644), - "tcp_keepalive_probes": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644), - "tcp_keepalive_time": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644), - "tcp_mem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444), - "tcp_mtu_probing": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644), - "tcp_no_metrics_save": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444), - "tcp_probe_interval": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444), - "tcp_probe_threshold": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444), - "tcp_retries1": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644), - "tcp_retries2": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644), - "tcp_rfc1337": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444), - "tcp_rmem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444), - "tcp_sack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644), - "tcp_slow_start_after_idle": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644), - "tcp_synack_retries": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_synack_retries", 0644), - "tcp_syn_retries": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644), - "tcp_timestamps": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644), - "tcp_wmem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444), - } - - d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) -} diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 1062bd852..2bdcf5f70 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -106,16 +105,10 @@ func (p *proc) newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { children := map[string]*fs.Inode{ "kernel": p.newKernelDir(ctx, msrc), + "net": p.newSysNetDir(ctx, msrc), "vm": p.newVMDir(ctx, msrc), } - // If we're using rpcinet we will let it manage /proc/sys/net. - if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - children["net"] = newRPCInetProcSysNet(ctx, msrc) - } else { - children["net"] = p.newSysNetDir(ctx, msrc) - } - d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD deleted file mode 100644 index 4668b87d1..000000000 --- a/pkg/sentry/socket/rpcinet/BUILD +++ /dev/null @@ -1,69 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") - -package(licenses = ["notice"]) - -go_library( - name = "rpcinet", - srcs = [ - "device.go", - "rpcinet.go", - "socket.go", - "stack.go", - "stack_unsafe.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet", - visibility = ["//pkg/sentry:internal"], - deps = [ - ":syscall_rpc_go_proto", - "//pkg/abi/linux", - "//pkg/binary", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/device", - "//pkg/sentry/fs", - "//pkg/sentry/fs/fsutil", - "//pkg/sentry/inet", - "//pkg/sentry/kernel", - "//pkg/sentry/kernel/time", - "//pkg/sentry/socket", - "//pkg/sentry/socket/hostinet", - "//pkg/sentry/socket/rpcinet/conn", - "//pkg/sentry/socket/rpcinet/notifier", - "//pkg/sentry/unimpl", - "//pkg/sentry/usermem", - "//pkg/syserr", - "//pkg/syserror", - "//pkg/tcpip", - "//pkg/tcpip/buffer", - "//pkg/tcpip/stack", - "//pkg/unet", - "//pkg/waiter", - ], -) - -proto_library( - name = "syscall_rpc_proto", - srcs = ["syscall_rpc.proto"], - visibility = [ - "//visibility:public", - ], -) - -cc_proto_library( - name = "syscall_rpc_cc_proto", - visibility = [ - "//visibility:public", - ], - deps = [":syscall_rpc_proto"], -) - -go_proto_library( - name = "syscall_rpc_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto", - proto = ":syscall_rpc_proto", - visibility = [ - "//visibility:public", - ], -) diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD deleted file mode 100644 index b2677c659..000000000 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "conn", - srcs = ["conn.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/binary", - "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", - "//pkg/sync", - "//pkg/syserr", - "//pkg/unet", - "@com_github_golang_protobuf//proto:go_default_library", - ], -) diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go deleted file mode 100644 index 02f39c767..000000000 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package conn is an RPC connection to a syscall RPC server. -package conn - -import ( - "fmt" - "sync/atomic" - "syscall" - - "github.com/golang/protobuf/proto" - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/unet" - - pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" -) - -type request struct { - response []byte - ready chan struct{} - ignoreResult bool -} - -// RPCConnection represents a single RPC connection to a syscall gofer. -type RPCConnection struct { - // reqID is the ID of the last request and must be accessed atomically. - reqID uint64 - - sendMu sync.Mutex - socket *unet.Socket - - reqMu sync.Mutex - requests map[uint64]request -} - -// NewRPCConnection initializes a RPC connection to a socket gofer. -func NewRPCConnection(s *unet.Socket) *RPCConnection { - conn := &RPCConnection{socket: s, requests: map[uint64]request{}} - go func() { // S/R-FIXME(b/77962828) - var nums [16]byte - for { - for n := 0; n < len(nums); { - nn, err := conn.socket.Read(nums[n:]) - if err != nil { - panic(fmt.Sprint("error reading length from socket rpc gofer: ", err)) - } - n += nn - } - - b := make([]byte, binary.LittleEndian.Uint64(nums[:8])) - id := binary.LittleEndian.Uint64(nums[8:]) - - for n := 0; n < len(b); { - nn, err := conn.socket.Read(b[n:]) - if err != nil { - panic(fmt.Sprint("error reading request from socket rpc gofer: ", err)) - } - n += nn - } - - conn.reqMu.Lock() - r := conn.requests[id] - if r.ignoreResult { - delete(conn.requests, id) - } else { - r.response = b - conn.requests[id] = r - } - conn.reqMu.Unlock() - close(r.ready) - } - }() - return conn -} - -// NewRequest makes a request to the RPC gofer and returns the request ID and a -// channel which will be closed once the request completes. -func (c *RPCConnection) NewRequest(req pb.SyscallRequest, ignoreResult bool) (uint64, chan struct{}) { - b, err := proto.Marshal(&req) - if err != nil { - panic(fmt.Sprint("invalid proto: ", err)) - } - - id := atomic.AddUint64(&c.reqID, 1) - ch := make(chan struct{}) - - c.reqMu.Lock() - c.requests[id] = request{ready: ch, ignoreResult: ignoreResult} - c.reqMu.Unlock() - - c.sendMu.Lock() - defer c.sendMu.Unlock() - - var nums [16]byte - binary.LittleEndian.PutUint64(nums[:8], uint64(len(b))) - binary.LittleEndian.PutUint64(nums[8:], id) - for n := 0; n < len(nums); { - nn, err := c.socket.Write(nums[n:]) - if err != nil { - panic(fmt.Sprint("error writing length and ID to socket gofer: ", err)) - } - n += nn - } - - for n := 0; n < len(b); { - nn, err := c.socket.Write(b[n:]) - if err != nil { - panic(fmt.Sprint("error writing request to socket gofer: ", err)) - } - n += nn - } - - return id, ch -} - -// RPCReadFile will execute the ReadFile helper RPC method which avoids the -// common pattern of open(2), read(2), close(2) by doing all three operations -// as a single RPC. It will read the entire file or return EFBIG if the file -// was too large. -func (c *RPCConnection) RPCReadFile(path string) ([]byte, *syserr.Error) { - req := &pb.SyscallRequest_ReadFile{&pb.ReadFileRequest{ - Path: path, - }} - - id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-ch - - res := c.Request(id).Result.(*pb.SyscallResponse_ReadFile).ReadFile.Result - if e, ok := res.(*pb.ReadFileResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.ReadFileResponse_Data).Data, nil -} - -// RPCWriteFile will execute the WriteFile helper RPC method which avoids the -// common pattern of open(2), write(2), write(2), close(2) by doing all -// operations as a single RPC. -func (c *RPCConnection) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) { - req := &pb.SyscallRequest_WriteFile{&pb.WriteFileRequest{ - Path: path, - Content: data, - }} - - id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-ch - - res := c.Request(id).Result.(*pb.SyscallResponse_WriteFile).WriteFile - if e := res.ErrorNumber; e != 0 { - return int64(res.Written), syserr.FromHost(syscall.Errno(e)) - } - - return int64(res.Written), nil -} - -// Request retrieves the request corresponding to the given request ID. -// -// The channel returned by NewRequest must have been closed before Request can -// be called. This will happen automatically, do not manually close the -// channel. -func (c *RPCConnection) Request(id uint64) pb.SyscallResponse { - c.reqMu.Lock() - r := c.requests[id] - delete(c.requests, id) - c.reqMu.Unlock() - - var resp pb.SyscallResponse - if err := proto.Unmarshal(r.response, &resp); err != nil { - panic(fmt.Sprint("invalid proto: ", err)) - } - - return resp -} diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go deleted file mode 100644 index 8cfd5f6e5..000000000 --- a/pkg/sentry/socket/rpcinet/device.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package rpcinet - -import "gvisor.dev/gvisor/pkg/sentry/device" - -var socketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD deleted file mode 100644 index a5954f22b..000000000 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "notifier", - srcs = ["notifier.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier", - visibility = ["//:sandbox"], - deps = [ - "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", - "//pkg/sentry/socket/rpcinet/conn", - "//pkg/sync", - "//pkg/waiter", - "@org_golang_x_sys//unix:go_default_library", - ], -) diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go deleted file mode 100644 index 82b75d6dd..000000000 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package notifier implements an FD notifier implementation over RPC. -package notifier - -import ( - "fmt" - "syscall" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" - pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/waiter" -) - -type fdInfo struct { - queue *waiter.Queue - waiting bool -} - -// Notifier holds all the state necessary to issue notifications when IO events -// occur in the observed FDs. -type Notifier struct { - // rpcConn is the connection that is used for sending RPCs. - rpcConn *conn.RPCConnection - - // epFD is the epoll file descriptor used to register for io - // notifications. - epFD uint32 - - // mu protects fdMap. - mu sync.Mutex - - // fdMap maps file descriptors to their notification queues and waiting - // status. - fdMap map[uint32]*fdInfo -} - -// NewRPCNotifier creates a new notifier object. -func NewRPCNotifier(cn *conn.RPCConnection) (*Notifier, error) { - id, c := cn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCreate1{&pb.EpollCreate1Request{}}}, false /* ignoreResult */) - <-c - - res := cn.Request(id).Result.(*pb.SyscallResponse_EpollCreate1).EpollCreate1.Result - if e, ok := res.(*pb.EpollCreate1Response_ErrorNumber); ok { - return nil, syscall.Errno(e.ErrorNumber) - } - - w := &Notifier{ - rpcConn: cn, - epFD: res.(*pb.EpollCreate1Response_Fd).Fd, - fdMap: make(map[uint32]*fdInfo), - } - - go w.waitAndNotify() // S/R-FIXME(b/77962828) - - return w, nil -} - -// waitFD waits on mask for fd. The fdMap mutex must be hold. -func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error { - if !fi.waiting && mask == 0 { - return nil - } - - e := pb.EpollEvent{ - Events: mask.ToLinux() | unix.EPOLLET, - Fd: fd, - } - - switch { - case !fi.waiting && mask != 0: - id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_ADD, Fd: fd, Event: &e}}}, false /* ignoreResult */) - <-c - - e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber - if e != 0 { - return syscall.Errno(e) - } - - fi.waiting = true - case fi.waiting && mask == 0: - id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_DEL, Fd: fd}}}, false /* ignoreResult */) - <-c - n.rpcConn.Request(id) - - fi.waiting = false - case fi.waiting && mask != 0: - id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_MOD, Fd: fd, Event: &e}}}, false /* ignoreResult */) - <-c - - e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber - if e != 0 { - return syscall.Errno(e) - } - } - - return nil -} - -// addFD adds an FD to the list of FDs observed by n. -func (n *Notifier) addFD(fd uint32, queue *waiter.Queue) { - n.mu.Lock() - defer n.mu.Unlock() - - // Panic if we're already notifying on this FD. - if _, ok := n.fdMap[fd]; ok { - panic(fmt.Sprintf("File descriptor %d added twice", fd)) - } - - // We have nothing to wait for at the moment. Just add it to the map. - n.fdMap[fd] = &fdInfo{queue: queue} -} - -// updateFD updates the set of events the FD needs to be notified on. -func (n *Notifier) updateFD(fd uint32) error { - n.mu.Lock() - defer n.mu.Unlock() - - if fi, ok := n.fdMap[fd]; ok { - return n.waitFD(fd, fi, fi.queue.Events()) - } - - return nil -} - -// RemoveFD removes an FD from the list of FDs observed by n. -func (n *Notifier) removeFD(fd uint32) { - n.mu.Lock() - defer n.mu.Unlock() - - // Remove from map, then from epoll object. - n.waitFD(fd, n.fdMap[fd], 0) - delete(n.fdMap, fd) -} - -// hasFD returns true if the FD is in the list of observed FDs. -func (n *Notifier) hasFD(fd uint32) bool { - n.mu.Lock() - defer n.mu.Unlock() - - _, ok := n.fdMap[fd] - return ok -} - -// waitAndNotify loops waiting for io event notifications from the epoll -// object. Once notifications arrive, they are dispatched to the -// registered queue. -func (n *Notifier) waitAndNotify() error { - for { - id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollWait{&pb.EpollWaitRequest{Fd: n.epFD, NumEvents: 100, Msec: -1}}}, false /* ignoreResult */) - <-c - - res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollWait).EpollWait.Result - if e, ok := res.(*pb.EpollWaitResponse_ErrorNumber); ok { - err := syscall.Errno(e.ErrorNumber) - // NOTE(magi): I don't think epoll_wait can return EAGAIN but I'm being - // conseratively careful here since exiting the notification thread - // would be really bad. - if err == syscall.EINTR || err == syscall.EAGAIN { - continue - } - return err - } - - n.mu.Lock() - for _, e := range res.(*pb.EpollWaitResponse_Events).Events.Events { - if fi, ok := n.fdMap[e.Fd]; ok { - fi.queue.Notify(waiter.EventMaskFromLinux(e.Events)) - } - } - n.mu.Unlock() - } -} - -// AddFD adds an FD to the list of observed FDs. -func (n *Notifier) AddFD(fd uint32, queue *waiter.Queue) error { - n.addFD(fd, queue) - return nil -} - -// UpdateFD updates the set of events the FD needs to be notified on. -func (n *Notifier) UpdateFD(fd uint32) error { - return n.updateFD(fd) -} - -// RemoveFD removes an FD from the list of observed FDs. -func (n *Notifier) RemoveFD(fd uint32) { - n.removeFD(fd) -} - -// HasFD returns true if the FD is in the list of observed FDs. -// -// This should only be used by tests to assert that FDs are correctly -// registered. -func (n *Notifier) HasFD(fd uint32) bool { - return n.hasFD(fd) -} - -// NonBlockingPoll polls the given fd in non-blocking fashion. It is used just -// to query the FD's current state; this method will block on the RPC response -// although the syscall is non-blocking. -func (n *Notifier) NonBlockingPoll(fd uint32, mask waiter.EventMask) waiter.EventMask { - for { - id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Poll{&pb.PollRequest{Fd: fd, Events: mask.ToLinux()}}}, false /* ignoreResult */) - <-c - - res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_Poll).Poll.Result - if e, ok := res.(*pb.PollResponse_ErrorNumber); ok { - if syscall.Errno(e.ErrorNumber) == syscall.EINTR { - continue - } - return mask - } - - return waiter.EventMaskFromLinux(res.(*pb.PollResponse_Events).Events) - } -} diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go deleted file mode 100644 index 5d4fd4dac..000000000 --- a/pkg/sentry/socket/rpcinet/rpcinet.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package rpcinet implements sockets using an RPC for each syscall. -package rpcinet diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go deleted file mode 100644 index ddb76d9d4..000000000 --- a/pkg/sentry/socket/rpcinet/socket.go +++ /dev/null @@ -1,909 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package rpcinet - -import ( - "sync/atomic" - "syscall" - "time" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier" - pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" - "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" - "gvisor.dev/gvisor/pkg/waiter" -) - -// socketOperations implements fs.FileOperations and socket.Socket for a socket -// implemented using a host socket. -type socketOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - socket.SendReceiveTimeout - - family int // Read-only. - stype linux.SockType // Read-only. - protocol int // Read-only. - - fd uint32 // must be O_NONBLOCK - wq *waiter.Queue - rpcConn *conn.RPCConnection - notifier *notifier.Notifier - - // shState is the state of the connection with respect to shutdown. Because - // we're mixing non-blocking semantics on the other side we have to adapt for - // some strange differences between blocking and non-blocking sockets. - shState int32 -} - -// Verify that we actually implement socket.Socket. -var _ = socket.Socket(&socketOperations{}) - -// New creates a new RPC socket. -func newSocketFile(ctx context.Context, stack *Stack, family int, skType linux.SockType, protocol int) (*fs.File, *syserr.Error) { - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) - <-c - - res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result - if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - fd := res.(*pb.SocketResponse_Fd).Fd - - var wq waiter.Queue - stack.notifier.AddFD(fd, &wq) - - dirent := socket.NewDirent(ctx, socketDevice) - defer dirent.DecRef() - return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{ - family: family, - stype: skType, - protocol: protocol, - wq: &wq, - fd: fd, - rpcConn: stack.rpcConn, - notifier: stack.notifier, - }), nil -} - -func isBlockingErrno(err error) bool { - return err == syscall.EAGAIN || err == syscall.EWOULDBLOCK -} - -func translateIOSyscallError(err error) error { - if isBlockingErrno(err) { - return syserror.ErrWouldBlock - } - return err -} - -// setShutdownFlags will set the shutdown flag so we can handle blocking reads -// after a read shutdown. -func (s *socketOperations) setShutdownFlags(how int) { - var f tcpip.ShutdownFlags - switch how { - case linux.SHUT_RD: - f = tcpip.ShutdownRead - case linux.SHUT_WR: - f = tcpip.ShutdownWrite - case linux.SHUT_RDWR: - f = tcpip.ShutdownWrite | tcpip.ShutdownRead - } - - // Atomically update the flags. - for { - old := atomic.LoadInt32(&s.shState) - if atomic.CompareAndSwapInt32(&s.shState, old, old|int32(f)) { - break - } - } -} - -func (s *socketOperations) resetShutdownFlags() { - atomic.StoreInt32(&s.shState, 0) -} - -func (s *socketOperations) isShutRdSet() bool { - return atomic.LoadInt32(&s.shState)&int32(tcpip.ShutdownRead) != 0 -} - -func (s *socketOperations) isShutWrSet() bool { - return atomic.LoadInt32(&s.shState)&int32(tcpip.ShutdownWrite) != 0 -} - -// Release implements fs.FileOperations.Release. -func (s *socketOperations) Release() { - s.notifier.RemoveFD(s.fd) - - // We always need to close the FD. - _, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: s.fd}}}, true /* ignoreResult */) -} - -// Readiness implements waiter.Waitable.Readiness. -func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { - return s.notifier.NonBlockingPoll(s.fd, mask) -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - s.wq.EventRegister(e, mask) - s.notifier.UpdateFD(s.fd) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (s *socketOperations) EventUnregister(e *waiter.Entry) { - s.wq.EventUnregister(e) - s.notifier.UpdateFD(s.fd) -} - -func rpcRead(t *kernel.Task, req *pb.SyscallRequest_Read) (*pb.ReadResponse_Data, *syserr.Error) { - s := t.NetworkContext().(*Stack) - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Read).Read.Result - if e, ok := res.(*pb.ReadResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.ReadResponse_Data), nil -} - -// Read implements fs.FileOperations.Read. -func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { - req := &pb.SyscallRequest_Read{&pb.ReadRequest{ - Fd: s.fd, - Length: uint32(dst.NumBytes()), - }} - - res, se := rpcRead(ctx.(*kernel.Task), req) - if se == nil { - n, e := dst.CopyOut(ctx, res.Data) - return int64(n), e - } - - return 0, se.ToError() -} - -func rpcWrite(t *kernel.Task, req *pb.SyscallRequest_Write) (uint32, *syserr.Error) { - s := t.NetworkContext().(*Stack) - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Write).Write.Result - if e, ok := res.(*pb.WriteResponse_ErrorNumber); ok { - return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.WriteResponse_Length).Length, nil -} - -// Write implements fs.FileOperations.Write. -func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - t := ctx.(*kernel.Task) - v := buffer.NewView(int(src.NumBytes())) - - // Copy all the data into the buffer. - if _, err := src.CopyIn(t, v); err != nil { - return 0, err - } - - n, err := rpcWrite(t, &pb.SyscallRequest_Write{&pb.WriteRequest{Fd: s.fd, Data: v}}) - if n > 0 && n < uint32(src.NumBytes()) { - // The FileOperations.Write interface expects us to return ErrWouldBlock in - // the event of a partial write. - return int64(n), syserror.ErrWouldBlock - } - return int64(n), err.ToError() -} - -func rpcConnect(t *kernel.Task, fd uint32, sockaddr []byte) *syserr.Error { - s := t.NetworkContext().(*Stack) - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Connect{&pb.ConnectRequest{Fd: uint32(fd), Address: sockaddr}}}, false /* ignoreResult */) - <-c - - if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Connect).Connect.ErrorNumber; e != 0 { - return syserr.FromHost(syscall.Errno(e)) - } - return nil -} - -// Connect implements socket.Socket.Connect. -func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { - if !blocking { - e := rpcConnect(t, s.fd, sockaddr) - if e == nil { - // Reset the shutdown state on new connects. - s.resetShutdownFlags() - } - return e - } - - // Register for notification when the endpoint becomes writable, then - // initiate the connection. - e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventOut|waiter.EventIn|waiter.EventHUp) - defer s.EventUnregister(&e) - for { - if err := rpcConnect(t, s.fd, sockaddr); err == nil || err != syserr.ErrInProgress && err != syserr.ErrAlreadyInProgress { - if err == nil { - // Reset the shutdown state on new connects. - s.resetShutdownFlags() - } - return err - } - - // It's pending, so we have to wait for a notification, and fetch the - // result once the wait completes. - if err := t.Block(ch); err != nil { - return syserr.FromError(err) - } - } -} - -func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultPayload, *syserr.Error) { - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Accept{&pb.AcceptRequest{Fd: fd, Peer: peer, Flags: syscall.SOCK_NONBLOCK}}}, false /* ignoreResult */) - <-c - - res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Accept).Accept.Result - if e, ok := res.(*pb.AcceptResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - return res.(*pb.AcceptResponse_Payload).Payload, nil -} - -// Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - payload, se := rpcAccept(t, s.fd, peerRequested) - - // Check if we need to block. - if blocking && se == syserr.ErrTryAgain { - // Register for notifications. - e, ch := waiter.NewChannelEntry(nil) - // FIXME(b/119878986): This waiter.EventHUp is a partial - // measure, need to figure out how to translate linux events to - // internal events. - s.EventRegister(&e, waiter.EventIn|waiter.EventHUp) - defer s.EventUnregister(&e) - - // Try to accept the connection again; if it fails, then wait until we - // get a notification. - for { - if payload, se = rpcAccept(t, s.fd, peerRequested); se != syserr.ErrTryAgain { - break - } - - if err := t.Block(ch); err != nil { - return 0, nil, 0, syserr.FromError(err) - } - } - } - - // Handle any error from accept. - if se != nil { - return 0, nil, 0, se - } - - var wq waiter.Queue - s.notifier.AddFD(payload.Fd, &wq) - - dirent := socket.NewDirent(t, socketDevice) - defer dirent.DecRef() - fileFlags := fs.FileFlags{ - Read: true, - Write: true, - NonSeekable: true, - NonBlocking: flags&linux.SOCK_NONBLOCK != 0, - } - file := fs.NewFile(t, dirent, fileFlags, &socketOperations{ - family: s.family, - stype: s.stype, - protocol: s.protocol, - wq: &wq, - fd: payload.Fd, - rpcConn: s.rpcConn, - notifier: s.notifier, - }) - defer file.DecRef() - - fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ - CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - }) - if err != nil { - return 0, nil, 0, syserr.FromError(err) - } - t.Kernel().RecordSocket(file) - - if peerRequested { - return fd, socket.UnmarshalSockAddr(s.family, payload.Address.Address), payload.Address.Length, nil - } - - return fd, nil, 0, nil -} - -// Bind implements socket.Socket.Bind. -func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Bind{&pb.BindRequest{Fd: s.fd, Address: sockaddr}}}, false /* ignoreResult */) - <-c - - if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 { - return syserr.FromHost(syscall.Errno(e)) - } - return nil -} - -// Listen implements socket.Socket.Listen. -func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Listen{&pb.ListenRequest{Fd: s.fd, Backlog: int64(backlog)}}}, false /* ignoreResult */) - <-c - - if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Listen).Listen.ErrorNumber; e != 0 { - return syserr.FromHost(syscall.Errno(e)) - } - return nil -} - -// Shutdown implements socket.Socket.Shutdown. -func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { - // We save the shutdown state because of strange differences on linux - // related to recvs on blocking vs. non-blocking sockets after a SHUT_RD. - // We need to emulate that behavior on the blocking side. - // TODO(b/120096741): There is a possible race that can exist with loopback, - // where data could possibly be lost. - s.setShutdownFlags(how) - - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Shutdown{&pb.ShutdownRequest{Fd: s.fd, How: int64(how)}}}, false /* ignoreResult */) - <-c - - if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Shutdown).Shutdown.ErrorNumber; e != 0 { - return syserr.FromHost(syscall.Errno(e)) - } - - return nil -} - -// GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { - // SO_RCVTIMEO and SO_SNDTIMEO are special because blocking is performed - // within the sentry. - if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO { - if outLen < linux.SizeOfTimeval { - return nil, syserr.ErrInvalidArgument - } - - return linux.NsecToTimeval(s.RecvTimeout()), nil - } - if level == linux.SOL_SOCKET && name == linux.SO_SNDTIMEO { - if outLen < linux.SizeOfTimeval { - return nil, syserr.ErrInvalidArgument - } - - return linux.NsecToTimeval(s.SendTimeout()), nil - } - - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockOpt{&pb.GetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Length: uint32(outLen)}}}, false /* ignoreResult */) - <-c - - res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockOpt).GetSockOpt.Result - if e, ok := res.(*pb.GetSockOptResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.GetSockOptResponse_Opt).Opt, nil -} - -// SetSockOpt implements socket.Socket.SetSockOpt. -func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { - // Because blocking actually happens within the sentry we need to inspect - // this socket option to determine if it's a SO_RCVTIMEO or SO_SNDTIMEO, - // and if so, we will save it and use it as the deadline for recv(2) - // or send(2) related syscalls. - if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO { - if len(opt) < linux.SizeOfTimeval { - return syserr.ErrInvalidArgument - } - - var v linux.Timeval - binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v) - if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { - return syserr.ErrDomain - } - s.SetRecvTimeout(v.ToNsecCapped()) - return nil - } - if level == linux.SOL_SOCKET && name == linux.SO_SNDTIMEO { - if len(opt) < linux.SizeOfTimeval { - return syserr.ErrInvalidArgument - } - - var v linux.Timeval - binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v) - if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { - return syserr.ErrDomain - } - s.SetSendTimeout(v.ToNsecCapped()) - return nil - } - - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_SetSockOpt{&pb.SetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Opt: opt}}}, false /* ignoreResult */) - <-c - - if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_SetSockOpt).SetSockOpt.ErrorNumber; e != 0 { - return syserr.FromHost(syscall.Errno(e)) - } - return nil -} - -// GetPeerName implements socket.Socket.GetPeerName. -func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetPeerName{&pb.GetPeerNameRequest{Fd: s.fd}}}, false /* ignoreResult */) - <-c - - res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetPeerName).GetPeerName.Result - if e, ok := res.(*pb.GetPeerNameResponse_ErrorNumber); ok { - return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - addr := res.(*pb.GetPeerNameResponse_Address).Address - return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil -} - -// GetSockName implements socket.Socket.GetSockName. -func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { - stack := t.NetworkContext().(*Stack) - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockName{&pb.GetSockNameRequest{Fd: s.fd}}}, false /* ignoreResult */) - <-c - - res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockName).GetSockName.Result - if e, ok := res.(*pb.GetSockNameResponse_ErrorNumber); ok { - return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - addr := res.(*pb.GetSockNameResponse_Address).Address - return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil -} - -func rpcIoctl(t *kernel.Task, fd, cmd uint32, arg []byte) ([]byte, error) { - stack := t.NetworkContext().(*Stack) - - id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Ioctl{&pb.IOCtlRequest{Fd: fd, Cmd: cmd, Arg: arg}}}, false /* ignoreResult */) - <-c - - res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Ioctl).Ioctl.Result - if e, ok := res.(*pb.IOCtlResponse_ErrorNumber); ok { - return nil, syscall.Errno(e.ErrorNumber) - } - - return res.(*pb.IOCtlResponse_Value).Value, nil -} - -// ifconfIoctlFromStack populates a struct ifconf for the SIOCGIFCONF ioctl. -func ifconfIoctlFromStack(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { - // If Ptr is NULL, return the necessary buffer size via Len. - // Otherwise, write up to Len bytes starting at Ptr containing ifreq - // structs. - t := ctx.(*kernel.Task) - s := t.NetworkContext().(*Stack) - if s == nil { - return syserr.ErrNoDevice.ToError() - } - - if ifc.Ptr == 0 { - ifc.Len = int32(len(s.Interfaces())) * int32(linux.SizeOfIFReq) - return nil - } - - max := ifc.Len - ifc.Len = 0 - for key, ifaceAddrs := range s.InterfaceAddrs() { - iface := s.Interfaces()[key] - for _, ifaceAddr := range ifaceAddrs { - // Don't write past the end of the buffer. - if ifc.Len+int32(linux.SizeOfIFReq) > max { - break - } - if ifaceAddr.Family != linux.AF_INET { - continue - } - - // Populate ifr.ifr_addr. - ifr := linux.IFReq{} - ifr.SetName(iface.Name) - usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) - usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0) - copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) - - // Copy the ifr to userspace. - dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) - ifc.Len += int32(linux.SizeOfIFReq) - if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return err - } - } - } - return nil -} - -// Ioctl implements fs.FileOperations.Ioctl. -func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - t := ctx.(*kernel.Task) - - cmd := uint32(args[1].Int()) - arg := args[2].Pointer() - - var buf []byte - switch cmd { - // The following ioctls take 4 byte argument parameters. - case syscall.TIOCINQ, - syscall.TIOCOUTQ: - buf = make([]byte, 4) - // The following ioctls have args which are sizeof(struct ifreq). - case syscall.SIOCGIFADDR, - syscall.SIOCGIFBRDADDR, - syscall.SIOCGIFDSTADDR, - syscall.SIOCGIFFLAGS, - syscall.SIOCGIFHWADDR, - syscall.SIOCGIFINDEX, - syscall.SIOCGIFMAP, - syscall.SIOCGIFMETRIC, - syscall.SIOCGIFMTU, - syscall.SIOCGIFNAME, - syscall.SIOCGIFNETMASK, - syscall.SIOCGIFTXQLEN: - buf = make([]byte, linux.SizeOfIFReq) - case syscall.SIOCGIFCONF: - // SIOCGIFCONF has slightly different behavior than the others, in that it - // will need to populate the array of ifreqs. - var ifc linux.IFConf - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - - if err := ifconfIoctlFromStack(ctx, io, &ifc); err != nil { - return 0, err - } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }) - - return 0, err - - case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: - unimpl.EmitUnimplementedEvent(ctx) - - default: - return 0, syserror.ENOTTY - } - - _, err := io.CopyIn(ctx, arg, buf, usermem.IOOpts{ - AddressSpaceActive: true, - }) - - if err != nil { - return 0, err - } - - v, err := rpcIoctl(t, s.fd, cmd, buf) - if err != nil { - return 0, err - } - - if len(v) != len(buf) { - return 0, syserror.EINVAL - } - - _, err = io.CopyOut(ctx, arg, v, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err -} - -func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { - s := t.NetworkContext().(*Stack) - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result - if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.RecvmsgResponse_Payload).Payload, nil -} - -// Because we only support SO_TIMESTAMP we will search control messages for -// that value and set it if so, all other control messages will be ignored. -func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_ResultPayload) socket.ControlMessages { - c := socket.ControlMessages{} - if len(payload.GetCmsgData()) > 0 { - // Parse the control messages looking for SO_TIMESTAMP. - msgs, e := syscall.ParseSocketControlMessage(payload.GetCmsgData()) - if e != nil { - return socket.ControlMessages{} - } - for _, m := range msgs { - if m.Header.Level != linux.SOL_SOCKET || m.Header.Type != linux.SO_TIMESTAMP { - continue - } - - // Let's parse the time stamp and set it. - if len(m.Data) < linux.SizeOfTimeval { - // Give up on locating the SO_TIMESTAMP option. - return socket.ControlMessages{} - } - - var v linux.Timeval - binary.Unmarshal(m.Data[:linux.SizeOfTimeval], usermem.ByteOrder, &v) - c.IP.HasTimestamp = true - c.IP.Timestamp = v.ToNsecCapped() - break - } - } - return c -} - -// RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { - req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ - Fd: s.fd, - Length: uint32(dst.NumBytes()), - Sender: senderRequested, - Trunc: flags&linux.MSG_TRUNC != 0, - Peek: flags&linux.MSG_PEEK != 0, - CmsgLength: uint32(controlDataLen), - }} - - res, err := rpcRecvMsg(t, req) - if err == nil { - var e error - var n int - if len(res.Data) > 0 { - n, e = dst.CopyOut(t, res.Data) - if e == nil && n != len(res.Data) { - panic("CopyOut failed to copy full buffer") - } - } - c := s.extractControlMessages(res) - return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e) - } - if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { - return 0, 0, nil, 0, socket.ControlMessages{}, err - } - - // We'll have to block. Register for notifications and keep trying to - // send all the data. - e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventIn) - defer s.EventUnregister(&e) - - for { - res, err := rpcRecvMsg(t, req) - if err == nil { - var e error - var n int - if len(res.Data) > 0 { - n, e = dst.CopyOut(t, res.Data) - if e == nil && n != len(res.Data) { - panic("CopyOut failed to copy full buffer") - } - } - c := s.extractControlMessages(res) - return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e) - } - if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { - return 0, 0, nil, 0, socket.ControlMessages{}, err - } - - if s.isShutRdSet() { - // Blocking would have caused us to block indefinitely so we return 0, - // this is the same behavior as Linux. - return 0, 0, nil, 0, socket.ControlMessages{}, nil - } - - if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain - } - return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) - } - } -} - -func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) { - s := t.NetworkContext().(*Stack) - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result - if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok { - return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.SendmsgResponse_Length).Length, nil -} - -// SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { - // Whitelist flags. - if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { - return 0, syserr.ErrInvalidArgument - } - - // Reject Unix control messages. - if !controlMessages.Unix.Empty() { - return 0, syserr.ErrInvalidArgument - } - - v := buffer.NewView(int(src.NumBytes())) - - // Copy all the data into the buffer. - if _, err := src.CopyIn(t, v); err != nil { - return 0, syserr.FromError(err) - } - - // TODO(bgeffon): this needs to change to map directly to a SendMsg syscall - // in the RPC. - totalWritten := 0 - n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ - Fd: uint32(s.fd), - Data: v, - Address: to, - More: flags&linux.MSG_MORE != 0, - EndOfRecord: flags&linux.MSG_EOR != 0, - }}) - - if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { - return int(n), err - } - - if n > 0 { - totalWritten += int(n) - v.TrimFront(int(n)) - } - - // We'll have to block. Register for notification and keep trying to - // send all the data. - e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventOut) - defer s.EventUnregister(&e) - - for { - n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ - Fd: uint32(s.fd), - Data: v, - Address: to, - More: flags&linux.MSG_MORE != 0, - EndOfRecord: flags&linux.MSG_EOR != 0, - }}) - - if n > 0 { - totalWritten += int(n) - v.TrimFront(int(n)) - - if err == nil && totalWritten < int(src.NumBytes()) { - continue - } - } - - if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { - // We eat the error in this situation. - return int(totalWritten), nil - } - - if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - return int(totalWritten), syserr.ErrTryAgain - } - return int(totalWritten), syserr.FromError(err) - } - } -} - -// State implements socket.Socket.State. -func (s *socketOperations) State() uint32 { - // TODO(b/127845868): Define a new rpc to query the socket state. - return 0 -} - -// Type implements socket.Socket.Type. -func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { - return s.family, s.stype, s.protocol -} - -type socketProvider struct { - family int -} - -// Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { - // Check that we are using the RPC network stack. - stack := t.NetworkContext() - if stack == nil { - return nil, nil - } - - s, ok := stack.(*Stack) - if !ok { - return nil, nil - } - - // Only accept TCP and UDP. - // - // Try to restrict the flags we will accept to minimize backwards - // incompatibility with netstack. - stype := stypeflags & linux.SOCK_TYPE_MASK - switch stype { - case syscall.SOCK_STREAM: - switch protocol { - case 0, syscall.IPPROTO_TCP: - // ok - default: - return nil, nil - } - case syscall.SOCK_DGRAM: - switch protocol { - case 0, syscall.IPPROTO_UDP: - // ok - default: - return nil, nil - } - default: - return nil, nil - } - - return newSocketFile(t, s, p.family, stype, protocol) -} - -// Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { - // Not supported by AF_INET/AF_INET6. - return nil, nil, nil -} - -func init() { - for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { - socket.RegisterProvider(family, &socketProvider{family}) - } -} diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go deleted file mode 100644 index f7878a760..000000000 --- a/pkg/sentry/socket/rpcinet/stack.go +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package rpcinet - -import ( - "fmt" - "syscall" - - "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" - "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier" - "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/tcpip/stack" - "gvisor.dev/gvisor/pkg/unet" -) - -// Stack implements inet.Stack for RPC backed sockets. -type Stack struct { - interfaces map[int32]inet.Interface - interfaceAddrs map[int32][]inet.InterfaceAddr - routes []inet.Route - rpcConn *conn.RPCConnection - notifier *notifier.Notifier -} - -// NewStack returns a Stack containing the current state of the host network -// stack. -func NewStack(fd int32) (*Stack, error) { - sock, err := unet.NewSocket(int(fd)) - if err != nil { - return nil, err - } - - stack := &Stack{ - interfaces: make(map[int32]inet.Interface), - interfaceAddrs: make(map[int32][]inet.InterfaceAddr), - rpcConn: conn.NewRPCConnection(sock), - } - - var e error - stack.notifier, e = notifier.NewRPCNotifier(stack.rpcConn) - if e != nil { - return nil, e - } - - links, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETLINK) - if err != nil { - return nil, fmt.Errorf("RTM_GETLINK failed: %v", err) - } - - addrs, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETADDR) - if err != nil { - return nil, fmt.Errorf("RTM_GETADDR failed: %v", err) - } - - e = hostinet.ExtractHostInterfaces(links, addrs, stack.interfaces, stack.interfaceAddrs) - if e != nil { - return nil, e - } - - routes, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETROUTE) - if err != nil { - return nil, fmt.Errorf("RTM_GETROUTE failed: %v", err) - } - - stack.routes, e = hostinet.ExtractHostRoutes(routes) - if e != nil { - return nil, e - } - - return stack, nil -} - -// RPCReadFile will execute the ReadFile helper RPC method which avoids the -// common pattern of open(2), read(2), close(2) by doing all three operations -// as a single RPC. It will read the entire file or return EFBIG if the file -// was too large. -func (s *Stack) RPCReadFile(path string) ([]byte, *syserr.Error) { - return s.rpcConn.RPCReadFile(path) -} - -// RPCWriteFile will execute the WriteFile helper RPC method which avoids the -// common pattern of open(2), write(2), write(2), close(2) by doing all -// operations as a single RPC. -func (s *Stack) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) { - return s.rpcConn.RPCWriteFile(path, data) -} - -// Interfaces implements inet.Stack.Interfaces. -func (s *Stack) Interfaces() map[int32]inet.Interface { - interfaces := make(map[int32]inet.Interface) - for k, v := range s.interfaces { - interfaces[k] = v - } - return interfaces -} - -// InterfaceAddrs implements inet.Stack.InterfaceAddrs. -func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { - addrs := make(map[int32][]inet.InterfaceAddr) - for k, v := range s.interfaceAddrs { - addrs[k] = append([]inet.InterfaceAddr(nil), v...) - } - return addrs -} - -// SupportsIPv6 implements inet.Stack.SupportsIPv6. -func (s *Stack) SupportsIPv6() bool { - panic("rpcinet handles procfs directly this method should not be called") -} - -// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. -func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { - panic("rpcinet handles procfs directly this method should not be called") -} - -// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. -func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { - panic("rpcinet handles procfs directly this method should not be called") - -} - -// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. -func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { - panic("rpcinet handles procfs directly this method should not be called") - -} - -// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. -func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { - panic("rpcinet handles procfs directly this method should not be called") -} - -// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. -func (s *Stack) TCPSACKEnabled() (bool, error) { - panic("rpcinet handles procfs directly this method should not be called") -} - -// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. -func (s *Stack) SetTCPSACKEnabled(enabled bool) error { - panic("rpcinet handles procfs directly this method should not be called") -} - -// Statistics implements inet.Stack.Statistics. -func (s *Stack) Statistics(stat interface{}, arg string) error { - return syserr.ErrEndpointOperation.ToError() -} - -// RouteTable implements inet.Stack.RouteTable. -func (s *Stack) RouteTable() []inet.Route { - return append([]inet.Route(nil), s.routes...) -} - -// Resume implements inet.Stack.Resume. -func (s *Stack) Resume() {} - -// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. -func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } - -// CleanupEndpoints implements inet.Stack.CleanupEndpoints. -func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } - -// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. -func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go deleted file mode 100644 index a94bdad83..000000000 --- a/pkg/sentry/socket/rpcinet/stack_unsafe.go +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package rpcinet - -import ( - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" - pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserr" -) - -// NewNetlinkRouteRequest builds a netlink message for getting the RIB, -// the routing information base. -func newNetlinkRouteRequest(proto, seq, family int) []byte { - rr := &syscall.NetlinkRouteRequest{} - rr.Header.Len = uint32(syscall.NLMSG_HDRLEN + syscall.SizeofRtGenmsg) - rr.Header.Type = uint16(proto) - rr.Header.Flags = syscall.NLM_F_DUMP | syscall.NLM_F_REQUEST - rr.Header.Seq = uint32(seq) - rr.Data.Family = uint8(family) - return netlinkRRtoWireFormat(rr) -} - -func netlinkRRtoWireFormat(rr *syscall.NetlinkRouteRequest) []byte { - b := make([]byte, rr.Header.Len) - *(*uint32)(unsafe.Pointer(&b[0:4][0])) = rr.Header.Len - *(*uint16)(unsafe.Pointer(&b[4:6][0])) = rr.Header.Type - *(*uint16)(unsafe.Pointer(&b[6:8][0])) = rr.Header.Flags - *(*uint32)(unsafe.Pointer(&b[8:12][0])) = rr.Header.Seq - *(*uint32)(unsafe.Pointer(&b[12:16][0])) = rr.Header.Pid - b[16] = byte(rr.Data.Family) - return b -} - -func (s *Stack) getNetlinkFd() (uint32, *syserr.Error) { - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(syscall.AF_NETLINK), Type: int64(syscall.SOCK_RAW | syscall.SOCK_NONBLOCK), Protocol: int64(syscall.NETLINK_ROUTE)}}}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result - if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok { - return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - return res.(*pb.SocketResponse_Fd).Fd, nil -} - -func (s *Stack) bindNetlinkFd(fd uint32, sockaddr []byte) *syserr.Error { - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Bind{&pb.BindRequest{Fd: fd, Address: sockaddr}}}, false /* ignoreResult */) - <-c - - if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 { - return syserr.FromHost(syscall.Errno(e)) - } - return nil -} - -func (s *Stack) closeNetlinkFd(fd uint32) { - _, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: fd}}}, true /* ignoreResult */) -} - -func (s *Stack) rpcSendMsg(req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) { - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result - if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok { - return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.SendmsgResponse_Length).Length, nil -} - -func (s *Stack) sendMsg(fd uint32, buf []byte, to []byte, flags int) (int, *syserr.Error) { - // Whitelist flags. - if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { - return 0, syserr.ErrInvalidArgument - } - - req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ - Fd: fd, - Data: buf, - Address: to, - More: flags&linux.MSG_MORE != 0, - EndOfRecord: flags&linux.MSG_EOR != 0, - }} - - n, err := s.rpcSendMsg(req) - return int(n), err -} - -func (s *Stack) rpcRecvMsg(req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { - id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) - <-c - - res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result - if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok { - return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) - } - - return res.(*pb.RecvmsgResponse_Payload).Payload, nil -} - -func (s *Stack) recvMsg(fd, l, flags uint32) ([]byte, *syserr.Error) { - req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ - Fd: fd, - Length: l, - Sender: false, - Trunc: flags&linux.MSG_TRUNC != 0, - Peek: flags&linux.MSG_PEEK != 0, - }} - - res, err := s.rpcRecvMsg(req) - if err != nil { - return nil, err - } - return res.Data, nil -} - -func (s *Stack) netlinkRequest(proto, family int) ([]byte, error) { - fd, err := s.getNetlinkFd() - if err != nil { - return nil, err.ToError() - } - defer s.closeNetlinkFd(fd) - - lsa := syscall.SockaddrNetlink{Family: syscall.AF_NETLINK} - b := binary.Marshal(nil, usermem.ByteOrder, &lsa) - if err := s.bindNetlinkFd(fd, b); err != nil { - return nil, err.ToError() - } - - wb := newNetlinkRouteRequest(proto, 1, family) - _, err = s.sendMsg(fd, wb, b, 0) - if err != nil { - return nil, err.ToError() - } - - var tab []byte -done: - for { - rb, err := s.recvMsg(fd, uint32(syscall.Getpagesize()), 0) - nr := len(rb) - if err != nil { - return nil, err.ToError() - } - - if nr < syscall.NLMSG_HDRLEN { - return nil, syserr.ErrInvalidArgument.ToError() - } - - tab = append(tab, rb...) - msgs, e := syscall.ParseNetlinkMessage(rb) - if e != nil { - return nil, e - } - - for _, m := range msgs { - if m.Header.Type == syscall.NLMSG_DONE { - break done - } - if m.Header.Type == syscall.NLMSG_ERROR { - return nil, syserr.ErrInvalidArgument.ToError() - } - } - } - - return tab, nil -} - -// DoNetlinkRouteRequest returns routing information base, also known as RIB, -// which consists of network facility information, states and parameters. -func (s *Stack) DoNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) { - data, err := s.netlinkRequest(req, syscall.AF_UNSPEC) - if err != nil { - return nil, err - } - return syscall.ParseNetlinkMessage(data) -} diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto deleted file mode 100644 index b677e9eb3..000000000 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ /dev/null @@ -1,352 +0,0 @@ -syntax = "proto3"; - -// package syscall_rpc is a set of networking related system calls that can be -// forwarded to a socket gofer. -// -package syscall_rpc; - -message SendmsgRequest { - uint32 fd = 1; - bytes data = 2 [ctype = CORD]; - bytes address = 3; - bool more = 4; - bool end_of_record = 5; -} - -message SendmsgResponse { - oneof result { - uint32 error_number = 1; - uint32 length = 2; - } -} - -message IOCtlRequest { - uint32 fd = 1; - uint32 cmd = 2; - bytes arg = 3; -} - -message IOCtlResponse { - oneof result { - uint32 error_number = 1; - bytes value = 2; - } -} - -message RecvmsgRequest { - uint32 fd = 1; - uint32 length = 2; - bool sender = 3; - bool peek = 4; - bool trunc = 5; - uint32 cmsg_length = 6; -} - -message OpenRequest { - bytes path = 1; - uint32 flags = 2; - uint32 mode = 3; -} - -message OpenResponse { - oneof result { - uint32 error_number = 1; - uint32 fd = 2; - } -} - -message ReadRequest { - uint32 fd = 1; - uint32 length = 2; -} - -message ReadResponse { - oneof result { - uint32 error_number = 1; - bytes data = 2 [ctype = CORD]; - } -} - -message ReadFileRequest { - string path = 1; -} - -message ReadFileResponse { - oneof result { - uint32 error_number = 1; - bytes data = 2 [ctype = CORD]; - } -} - -message WriteRequest { - uint32 fd = 1; - bytes data = 2 [ctype = CORD]; -} - -message WriteResponse { - oneof result { - uint32 error_number = 1; - uint32 length = 2; - } -} - -message WriteFileRequest { - string path = 1; - bytes content = 2; -} - -message WriteFileResponse { - uint32 error_number = 1; - uint32 written = 2; -} - -message AddressResponse { - bytes address = 1; - uint32 length = 2; -} - -message RecvmsgResponse { - message ResultPayload { - bytes data = 1 [ctype = CORD]; - AddressResponse address = 2; - uint32 length = 3; - bytes cmsg_data = 4; - } - oneof result { - uint32 error_number = 1; - ResultPayload payload = 2; - } -} - -message BindRequest { - uint32 fd = 1; - bytes address = 2; -} - -message BindResponse { - uint32 error_number = 1; -} - -message AcceptRequest { - uint32 fd = 1; - bool peer = 2; - int64 flags = 3; -} - -message AcceptResponse { - message ResultPayload { - uint32 fd = 1; - AddressResponse address = 2; - } - oneof result { - uint32 error_number = 1; - ResultPayload payload = 2; - } -} - -message ConnectRequest { - uint32 fd = 1; - bytes address = 2; -} - -message ConnectResponse { - uint32 error_number = 1; -} - -message ListenRequest { - uint32 fd = 1; - int64 backlog = 2; -} - -message ListenResponse { - uint32 error_number = 1; -} - -message ShutdownRequest { - uint32 fd = 1; - int64 how = 2; -} - -message ShutdownResponse { - uint32 error_number = 1; -} - -message CloseRequest { - uint32 fd = 1; -} - -message CloseResponse { - uint32 error_number = 1; -} - -message GetSockOptRequest { - uint32 fd = 1; - int64 level = 2; - int64 name = 3; - uint32 length = 4; -} - -message GetSockOptResponse { - oneof result { - uint32 error_number = 1; - bytes opt = 2; - } -} - -message SetSockOptRequest { - uint32 fd = 1; - int64 level = 2; - int64 name = 3; - bytes opt = 4; -} - -message SetSockOptResponse { - uint32 error_number = 1; -} - -message GetSockNameRequest { - uint32 fd = 1; -} - -message GetSockNameResponse { - oneof result { - uint32 error_number = 1; - AddressResponse address = 2; - } -} - -message GetPeerNameRequest { - uint32 fd = 1; -} - -message GetPeerNameResponse { - oneof result { - uint32 error_number = 1; - AddressResponse address = 2; - } -} - -message SocketRequest { - int64 family = 1; - int64 type = 2; - int64 protocol = 3; -} - -message SocketResponse { - oneof result { - uint32 error_number = 1; - uint32 fd = 2; - } -} - -message EpollWaitRequest { - uint32 fd = 1; - uint32 num_events = 2; - sint64 msec = 3; -} - -message EpollEvent { - uint32 fd = 1; - uint32 events = 2; -} - -message EpollEvents { - repeated EpollEvent events = 1; -} - -message EpollWaitResponse { - oneof result { - uint32 error_number = 1; - EpollEvents events = 2; - } -} - -message EpollCtlRequest { - uint32 epfd = 1; - int64 op = 2; - uint32 fd = 3; - EpollEvent event = 4; -} - -message EpollCtlResponse { - uint32 error_number = 1; -} - -message EpollCreate1Request { - int64 flag = 1; -} - -message EpollCreate1Response { - oneof result { - uint32 error_number = 1; - uint32 fd = 2; - } -} - -message PollRequest { - uint32 fd = 1; - uint32 events = 2; -} - -message PollResponse { - oneof result { - uint32 error_number = 1; - uint32 events = 2; - } -} - -message SyscallRequest { - oneof args { - SocketRequest socket = 1; - SendmsgRequest sendmsg = 2; - RecvmsgRequest recvmsg = 3; - BindRequest bind = 4; - AcceptRequest accept = 5; - ConnectRequest connect = 6; - ListenRequest listen = 7; - ShutdownRequest shutdown = 8; - CloseRequest close = 9; - GetSockOptRequest get_sock_opt = 10; - SetSockOptRequest set_sock_opt = 11; - GetSockNameRequest get_sock_name = 12; - GetPeerNameRequest get_peer_name = 13; - EpollWaitRequest epoll_wait = 14; - EpollCtlRequest epoll_ctl = 15; - EpollCreate1Request epoll_create1 = 16; - PollRequest poll = 17; - ReadRequest read = 18; - WriteRequest write = 19; - OpenRequest open = 20; - IOCtlRequest ioctl = 21; - WriteFileRequest write_file = 22; - ReadFileRequest read_file = 23; - } -} - -message SyscallResponse { - oneof result { - SocketResponse socket = 1; - SendmsgResponse sendmsg = 2; - RecvmsgResponse recvmsg = 3; - BindResponse bind = 4; - AcceptResponse accept = 5; - ConnectResponse connect = 6; - ListenResponse listen = 7; - ShutdownResponse shutdown = 8; - CloseResponse close = 9; - GetSockOptResponse get_sock_opt = 10; - SetSockOptResponse set_sock_opt = 11; - GetSockNameResponse get_sock_name = 12; - GetPeerNameResponse get_peer_name = 13; - EpollWaitResponse epoll_wait = 14; - EpollCtlResponse epoll_ctl = 15; - EpollCreate1Response epoll_create1 = 16; - PollResponse poll = 17; - ReadResponse read = 18; - WriteResponse write = 19; - OpenResponse open = 20; - IOCtlResponse ioctl = 21; - WriteFileResponse write_file = 22; - ReadFileResponse read_file = 23; - } -} -- cgit v1.2.3 From 345df7cab48ac79bccf2620900cd972b3026296d Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 17 Jan 2020 08:10:14 -0800 Subject: Add explanation for implementation of BSD full file locks. PiperOrigin-RevId: 290272560 --- pkg/sentry/fs/lock/lock.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 41b040818..926538d90 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -78,6 +78,9 @@ const ( ) // LockEOF is the maximal possible end of a regional file lock. +// +// A BSD-style full file lock can be represented as a regional file lock from +// offset 0 to LockEOF. const LockEOF = math.MaxUint64 // Lock is a regional file lock. It consists of either a single writer -- cgit v1.2.3 From acf2d6dcc34501d2573f9c3f2b6da80308f3267e Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Fri, 17 Jan 2020 08:22:21 -0800 Subject: Enable stat syscall support on arm64. x86 and arm64 use a different stat struct in Linux kernel, so the stat() syscall implementation has to handle the file stat data separately. Signed-off-by: Haibo Xu Change-Id: If3986e915a667362257a54e7fbbcc1fe18951015 COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1493 from xiaobo55x:stat f15a216d9297eb9a96d2c483d396a9919145d7fa PiperOrigin-RevId: 290274287 --- pkg/sentry/syscalls/linux/BUILD | 2 + pkg/sentry/syscalls/linux/linux64_arm64.go | 8 +++ pkg/sentry/syscalls/linux/sys_stat.go | 51 ------------------- pkg/sentry/syscalls/linux/sys_stat_amd64.go | 75 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_stat_arm64.go | 77 +++++++++++++++++++++++++++++ 5 files changed, 162 insertions(+), 51 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go create mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 430d796ba..917f74e07 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -42,6 +42,8 @@ go_library( "sys_socket.go", "sys_splice.go", "sys_stat.go", + "sys_stat_amd64.go", + "sys_stat_arm64.go", "sys_sync.go", "sys_sysinfo.go", "sys_syslog.go", diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index 8c1b20911..c9629f6f3 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -61,6 +61,7 @@ var ARM64 = &kernel.SyscallTable{ 22: syscalls.Supported("epoll_pwait", EpollPwait), 23: syscalls.Supported("dup", Dup), 24: syscalls.Supported("dup3", Dup3), + 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), 26: syscalls.Supported("inotify_init1", InotifyInit1), 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), @@ -78,7 +79,9 @@ var ARM64 = &kernel.SyscallTable{ 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 45: syscalls.Supported("truncate", Truncate), 46: syscalls.Supported("ftruncate", Ftruncate), 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), 48: syscalls.Supported("faccessat", Faccessat), @@ -112,6 +115,7 @@ var ARM64 = &kernel.SyscallTable{ 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 77: syscalls.Supported("tee", Tee), 78: syscalls.Supported("readlinkat", Readlinkat), + 79: syscalls.Supported("fstatat", Fstatat), 80: syscalls.Supported("fstat", Fstat), 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), @@ -254,6 +258,8 @@ var ARM64 = &kernel.SyscallTable{ 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), 221: syscalls.Supported("execve", Execve), + 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), 226: syscalls.Supported("mprotect", Mprotect), @@ -299,6 +305,8 @@ var ARM64 = &kernel.SyscallTable{ 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls after 284 are "backports" from versions of Linux after 4.4. 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), 286: syscalls.Supported("preadv2", Preadv2), 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 5556bc276..69b17b799 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -16,7 +16,6 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -125,56 +124,6 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr) } -// copyOutStat copies the attributes (sattr, uattr) to the struct stat at -// address dst in t's address space. It encodes the stat struct to bytes -// manually, as stat() is a very common syscall for many applications, and -// t.CopyObjectOut has noticeable performance impact due to its many slice -// allocations and use of reflection. -func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { - b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] - - // Dev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) - // Ino (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) - // Nlink (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links) - // Mode (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) - // UID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - // GID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) - // Padding (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, 0) - // Rdev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) - // Size (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) - // Blksize (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize)) - // Blocks (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) - - // ATime - atime := uattr.AccessTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) - - // MTime - mtime := uattr.ModificationTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) - - // CTime - ctime := uattr.StatusChangeTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) - - _, err := t.CopyOutBytes(dst, b) - return err -} - // Statx implements linux syscall statx(2). func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go new file mode 100644 index 000000000..58afb4a9a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -0,0 +1,75 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build amd64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// copyOutStat copies the attributes (sattr, uattr) to the struct stat at +// address dst in t's address space. It encodes the stat struct to bytes +// manually, as stat() is a very common syscall for many applications, and +// t.CopyObjectOut has noticeable performance impact due to its many slice +// allocations and use of reflection. +func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { + b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] + + // Dev (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) + // Ino (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) + // Nlink (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links) + // Mode (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) + // UID (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + // GID (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) + // Padding (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, 0) + // Rdev (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) + // Size (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) + // Blksize (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize)) + // Blocks (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) + + // ATime + atime := uattr.AccessTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) + + // MTime + mtime := uattr.ModificationTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) + + // CTime + ctime := uattr.StatusChangeTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) + + _, err := t.CopyOutBytes(dst, b) + return err +} diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go new file mode 100644 index 000000000..3e1251e0b --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -0,0 +1,77 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build arm64 + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// copyOutStat copies the attributes (sattr, uattr) to the struct stat at +// address dst in t's address space. It encodes the stat struct to bytes +// manually, as stat() is a very common syscall for many applications, and +// t.CopyObjectOut has noticeable performance impact due to its many slice +// allocations and use of reflection. +func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { + b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] + + // Dev (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) + // Ino (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) + // Mode (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) + // Nlink (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links)) + // UID (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + // GID (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) + // Rdev (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) + // Padding (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, 0) + // Size (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) + // Blksize (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize)) + // Padding (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, 0) + // Blocks (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) + + // ATime + atime := uattr.AccessTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) + + // MTime + mtime := uattr.ModificationTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) + + // CTime + ctime := uattr.StatusChangeTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) + + _, err := t.CopyOutBytes(dst, b) + return err +} -- cgit v1.2.3 From ff9960985848a48863c01f91acd5b34d3e83a9c5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 17 Jan 2020 09:33:14 -0800 Subject: Add /proc/net/* files Updates #1195 PiperOrigin-RevId: 290285420 --- pkg/sentry/fsimpl/proc/BUILD | 2 +- pkg/sentry/fsimpl/proc/tasks.go | 1 + pkg/sentry/fsimpl/proc/tasks_net.go | 541 ++++++++++++++++++++++++++++--- pkg/sentry/fsimpl/proc/tasks_sys_test.go | 4 +- pkg/sentry/fsimpl/proc/tasks_test.go | 1 + 5 files changed, 499 insertions(+), 50 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index e92564b5d..f69aa19c4 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -18,7 +18,6 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -37,6 +36,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/tcpip/header", ], ) diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index a97b1753a..5646c602a 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -67,6 +67,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "sys": newSysDir(root, inoGen), "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), + "net": newNetDir(root, inoGen, k), "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 06dc43c26..3dbf3ba41 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -17,33 +17,88 @@ package proc import ( "bytes" "fmt" + "io" + "reflect" + "time" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/header" ) +func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { + var contents map[string]*kernfs.Dentry + if stack := k.NetworkStack(); stack != nil { + const ( + arp = "IP address HW type Flags HW address Mask Device" + netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode" + packet = "sk RefCnt Type Proto Iface R Rmem User Inode" + protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em" + ptype = "Type Device Function" + upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode" + ) + psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) + + contents = map[string]*kernfs.Dentry{ + "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), + + // The following files are simple stubs until they are implemented in + // netstack, if the file contains a header the stub is just the header + // otherwise it is an empty file. + "arp": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)), + "netlink": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)), + "netstat": newDentry(root, inoGen.NextIno(), 0444, &netStatData{}), + "packet": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)), + "protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)), + + // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, + // high res timer ticks per sec (ClockGetres returns 1ns resolution). + "psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)), + "ptype": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)), + "route": newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}), + } + + if stack.SupportsIPv6() { + contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6)) + } + } + + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) +} + // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. // // +stateify savable type ifinet6 struct { - s inet.Stack + kernfs.DynamicBytesFile + + stack inet.Stack } -var _ vfs.DynamicBytesSource = (*ifinet6)(nil) +var _ dynamicInode = (*ifinet6)(nil) func (n *ifinet6) contents() []string { var lines []string - nics := n.s.Interfaces() - for id, naddrs := range n.s.InterfaceAddrs() { + nics := n.stack.Interfaces() + for id, naddrs := range n.stack.InterfaceAddrs() { nic, ok := nics[id] if !ok { // NIC was added after NICNames was called. We'll just ignore it. @@ -77,18 +132,20 @@ func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// netDev implements vfs.DynamicBytesSource for /proc/net/dev. +// netDevData implements vfs.DynamicBytesSource for /proc/net/dev. // // +stateify savable -type netDev struct { - s inet.Stack +type netDevData struct { + kernfs.DynamicBytesFile + + stack inet.Stack } -var _ vfs.DynamicBytesSource = (*netDev)(nil) +var _ dynamicInode = (*netDevData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { - interfaces := n.s.Interfaces() +func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.stack.Interfaces() buf.WriteString("Inter-| Receive | Transmit\n") buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") @@ -96,7 +153,7 @@ func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { // Implements the same format as // net/core/net-procfs.c:dev_seq_printf_stats. var stats inet.StatDev - if err := n.s.Statistics(&stats, i.Name); err != nil { + if err := n.stack.Statistics(&stats, i.Name); err != nil { log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) continue } @@ -128,19 +185,21 @@ func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. +// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix. // // +stateify savable -type netUnix struct { - k *kernel.Kernel +type netUnixData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*netUnix)(nil) +var _ dynamicInode = (*netUnixData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") - for _, se := range n.k.ListSockets() { + for _, se := range n.kernel.ListSockets() { s := se.Sock.Get() if s == nil { log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) @@ -213,22 +272,72 @@ func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. -// -// +stateify savable -type netTCP struct { - k *kernel.Kernel +func networkToHost16(n uint16) uint16 { + // n is in network byte order, so is big-endian. The most-significant byte + // should be stored in the lower address. + // + // We manually inline binary.BigEndian.Uint16() because Go does not support + // non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to + // binary.BigEndian.Uint16() require a read of binary.BigEndian and an + // interface method call, defeating inlining. + buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)} + return usermem.ByteOrder.Uint16(buf[:]) } -var _ vfs.DynamicBytesSource = (*netTCP)(nil) +func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { + switch family { + case linux.AF_INET: + var a linux.SockAddrInet + if i != nil { + a = *i.(*linux.SockAddrInet) + } + + // linux.SockAddrInet.Port is stored in the network byte order and is + // printed like a number in host byte order. Note that all numbers in host + // byte order are printed with the most-significant byte first when + // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. + port := networkToHost16(a.Port) + + // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order + // (i.e. most-significant byte in index 0). Linux represents this as a + // __be32 which is a typedef for an unsigned int, and is printed with + // %X. This means that for a little-endian machine, Linux prints the + // least-significant byte of the address first. To emulate this, we first + // invert the byte order for the address using usermem.ByteOrder.Uint32, + // which makes it have the equivalent encoding to a __be32 on a little + // endian machine. Note that this operation is a no-op on a big endian + // machine. Then similar to Linux, we format it with %X, which will print + // the most-significant byte of the __be32 address first, which is now + // actually the least-significant byte of the original address in + // linux.SockAddrInet.Addr on little endian machines, due to the conversion. + addr := usermem.ByteOrder.Uint32(a.Addr[:]) + + fmt.Fprintf(w, "%08X:%04X ", addr, port) + case linux.AF_INET6: + var a linux.SockAddrInet6 + if i != nil { + a = *i.(*linux.SockAddrInet6) + } -func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { + port := networkToHost16(a.Port) + addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4]) + addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8]) + addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12]) + addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16]) + fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) + } +} + +func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error { + // t may be nil here if our caller is not part of a task goroutine. This can + // happen for example if we're here for "sentryctl cat". When t is nil, + // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) - buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") - for _, se := range n.k.ListSockets() { + + for _, se := range k.ListSockets() { s := se.Sock.Get() if s == nil { - log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) continue } sfile := s.(*fs.File) @@ -236,7 +345,7 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } - if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() // Not tcp4 sockets. continue @@ -250,27 +359,23 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: sl; entry number. fmt.Fprintf(buf, "%4d: ", se.ID) - portBuf := make([]byte, 2) - // Field: local_adddress. - var localAddr linux.SockAddrInet - if local, _, err := sops.GetSockName(t); err == nil { - localAddr = *local.(*linux.SockAddrInet) + var localAddr linux.SockAddr + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = local + } } - binary.LittleEndian.PutUint16(portBuf, localAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(localAddr.Addr[:]), - portBuf) + writeInetAddr(buf, family, localAddr) // Field: rem_address. - var remoteAddr linux.SockAddrInet - if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = *remote.(*linux.SockAddrInet) + var remoteAddr linux.SockAddr + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = remote + } } - binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(remoteAddr.Addr[:]), - portBuf) + writeInetAddr(buf, family, remoteAddr) // Field: state; socket state. fmt.Fprintf(buf, "%02X ", sops.State()) @@ -293,7 +398,8 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) fmt.Fprintf(buf, "%5d ", 0) } else { - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. @@ -335,3 +441,344 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } + +// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp. +// +// +stateify savable +type netTCPData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netTCPData)(nil) + +func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET) +} + +// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6. +// +// +stateify savable +type netTCP6Data struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netTCP6Data)(nil) + +func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") + return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6) +} + +// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp. +// +// +stateify savable +type netUDPData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netUDPData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // t may be nil here if our caller is not part of a task goroutine. This can + // happen for example if we're here for "sentryctl cat". When t is nil, + // degrade gracefully and retrieve what we can. + t := kernel.TaskFromContext(ctx) + + for _, se := range d.kernel.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { + s.DecRef() + // Not udp4 socket. + continue + } + + // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock(). + + // Field: sl; entry number. + fmt.Fprintf(buf, "%5d: ", se.ID) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &localAddr) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &remoteAddr) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when. Always 0 for UDP. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt. Always 0 for UDP. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + } + + // Field: timeout. Always 0 for UDP. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: ref; reference count on the socket inode. Don't count the ref + // we obtain while deferencing the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: drops; number of dropped packets. Unimplemented. + fmt.Fprintf(buf, "%d", 0) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp. +// +// +stateify savable +type netSnmpData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netSnmpData)(nil) + +type snmpLine struct { + prefix string + header string +} + +var snmp = []snmpLine{ + { + prefix: "Ip", + header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", + }, + { + prefix: "Icmp", + header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", + }, + { + prefix: "IcmpMsg", + }, + { + prefix: "Tcp", + header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", + }, + { + prefix: "Udp", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, + { + prefix: "UdpLite", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, +} + +func toSlice(a interface{}) []uint64 { + v := reflect.Indirect(reflect.ValueOf(a)) + return v.Slice(0, v.Len()).Interface().([]uint64) +} + +func sprintSlice(s []uint64) string { + if len(s) == 0 { + return "" + } + r := fmt.Sprint(s) + return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. +} + +// Generate implements vfs.DynamicBytesSource. +func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { + types := []interface{}{ + &inet.StatSNMPIP{}, + &inet.StatSNMPICMP{}, + nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. + &inet.StatSNMPTCP{}, + &inet.StatSNMPUDP{}, + &inet.StatSNMPUDPLite{}, + } + for i, stat := range types { + line := snmp[i] + if stat == nil { + fmt.Fprintf(buf, "%s:\n", line.prefix) + fmt.Fprintf(buf, "%s:\n", line.prefix) + continue + } + if err := d.stack.Statistics(stat, line.prefix); err != nil { + if err == syserror.EOPNOTSUPP { + log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } else { + log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } + } + + fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) + + if line.prefix == "Tcp" { + tcp := stat.(*inet.StatSNMPTCP) + // "Tcp" needs special processing because MaxConn is signed. RFC 2012. + fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + } else { + fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + } + } + return nil +} + +// netRouteData implements vfs.DynamicBytesSource for /proc/net/route. +// +// +stateify savable +type netRouteData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netRouteData)(nil) + +// Generate implements vfs.DynamicBytesSource. +// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. +func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") + + interfaces := d.stack.Interfaces() + for _, rt := range d.stack.RouteTable() { + // /proc/net/route only includes ipv4 routes. + if rt.Family != linux.AF_INET { + continue + } + + // /proc/net/route does not include broadcast or multicast routes. + if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { + continue + } + + iface, ok := interfaces[rt.OutputInterface] + if !ok || iface.Name == "lo" { + continue + } + + var ( + gw uint32 + prefix uint32 + flags = linux.RTF_UP + ) + if len(rt.GatewayAddr) == header.IPv4AddressSize { + flags |= linux.RTF_GATEWAY + gw = usermem.ByteOrder.Uint32(rt.GatewayAddr) + } + if len(rt.DstAddr) == header.IPv4AddressSize { + prefix = usermem.ByteOrder.Uint32(rt.DstAddr) + } + l := fmt.Sprintf( + "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", + iface.Name, + prefix, + gw, + flags, + 0, // RefCnt. + 0, // Use. + 0, // Metric. + (uint32(1)< 0 { @@ -62,7 +62,7 @@ func TestIfinet6(t *testing.T) { "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, } - n := &ifinet6{s: s} + n := &ifinet6{stack: s} contents := n.contents() if len(contents) != len(want) { t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 6b58c16b9..8eddf95e0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -73,6 +73,7 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { "loadavg": {Type: linux.DT_REG}, "meminfo": {Type: linux.DT_REG}, "mounts": {Type: linux.DT_LNK}, + "net": {Type: linux.DT_DIR}, "self": selfLink, "stat": {Type: linux.DT_REG}, "sys": {Type: linux.DT_DIR}, -- cgit v1.2.3 From 8e8d0f96f651ce161dfe6003d738dbda28f7cb0e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 17 Jan 2020 10:39:24 -0800 Subject: Add /proc/[pid]/cgroups file Updates #1195 PiperOrigin-RevId: 290298266 --- pkg/sentry/fsimpl/proc/filesystem.go | 13 ++++++++++++- pkg/sentry/fsimpl/proc/subtasks.go | 18 ++++++++++-------- pkg/sentry/fsimpl/proc/task.go | 30 +++++++++++++++++++++++++----- pkg/sentry/fsimpl/proc/tasks.go | 10 ++++++++-- pkg/sentry/fsimpl/proc/tasks_test.go | 11 ++++++++++- 5 files changed, 65 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index e9cb7895f..f49819187 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -47,7 +47,12 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile procfs := &kernfs.Filesystem{} procfs.VFSFilesystem().Init(vfsObj, procfs) - _, dentry := newTasksInode(procfs, k, pidns) + var data *InternalData + if opts.InternalData != nil { + data = opts.InternalData.(*InternalData) + } + + _, dentry := newTasksInode(procfs, k, pidns, data.Cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } @@ -78,3 +83,9 @@ var _ dynamicInode = (*staticFile)(nil) func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } + +// InternalData contains internal data passed in to the procfs mount via +// vfs.GetFilesystemOptions.InternalData. +type InternalData struct { + Cgroups map[string]string +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 8892c5a11..91eded415 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -35,18 +35,20 @@ type subtasksInode struct { kernfs.InodeAttrs kernfs.OrderedChildren - task *kernel.Task - pidns *kernel.PIDNamespace - inoGen InoGenerator + task *kernel.Task + pidns *kernel.PIDNamespace + inoGen InoGenerator + cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) -func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator) *kernfs.Dentry { +func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry { subInode := &subtasksInode{ - task: task, - pidns: pidns, - inoGen: inoGen, + task: task, + pidns: pidns, + inoGen: inoGen, + cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) @@ -79,7 +81,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e return nil, syserror.ENOENT } - subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false) + subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers) return subTaskDentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 621c17cfe..a0580f20d 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -42,7 +43,7 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { +func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { contents := map[string]*kernfs.Dentry{ "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), @@ -68,11 +69,11 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - contents["task"] = newSubtasks(task, pidns, inoGen) + contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) + } + if len(cgroupControllers) > 0 { + contents["cgroup"] = newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers)) } - //if len(p.cgroupControllers) > 0 { - // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) - //} taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. @@ -227,3 +228,22 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr d.Init(taskInode) return d } + +// newCgroupData creates inode that shows cgroup information. +// From man 7 cgroups: "For each cgroup hierarchy of which the process is a +// member, there is one entry containing three colon-separated fields: +// hierarchy-ID:controller-list:cgroup-path" +func newCgroupData(controllers map[string]string) dynamicInode { + buf := bytes.Buffer{} + + // The hierarchy ids must be positive integers (for cgroup v1), but the + // exact number does not matter, so long as they are unique. We can + // just use a counter, but since linux sorts this file in descending + // order, we must count down to preserve this behavior. + i := len(controllers) + for name, dir := range controllers { + fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir) + i-- + } + return newStaticFile(buf.String()) +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 5646c602a..51f634716 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -54,11 +54,16 @@ type tasksInode struct { // Linux. So handle them outside of OrderedChildren. selfSymlink *vfs.Dentry threadSelfSymlink *vfs.Dentry + + // cgroupControllers is a map of controller name to directory in the + // cgroup hierarchy. These controllers are immutable and will be listed + // in /proc/pid/cgroup if not nil. + cgroupControllers map[string]string } var _ kernfs.Inode = (*tasksInode)(nil) -func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { +func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), @@ -78,6 +83,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames inoGen: inoGen, selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) @@ -111,7 +117,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := newTaskInode(i.inoGen, task, i.pidns, true) + taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers) return taskDentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 8eddf95e0..002d2f73b 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -87,6 +87,7 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ "auxv": {Type: linux.DT_REG}, + "cgroup": {Type: linux.DT_REG}, "cmdline": {Type: linux.DT_REG}, "comm": {Type: linux.DT_REG}, "environ": {Type: linux.DT_REG}, @@ -145,7 +146,15 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{}) + fsOpts := vfs.GetFilesystemOptions{ + InternalData: &InternalData{ + Cgroups: map[string]string{ + "cpuset": "/foo/cpuset", + "memory": "/foo/memory", + }, + }, + } + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts) if err != nil { return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) } -- cgit v1.2.3 From 80d0f9304484897e4307c9701ddbfaacb925715d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 17 Jan 2020 11:20:29 -0800 Subject: Fix data race in tty.queue.readableSize. We were setting queue.readable without holding the lock. PiperOrigin-RevId: 290306922 --- pkg/sentry/fs/tty/line_discipline.go | 4 +++- pkg/sentry/fs/tty/queue.go | 11 ++--------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 894964260..9fe02657e 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -140,8 +140,10 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc // buffer to its read buffer. Anything already in the read buffer is // now readable. if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { - l.inQueue.pushWaitBuf(l) + l.inQueue.mu.Lock() + l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true + l.inQueue.mu.Unlock() l.slaveWaiter.Notify(waiter.EventIn) } diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 8b5d4699a..21ccc6f32 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -197,16 +197,9 @@ func (q *queue) writeBytes(b []byte, l *lineDiscipline) { q.pushWaitBufLocked(l) } -// pushWaitBuf fills the queue's read buffer with data from the wait buffer. +// pushWaitBufLocked fills the queue's read buffer with data from the wait +// buffer. // -// Preconditions: -// * l.termiosMu must be held for reading. -func (q *queue) pushWaitBuf(l *lineDiscipline) int { - q.mu.Lock() - defer q.mu.Unlock() - return q.pushWaitBufLocked(l) -} - // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be locked. -- cgit v1.2.3 From f1a5178c589dbd9a1fe4f1b9fb943fbe64791b58 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 17 Jan 2020 14:20:00 -0800 Subject: Fix data race in MountNamespace.resolve. We must hold fs.renameMu to access Dirent.parent. PiperOrigin-RevId: 290340804 --- pkg/sentry/fs/mounts.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index db3dfd096..a9627a9d1 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -609,8 +609,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema } // Find the node; we resolve relative to the current symlink's parent. + renameMu.RLock() + parent := node.parent + renameMu.RUnlock() *remainingTraversals-- - d, err := mns.FindInode(ctx, root, node.parent, targetPath, remainingTraversals) + d, err := mns.FindInode(ctx, root, parent, targetPath, remainingTraversals) if err != nil { return nil, err } -- cgit v1.2.3 From 47d85257d3d015f0b9f7739c81af0ee9f510aaf5 Mon Sep 17 00:00:00 2001 From: Eyal Soha Date: Fri, 17 Jan 2020 18:24:39 -0800 Subject: Filter out received packets with a local source IP address. CERT Advisory CA-96.21 III. Solution advises that devices drop packets which could not have correctly arrived on the wire, such as receiving a packet where the source IP address is owned by the device that sent it. Fixes #1507 PiperOrigin-RevId: 290378240 --- pkg/sentry/socket/netstack/netstack.go | 15 +++++----- pkg/sentry/socket/netstack/stack.go | 38 ++++++++++++------------- pkg/tcpip/stack/nic.go | 14 +++++++-- pkg/tcpip/tcpip.go | 10 +++++-- pkg/tcpip/transport/udp/udp_test.go | 52 ++++++++++++++++++++++++++++++++-- 5 files changed, 95 insertions(+), 34 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index d2f7e987d..fec575357 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -138,13 +138,14 @@ var Metrics = tcpip.Stats{ }, }, IP: tcpip.IPStats{ - PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), - InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), - PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), - PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), - OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), - MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), - MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), + PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), + InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), + InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Total number of IP packets received with an unknown or invalid source address."), + PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), + PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), + OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), + MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), + MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index a0db2d4fd..31ea66eca 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -148,25 +148,25 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { case *inet.StatSNMPIP: ip := Metrics.IP *stats = inet.StatSNMPIP{ - 0, // TODO(gvisor.dev/issue/969): Support Ip/Forwarding. - 0, // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL. - ip.PacketsReceived.Value(), // InReceives. - 0, // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors. - ip.InvalidAddressesReceived.Value(), // InAddrErrors. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams. - 0, // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos. - 0, // TODO(gvisor.dev/issue/969): Support Ip/InDiscards. - ip.PacketsDelivered.Value(), // InDelivers. - ip.PacketsSent.Value(), // OutRequests. - ip.OutgoingPacketErrors.Value(), // OutDiscards. - 0, // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails. - 0, // TODO(gvisor.dev/issue/969): Support Ip/FragOKs. - 0, // TODO(gvisor.dev/issue/969): Support Ip/FragFails. - 0, // TODO(gvisor.dev/issue/969): Support Ip/FragCreates. + 0, // TODO(gvisor.dev/issue/969): Support Ip/Forwarding. + 0, // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL. + ip.PacketsReceived.Value(), // InReceives. + 0, // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors. + ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams. + 0, // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos. + 0, // TODO(gvisor.dev/issue/969): Support Ip/InDiscards. + ip.PacketsDelivered.Value(), // InDelivers. + ip.PacketsSent.Value(), // OutRequests. + ip.OutgoingPacketErrors.Value(), // OutDiscards. + 0, // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs. + 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails. + 0, // TODO(gvisor.dev/issue/969): Support Ip/FragOKs. + 0, // TODO(gvisor.dev/issue/969): Support Ip/FragFails. + 0, // TODO(gvisor.dev/issue/969): Support Ip/FragCreates. } case *inet.StatSNMPICMP: in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 53abf29e5..4afe7b744 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -984,7 +984,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, // DeliverNetworkPacket finds the appropriate network protocol endpoint and // hands the packet over for further processing. This function is called when -// the NIC receives a packet from the physical interface. +// the NIC receives a packet from the link endpoint. // Note that the ownership of the slice backing vv is retained by the caller. // This rule applies only to the slice itself, not to the items of the slice; // the ownership of the items is not retained by the caller. @@ -1029,6 +1029,14 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link src, dst := netProto.ParseAddresses(pkt.Data.First()) + if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil { + // The source address is one of our own, so we never should have gotten a + // packet like this unless handleLocal is false. Loopback also calls this + // function even though the packets didn't come from the physical interface + // so don't drop those. + n.stack.stats.IP.InvalidSourceAddressesReceived.Increment() + return + } if ref := n.getRef(protocol, dst); ref != nil { handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt) return @@ -1041,7 +1049,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link if n.stack.Forwarding() { r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */) if err != nil { - n.stack.stats.IP.InvalidAddressesReceived.Increment() + n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment() return } defer r.Release() @@ -1079,7 +1087,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link // If a packet socket handled the packet, don't treat it as invalid. if len(packetEPs) == 0 { - n.stack.stats.IP.InvalidAddressesReceived.Increment() + n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment() } } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index b7813cbc0..6243762e3 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -903,9 +903,13 @@ type IPStats struct { // link layer in nic.DeliverNetworkPacket. PacketsReceived *StatCounter - // InvalidAddressesReceived is the total number of IP packets received - // with an unknown or invalid destination address. - InvalidAddressesReceived *StatCounter + // InvalidDestinationAddressesReceived is the total number of IP packets + // received with an unknown or invalid destination address. + InvalidDestinationAddressesReceived *StatCounter + + // InvalidSourceAddressesReceived is the total number of IP packets received + // with a source address that should never have been received on the wire. + InvalidSourceAddressesReceived *StatCounter // PacketsDelivered is the total number of incoming IP packets that // are successfully delivered to the transport layer via HandlePacket. diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index ee9d10555..51bb61167 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -274,11 +274,16 @@ type testContext struct { func newDualTestContext(t *testing.T, mtu uint32) *testContext { t.Helper() - - s := stack.New(stack.Options{ + return newDualTestContextWithOptions(t, mtu, stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, }) +} + +func newDualTestContextWithOptions(t *testing.T, mtu uint32, options stack.Options) *testContext { + t.Helper() + + s := stack.New(options) ep := channel.New(256, mtu, "") wep := stack.LinkEndpoint(ep) @@ -763,6 +768,49 @@ func TestV6ReadOnV6(t *testing.T) { testRead(c, unicastV6) } +// TestV4ReadSelfSource checks that packets coming from a local IP address are +// correctly dropped when handleLocal is true and not otherwise. +func TestV4ReadSelfSource(t *testing.T) { + for _, tt := range []struct { + name string + handleLocal bool + wantErr *tcpip.Error + wantInvalidSource uint64 + }{ + {"HandleLocal", false, nil, 0}, + {"NoHandleLocal", true, tcpip.ErrWouldBlock, 1}, + } { + t.Run(tt.name, func(t *testing.T) { + c := newDualTestContextWithOptions(t, defaultMTU, stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + HandleLocal: tt.handleLocal, + }) + defer c.cleanup() + + c.createEndpointForFlow(unicastV4) + + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { + t.Fatalf("Bind failed: %s", err) + } + + payload := newPayload() + h := unicastV4.header4Tuple(incoming) + h.srcAddr = h.dstAddr + + c.injectV4Packet(payload, &h, true /* valid */) + + if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != tt.wantInvalidSource { + t.Errorf("c.s.Stats().IP.InvalidSourceAddressesReceived got %d, want %d", got, tt.wantInvalidSource) + } + + if _, _, err := c.ep.Read(nil); err != tt.wantErr { + t.Errorf("c.ep.Read() got error %v, want %v", err, tt.wantErr) + } + }) + } +} + func TestV4ReadOnV4(t *testing.T) { c := newDualTestContext(t, defaultMTU) defer c.cleanup() -- cgit v1.2.3 From 10401599e104d90644a220c1cce3e4c2f224f0b3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Sat, 18 Jan 2020 09:32:39 -0800 Subject: Include the cgroup name in the superblock options in /proc/self/mountinfo. Java 11 parses /proc/self/mountinfo for cgroup information. Java 11.0.4 uses the mount path to determine what cgroups existed, but Java 11.0.5 reads the cgroup names from the superblock options. This CL adds the cgroup name to the superblock options if the filesystem type is "cgroup". Since gVisor doesn't actually support cgroups yet, we just infer the cgroup name from the path. PiperOrigin-RevId: 290434323 --- pkg/sentry/fs/proc/mounts.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 5aedae799..d4efc86e0 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "sort" + "strings" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -146,14 +147,35 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se // (10) Mount source: filesystem-specific information or "none". fmt.Fprintf(&buf, "none ") - // (11) Superblock options. Only "ro/rw" is supported for now, - // and is the same as the filesystem option. - fmt.Fprintf(&buf, "%s\n", opts) + // (11) Superblock options, and final newline. + fmt.Fprintf(&buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource)) }) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountInfoFile)(nil)}}, 0 } +func superBlockOpts(mountPath string, msrc *fs.MountSource) string { + // gVisor doesn't (yet) have a concept of super block options, so we + // use the ro/rw bit from the mount flag. + opts := "rw" + if msrc.Flags.ReadOnly { + opts = "ro" + } + + // NOTE(b/147673608): If the mount is a cgroup, we also need to include + // the cgroup name in the options. For now we just read that from the + // path. + // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // should get this value from the cgroup itself, and not rely on the + // path. + if msrc.FilesystemType == "cgroup" { + splitPath := strings.Split(mountPath, "/") + cgroupType := splitPath[len(splitPath)-1] + opts += "," + cgroupType + } + return opts +} + // mountsFile is used to implement /proc/[pid]/mounts. // // +stateify savable -- cgit v1.2.3 From 2ba6198851dc1e293295d7cadf8c0ae456b68beb Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 21 Jan 2020 12:41:50 -0800 Subject: Add syscalls for lgetxattr, fgetxattr, lsetxattr, and fsetxattr. Note that these simply will use the same logic as getxattr and setxattr, which is not yet implemented for most filesystems. PiperOrigin-RevId: 290800960 --- pkg/sentry/syscalls/linux/linux64_amd64.go | 8 +- pkg/sentry/syscalls/linux/linux64_arm64.go | 8 +- pkg/sentry/syscalls/linux/sys_xattr.go | 136 +++++++++++++++++++++-------- test/syscalls/linux/xattr.cc | 41 +++++++++ 4 files changed, 150 insertions(+), 43 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 6b2920900..c76771a54 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -229,11 +229,11 @@ var AMD64 = &kernel.SyscallTable{ 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index c9629f6f3..d3587fda6 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -42,11 +42,11 @@ var ARM64 = &kernel.SyscallTable{ 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 6: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 7: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 12: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 23d20da6f..e35c077d6 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -27,6 +27,40 @@ import ( // GetXattr implements linux syscall getxattr(2). func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getXattrFromPath(t, args, true) +} + +// LGetXattr implements linux syscall lgetxattr(2). +func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getXattrFromPath(t, args, false) +} + +// FGetXattr implements linux syscall fgetxattr(2). +func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := uint64(args[3].SizeT()) + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + n, value, err := getXattr(t, f.Dirent, nameAddr, size) + if err != nil { + return 0, nil, err + } + + if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() @@ -38,29 +72,17 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } valueLen := 0 - err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { - // If getxattr(2) is called with size 0, the size of the value will be - // returned successfully even if it is nonzero. In that case, we need to - // retrieve the entire attribute value so we can return the correct size. - requestedSize := size - if size == 0 || size > linux.XATTR_SIZE_MAX { - requestedSize = linux.XATTR_SIZE_MAX + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR } - value, err := getXattr(t, d, dirPath, nameAddr, uint64(requestedSize)) + n, value, err := getXattr(t, d, nameAddr, size) + valueLen = n if err != nil { return err } - valueLen = len(value) - if uint64(valueLen) > requestedSize { - return syserror.ERANGE - } - - // Skip copying out the attribute value if size is 0. - if size == 0 { - return nil - } _, err = t.CopyOutBytes(valueAddr, []byte(value)) return err }) @@ -71,29 +93,73 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // getXattr implements getxattr(2) from the given *fs.Dirent. -func getXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr, size uint64) (string, error) { - if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return "", syserror.ENOTDIR - } - +func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) { if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil { - return "", err + return 0, "", err } name, err := copyInXattrName(t, nameAddr) if err != nil { - return "", err + return 0, "", err } if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP + return 0, "", syserror.EOPNOTSUPP } - return d.Inode.GetXattr(t, name, size) + // If getxattr(2) is called with size 0, the size of the value will be + // returned successfully even if it is nonzero. In that case, we need to + // retrieve the entire attribute value so we can return the correct size. + requestedSize := size + if size == 0 || size > linux.XATTR_SIZE_MAX { + requestedSize = linux.XATTR_SIZE_MAX + } + + value, err := d.Inode.GetXattr(t, name, requestedSize) + if err != nil { + return 0, "", err + } + n := len(value) + if uint64(n) > requestedSize { + return 0, "", syserror.ERANGE + } + + // Don't copy out the attribute value if size is 0. + if size == 0 { + return n, "", nil + } + return n, value, nil } // SetXattr implements linux syscall setxattr(2). func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return setXattrFromPath(t, args, true) +} + +// LSetXattr implements linux syscall lsetxattr(2). +func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return setXattrFromPath(t, args, false) +} + +// FSetXattr implements linux syscall fsetxattr(2). +func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := uint64(args[3].SizeT()) + flags := args[4].Uint() + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags) +} + +func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() @@ -105,19 +171,19 @@ func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return 0, nil, syserror.EINVAL - } + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } - return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { - return setXattr(t, d, dirPath, nameAddr, valueAddr, uint64(size), flags) + return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags) }) } // setXattr implements setxattr(2) from the given *fs.Dirent. -func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error { - if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR +func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error { + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return syserror.EINVAL } if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { @@ -133,7 +199,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us return syserror.E2BIG } buf := make([]byte, size) - if _, err = t.CopyInBytes(valueAddr, buf); err != nil { + if _, err := t.CopyInBytes(valueAddr, buf); err != nil { return err } value := string(buf) diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc index b3bc3463e..e77c355d7 100644 --- a/test/syscalls/linux/xattr.cc +++ b/test/syscalls/linux/xattr.cc @@ -26,6 +26,7 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/file_base.h" #include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -414,6 +415,46 @@ TEST_F(XattrTest, GetxattrNonexistentName) { EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); } +TEST_F(XattrTest, LGetSetxattrOnSymlink) { + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(dir.path(), test_file_name_)); + + EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0), + SyscallFailsWithErrno(ENODATA)); +} + +TEST_F(XattrTest, LGetSetxattrOnNonsymlink) { + const char* path = test_file_name_.c_str(); + const char name[] = "user.test"; + int val = 1234; + size_t size = sizeof(val); + EXPECT_THAT(lsetxattr(path, name, &val, size, /*flags=*/0), + SyscallSucceeds()); + + int buf = 0; + EXPECT_THAT(lgetxattr(path, name, &buf, size), + SyscallSucceedsWithValue(size)); + EXPECT_EQ(buf, val); +} + +TEST_F(XattrTest, FGetSetxattr) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0)); + const char name[] = "user.test"; + int val = 1234; + size_t size = sizeof(val); + EXPECT_THAT(fsetxattr(fd.get(), name, &val, size, /*flags=*/0), + SyscallSucceeds()); + + int buf = 0; + EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size), + SyscallSucceedsWithValue(size)); + EXPECT_EQ(buf, val); +} + } // namespace } // namespace testing -- cgit v1.2.3