From 2572af38860ed8a5aa37f0ad705a360628a4ed10 Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Thu, 9 Sep 2021 17:07:37 -0700 Subject: Remove linux-compat loopback hacks from packet endpoint Previously, gVisor did not represent loopback devices as an ethernet device as Linux does. To maintain Linux API compatibility for packet sockets, a workaround was used to add an ethernet header if a link header was not already present in the packet buffer delivered to a packet endpoint. However, this workaround is a bug for non-ethernet based interfaces; not all links use an ethernet header (e.g. pure L3/TUN interfaces). As of 3b4bb947517d0d9010120aaa1c3989fd6abf278e, gVisor represents loopback devices as an ethernet-based device so this workaround can now be removed. BUG: https://fxbug.dev/81592 Updates #6530, #6531. PiperOrigin-RevId: 395819151 --- pkg/sentry/socket/socket.go | 2 + pkg/tcpip/link/channel/channel.go | 23 +++- pkg/tcpip/transport/packet/endpoint.go | 86 ++++---------- test/syscalls/linux/tuntap.cc | 210 +++++++++++++++++++++++++-------- 4 files changed, 209 insertions(+), 112 deletions(-) diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 83b9d9389..841d5bd55 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -743,6 +743,8 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr[:sockAddrLinkSize]) + // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have + // an ethernet address. if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index d02eea93c..658557d62 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -28,7 +28,9 @@ import ( // PacketInfo holds all the information about an outbound packet. type PacketInfo struct { - Pkt *stack.PacketBuffer + Pkt *stack.PacketBuffer + + // TODO(https://gvisor.dev/issue/6537): Remove these fields. Proto tcpip.NetworkProtocolNumber Route stack.RouteInfo } @@ -244,7 +246,10 @@ func (e *Endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocol Route: r, } - e.q.Write(p) + // Write returns false if the queue is full. A full queue is not an error + // from the perspective of a LinkEndpoint so we ignore Write's return + // value and always return nil from this method. + _ = e.q.Write(p) return nil } @@ -292,4 +297,16 @@ func (*Endpoint) AddHeader(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkPr } // WriteRawPacket implements stack.LinkEndpoint. -func (*Endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } +func (e *Endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + p := PacketInfo{ + Pkt: pkt, + Proto: pkt.NetworkProtocolNumber, + } + + // Write returns false if the queue is full. A full queue is not an error + // from the perspective of a LinkEndpoint so we ignore Write's return + // value and always return nil from this method. + _ = e.q.Write(p) + + return nil +} diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 89b4720aa..0554d2f4a 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -25,7 +25,6 @@ package packet import ( - "fmt" "io" "time" @@ -424,76 +423,39 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, wasEmpty := ep.rcvBufSize == 0 - // Push new packet into receive list and increment the buffer size. - var packet packet + rcvdPkt := packet{ + packetInfo: tcpip.LinkPacketInfo{ + Protocol: netProto, + PktType: pkt.PktType, + }, + senderAddr: tcpip.FullAddress{ + NIC: nicID, + }, + receivedAt: ep.stack.Clock().Now(), + } + if !pkt.LinkHeader().View().IsEmpty() { - // Get info directly from the ethernet header. hdr := header.Ethernet(pkt.LinkHeader().View()) - packet.senderAddr = tcpip.FullAddress{ - NIC: nicID, - Addr: tcpip.Address(hdr.SourceAddress()), - } - packet.packetInfo.Protocol = netProto - packet.packetInfo.PktType = pkt.PktType - } else { - // Guess the would-be ethernet header. - packet.senderAddr = tcpip.FullAddress{ - NIC: nicID, - Addr: tcpip.Address(localAddr), - } - packet.packetInfo.Protocol = netProto - packet.packetInfo.PktType = pkt.PktType + rcvdPkt.senderAddr.Addr = tcpip.Address(hdr.SourceAddress()) } if ep.cooked { - // Cooked packets can simply be queued. - switch pkt.PktType { - case tcpip.PacketHost: - packet.data = pkt.Data().ExtractVV() - case tcpip.PacketOutgoing: - // Strip Link Header. - var combinedVV buffer.VectorisedView - if v := pkt.NetworkHeader().View(); !v.IsEmpty() { - combinedVV.AppendView(v) - } - if v := pkt.TransportHeader().View(); !v.IsEmpty() { - combinedVV.AppendView(v) - } - combinedVV.Append(pkt.Data().ExtractVV()) - packet.data = combinedVV - default: - panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) + // Cooked packet endpoints don't include the link-headers in received + // packets. + if v := pkt.NetworkHeader().View(); !v.IsEmpty() { + rcvdPkt.data.AppendView(v) } - } else { - // Raw packets need their ethernet headers prepended before - // queueing. - var linkHeader buffer.View - if pkt.PktType != tcpip.PacketOutgoing { - if pkt.LinkHeader().View().IsEmpty() { - // We weren't provided with an actual ethernet header, - // so fake one. - ethFields := header.EthernetFields{ - SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), - DstAddr: localAddr, - Type: netProto, - } - fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) - fakeHeader.Encode(ðFields) - linkHeader = buffer.View(fakeHeader) - } else { - linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...) - } - combinedVV := linkHeader.ToVectorisedView() - combinedVV.Append(pkt.Data().ExtractVV()) - packet.data = combinedVV - } else { - packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) + if v := pkt.TransportHeader().View(); !v.IsEmpty() { + rcvdPkt.data.AppendView(v) } + rcvdPkt.data.Append(pkt.Data().ExtractVV()) + } else { + // Raw packet endpoints include link-headers in received packets. + rcvdPkt.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) } - packet.receivedAt = ep.stack.Clock().Now() - ep.rcvList.PushBack(&packet) - ep.rcvBufSize += packet.data.Size() + ep.rcvList.PushBack(&rcvdPkt) + ep.rcvBufSize += rcvdPkt.data.Size() ep.rcvMu.Unlock() ep.stats.PacketsReceived.Increment() diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc index 7c9c5c870..956208bee 100644 --- a/test/syscalls/linux/tuntap.cc +++ b/test/syscalls/linux/tuntap.cc @@ -73,29 +73,14 @@ PosixErrorOr GetLinkByName(const std::string& name) { return PosixError(ENOENT, "interface not found"); } -struct pihdr { - uint16_t pi_flags; - uint16_t pi_protocol; -} __attribute__((packed)); - -struct ping_pkt { - pihdr pi; - struct ethhdr eth; - struct iphdr ip; - struct icmphdr icmp; +struct ping_ip_pkt { + iphdr ip; + icmphdr icmp; char payload[64]; } __attribute__((packed)); -ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip, - const uint8_t dstmac[ETH_ALEN], - const in_addr_t dstip) { - ping_pkt pkt = {}; - - pkt.pi.pi_protocol = htons(ETH_P_IP); - - memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest)); - memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source)); - pkt.eth.h_proto = htons(ETH_P_IP); +ping_ip_pkt CreatePingIPPacket(const in_addr_t srcip, const in_addr_t dstip) { + ping_ip_pkt pkt = {}; pkt.ip.ihl = 5; pkt.ip.version = 4; @@ -122,6 +107,33 @@ ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip, return pkt; } +struct pihdr { + uint16_t pi_flags; + uint16_t pi_protocol; +} __attribute__((packed)); + +struct ping_pkt { + pihdr pi; + ethhdr eth; + ping_ip_pkt ip_pkt; +} __attribute__((packed)); + +ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip, + const uint8_t dstmac[ETH_ALEN], + const in_addr_t dstip) { + ping_pkt pkt = {}; + + pkt.pi.pi_protocol = htons(ETH_P_IP); + + memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest)); + memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source)); + pkt.eth.h_proto = htons(ETH_P_IP); + + pkt.ip_pkt = CreatePingIPPacket(srcip, dstip); + + return pkt; +} + struct arp_pkt { pihdr pi; struct ethhdr eth; @@ -274,13 +286,26 @@ TEST_F(TuntapTest, WriteToDownDevice) { EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO)); } -PosixErrorOr OpenAndAttachTap(const std::string& dev_name, - const in_addr_t dev_addr) { +struct TunTapInterface { + FileDescriptor fd; + Link link; +}; + +PosixErrorOr OpenAndAttachTunTap(const std::string& dev_name, + const in_addr_t dev_addr, + bool tap, bool no_pi) { // Interface creation. ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR)); struct ifreq ifr_set = {}; - ifr_set.ifr_flags = IFF_TAP; + if (tap) { + ifr_set.ifr_flags |= IFF_TAP; + } else { + ifr_set.ifr_flags |= IFF_TUN; + } + if (no_pi) { + ifr_set.ifr_flags |= IFF_NO_PI; + } strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ); if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) { return PosixError(errno); @@ -296,13 +321,15 @@ PosixErrorOr OpenAndAttachTap(const std::string& dev_name, if (!IsRunningOnGvisor()) { // FIXME(b/110961832): gVisor doesn't support setting MAC address on // interfaces yet. - RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA))); + if (tap) { + RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA))); + } // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces. RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP)); } - return fd; + return TunTapInterface{.fd = std::move(fd), .link = std::move(link)}; } // This test sets up a TAP device and pings kernel by sending ICMP echo request. @@ -322,8 +349,9 @@ PosixErrorOr OpenAndAttachTap(const std::string& dev_name, TEST_F(TuntapTest, PingKernel) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto& [fd, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); + ping_pkt ping_req = CreatePingPacket(kMacB, kTapPeerIPAddr, kMacA, kTapIPAddr); std::string arp_rep = @@ -365,10 +393,10 @@ TEST_F(TuntapTest, PingKernel) { // Process ping response packet. if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol && - r.ping.ip.protocol == ping_req.ip.protocol && - !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) && - !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) && - r.ping.icmp.type == 0 && r.ping.icmp.code == 0) { + r.ping.ip_pkt.ip.protocol == ping_req.ip_pkt.ip.protocol && + !memcmp(&r.ping.ip_pkt.ip.saddr, &ping_req.ip_pkt.ip.daddr, kIPLen) && + !memcmp(&r.ping.ip_pkt.ip.daddr, &ping_req.ip_pkt.ip.saddr, kIPLen) && + r.ping.ip_pkt.icmp.type == 0 && r.ping.ip_pkt.icmp.code == 0) { // Ends and passes the test. break; } @@ -378,8 +406,8 @@ TEST_F(TuntapTest, PingKernel) { TEST_F(TuntapTest, SendUdpTriggersArpResolution) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto& [fd, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); // Send a UDP packet to remote. int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); @@ -433,19 +461,17 @@ TEST_F(TuntapTest, TUNNoPacketInfo) { EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, 24, &dev_ipv4_addr, sizeof(dev_ipv4_addr))); - ping_pkt ping_req = - CreatePingPacket(kMacB, kTapPeerIPAddr, kMacA, kTapIPAddr); - size_t packet_size = sizeof(ping_req) - offsetof(ping_pkt, ip); + ping_ip_pkt ping_req = CreatePingIPPacket(kTapPeerIPAddr, kTapIPAddr); // Send ICMP query - EXPECT_THAT(write(fd.get(), &ping_req.ip, packet_size), - SyscallSucceedsWithValue(packet_size)); + EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)), + SyscallSucceedsWithValue(sizeof(ping_req))); // Receive loop to process inbound packets. while (1) { - ping_pkt ping_resp = {}; - EXPECT_THAT(read(fd.get(), &ping_resp.ip, packet_size), - SyscallSucceedsWithValue(packet_size)); + ping_ip_pkt ping_resp = {}; + EXPECT_THAT(read(fd.get(), &ping_resp, sizeof(ping_req)), + SyscallSucceedsWithValue(sizeof(ping_req))); // Process ping response packet. if (!memcmp(&ping_resp.ip.saddr, &ping_req.ip.daddr, kIPLen) && @@ -465,8 +491,8 @@ TEST_F(TuntapTest, TCPBlockingConnectFailsArpResolution) { FileDescriptor sender = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); sockaddr_in connect_addr = { .sin_family = AF_INET, @@ -487,8 +513,8 @@ TEST_F(TuntapTest, TCPNonBlockingConnectFailsArpResolution) { FileDescriptor sender = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); sockaddr_in connect_addr = { .sin_family = AF_INET, @@ -518,8 +544,8 @@ TEST_F(TuntapTest, TCPNonBlockingConnectFailsArpResolution) { TEST_F(TuntapTest, WriteHangBug155928773) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); int sock = socket(AF_INET, SOCK_DGRAM, 0); ASSERT_THAT(sock, SyscallSucceeds()); @@ -534,5 +560,95 @@ TEST_F(TuntapTest, WriteHangBug155928773) { write(sock, "hello", 5); } +// Test that raw packet sockets do not need/include link headers when +// sending/receiving packets to/from pure L3 (e.g. TUN) interfaces. +TEST_F(TuntapTest, RawPacketSocket) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + auto [tun, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTunName, kTapIPAddr, false /* tap */, true /* no_pi */)); + FileDescriptor packet_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP))); + + constexpr int kInfiniteTimeout = -1; + + uint8_t hardware_address_length = 0; + if (IsRunningOnGvisor()) { + // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have + // an ethernet address. + hardware_address_length = ETH_ALEN; + } + + { + const ping_ip_pkt ping_req = CreatePingIPPacket(kTapPeerIPAddr, kTapIPAddr); + ASSERT_THAT(write(tun.get(), &ping_req, sizeof(ping_req)), + SyscallSucceedsWithValue(sizeof(ping_req))); + // Wait for the packet socket to become readable. + pollfd pfd = { + .fd = packet_sock.get(), + .events = POLLIN, + }; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); + + char read_buf[sizeof(ping_req) + 1]; + struct sockaddr_ll src; + socklen_t src_len = sizeof(src); + ASSERT_THAT(recvfrom(packet_sock.get(), read_buf, sizeof(read_buf), 0, + reinterpret_cast(&src), &src_len), + SyscallSucceedsWithValue(sizeof(ping_req))); + EXPECT_EQ(memcmp(read_buf, &ping_req, sizeof(ping_req)), 0); + ASSERT_EQ(src_len, sizeof(src)); + EXPECT_EQ(src.sll_family, AF_PACKET); + EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); + EXPECT_EQ(src.sll_ifindex, link.index); + EXPECT_EQ(src.sll_pkttype, PACKET_HOST); + EXPECT_EQ(src.sll_halen, hardware_address_length); + if (IsRunningOnGvisor()) { + // TODO(https://gvisor.dev/issue/6531): Check this field for the right + // hardware type. + EXPECT_EQ(src.sll_hatype, 0); + } else { + EXPECT_EQ(src.sll_hatype, ARPHRD_NONE); + } + } + + { + const struct sockaddr_ll dest = { + .sll_family = AF_PACKET, + .sll_protocol = htons(ETH_P_IP), + .sll_ifindex = link.index, + .sll_halen = hardware_address_length, + }; + + const ping_ip_pkt ping_req = CreatePingIPPacket(kTapIPAddr, kTapPeerIPAddr); + ASSERT_THAT( + sendto(packet_sock.get(), &ping_req, sizeof(ping_req), 0, + reinterpret_cast(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(ping_req))); + + // Loop until we receive the packet we expect - the kernel may send packets + // we do not care about. + while (true) { + // Wait for the TUN interface to become readable. + pollfd pfd = { + .fd = tun.get(), + .events = POLLIN, + }; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); + + char read_buf[sizeof(ping_req) + 1]; + int n = read(tun.get(), &read_buf, sizeof(read_buf)); + ASSERT_THAT(n, SyscallSucceeds()); + if (n == sizeof(ping_req) && + memcmp(read_buf, &ping_req, sizeof(ping_req)) == 0) { + break; + } + } + } +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3