summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGhanan Gowripalan <ghanan@google.com>2021-09-09 17:07:37 -0700
committergVisor bot <gvisor-bot@google.com>2021-09-09 17:10:17 -0700
commit2572af38860ed8a5aa37f0ad705a360628a4ed10 (patch)
tree88f7ac7db4797b5d1437ed4cf9f0dc37bad57e70
parent833d933afda03706328ac556d08294a78e372a6a (diff)
Remove linux-compat loopback hacks from packet endpoint
Previously, gVisor did not represent loopback devices as an ethernet device as Linux does. To maintain Linux API compatibility for packet sockets, a workaround was used to add an ethernet header if a link header was not already present in the packet buffer delivered to a packet endpoint. However, this workaround is a bug for non-ethernet based interfaces; not all links use an ethernet header (e.g. pure L3/TUN interfaces). As of 3b4bb947517d0d9010120aaa1c3989fd6abf278e, gVisor represents loopback devices as an ethernet-based device so this workaround can now be removed. BUG: https://fxbug.dev/81592 Updates #6530, #6531. PiperOrigin-RevId: 395819151
-rw-r--r--pkg/sentry/socket/socket.go2
-rw-r--r--pkg/tcpip/link/channel/channel.go23
-rw-r--r--pkg/tcpip/transport/packet/endpoint.go86
-rw-r--r--test/syscalls/linux/tuntap.cc210
4 files changed, 209 insertions, 112 deletions
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 83b9d9389..841d5bd55 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -743,6 +743,8 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
a.UnmarshalUnsafe(addr[:sockAddrLinkSize])
+ // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have
+ // an ethernet address.
if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index d02eea93c..658557d62 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -28,7 +28,9 @@ import (
// PacketInfo holds all the information about an outbound packet.
type PacketInfo struct {
- Pkt *stack.PacketBuffer
+ Pkt *stack.PacketBuffer
+
+ // TODO(https://gvisor.dev/issue/6537): Remove these fields.
Proto tcpip.NetworkProtocolNumber
Route stack.RouteInfo
}
@@ -244,7 +246,10 @@ func (e *Endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocol
Route: r,
}
- e.q.Write(p)
+ // Write returns false if the queue is full. A full queue is not an error
+ // from the perspective of a LinkEndpoint so we ignore Write's return
+ // value and always return nil from this method.
+ _ = e.q.Write(p)
return nil
}
@@ -292,4 +297,16 @@ func (*Endpoint) AddHeader(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkPr
}
// WriteRawPacket implements stack.LinkEndpoint.
-func (*Endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} }
+func (e *Endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error {
+ p := PacketInfo{
+ Pkt: pkt,
+ Proto: pkt.NetworkProtocolNumber,
+ }
+
+ // Write returns false if the queue is full. A full queue is not an error
+ // from the perspective of a LinkEndpoint so we ignore Write's return
+ // value and always return nil from this method.
+ _ = e.q.Write(p)
+
+ return nil
+}
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 89b4720aa..0554d2f4a 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -25,7 +25,6 @@
package packet
import (
- "fmt"
"io"
"time"
@@ -424,76 +423,39 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress,
wasEmpty := ep.rcvBufSize == 0
- // Push new packet into receive list and increment the buffer size.
- var packet packet
+ rcvdPkt := packet{
+ packetInfo: tcpip.LinkPacketInfo{
+ Protocol: netProto,
+ PktType: pkt.PktType,
+ },
+ senderAddr: tcpip.FullAddress{
+ NIC: nicID,
+ },
+ receivedAt: ep.stack.Clock().Now(),
+ }
+
if !pkt.LinkHeader().View().IsEmpty() {
- // Get info directly from the ethernet header.
hdr := header.Ethernet(pkt.LinkHeader().View())
- packet.senderAddr = tcpip.FullAddress{
- NIC: nicID,
- Addr: tcpip.Address(hdr.SourceAddress()),
- }
- packet.packetInfo.Protocol = netProto
- packet.packetInfo.PktType = pkt.PktType
- } else {
- // Guess the would-be ethernet header.
- packet.senderAddr = tcpip.FullAddress{
- NIC: nicID,
- Addr: tcpip.Address(localAddr),
- }
- packet.packetInfo.Protocol = netProto
- packet.packetInfo.PktType = pkt.PktType
+ rcvdPkt.senderAddr.Addr = tcpip.Address(hdr.SourceAddress())
}
if ep.cooked {
- // Cooked packets can simply be queued.
- switch pkt.PktType {
- case tcpip.PacketHost:
- packet.data = pkt.Data().ExtractVV()
- case tcpip.PacketOutgoing:
- // Strip Link Header.
- var combinedVV buffer.VectorisedView
- if v := pkt.NetworkHeader().View(); !v.IsEmpty() {
- combinedVV.AppendView(v)
- }
- if v := pkt.TransportHeader().View(); !v.IsEmpty() {
- combinedVV.AppendView(v)
- }
- combinedVV.Append(pkt.Data().ExtractVV())
- packet.data = combinedVV
- default:
- panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt))
+ // Cooked packet endpoints don't include the link-headers in received
+ // packets.
+ if v := pkt.NetworkHeader().View(); !v.IsEmpty() {
+ rcvdPkt.data.AppendView(v)
}
- } else {
- // Raw packets need their ethernet headers prepended before
- // queueing.
- var linkHeader buffer.View
- if pkt.PktType != tcpip.PacketOutgoing {
- if pkt.LinkHeader().View().IsEmpty() {
- // We weren't provided with an actual ethernet header,
- // so fake one.
- ethFields := header.EthernetFields{
- SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}),
- DstAddr: localAddr,
- Type: netProto,
- }
- fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
- fakeHeader.Encode(&ethFields)
- linkHeader = buffer.View(fakeHeader)
- } else {
- linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...)
- }
- combinedVV := linkHeader.ToVectorisedView()
- combinedVV.Append(pkt.Data().ExtractVV())
- packet.data = combinedVV
- } else {
- packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views())
+ if v := pkt.TransportHeader().View(); !v.IsEmpty() {
+ rcvdPkt.data.AppendView(v)
}
+ rcvdPkt.data.Append(pkt.Data().ExtractVV())
+ } else {
+ // Raw packet endpoints include link-headers in received packets.
+ rcvdPkt.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views())
}
- packet.receivedAt = ep.stack.Clock().Now()
- ep.rcvList.PushBack(&packet)
- ep.rcvBufSize += packet.data.Size()
+ ep.rcvList.PushBack(&rcvdPkt)
+ ep.rcvBufSize += rcvdPkt.data.Size()
ep.rcvMu.Unlock()
ep.stats.PacketsReceived.Increment()
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index 7c9c5c870..956208bee 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -73,29 +73,14 @@ PosixErrorOr<Link> GetLinkByName(const std::string& name) {
return PosixError(ENOENT, "interface not found");
}
-struct pihdr {
- uint16_t pi_flags;
- uint16_t pi_protocol;
-} __attribute__((packed));
-
-struct ping_pkt {
- pihdr pi;
- struct ethhdr eth;
- struct iphdr ip;
- struct icmphdr icmp;
+struct ping_ip_pkt {
+ iphdr ip;
+ icmphdr icmp;
char payload[64];
} __attribute__((packed));
-ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip,
- const uint8_t dstmac[ETH_ALEN],
- const in_addr_t dstip) {
- ping_pkt pkt = {};
-
- pkt.pi.pi_protocol = htons(ETH_P_IP);
-
- memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
- memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
- pkt.eth.h_proto = htons(ETH_P_IP);
+ping_ip_pkt CreatePingIPPacket(const in_addr_t srcip, const in_addr_t dstip) {
+ ping_ip_pkt pkt = {};
pkt.ip.ihl = 5;
pkt.ip.version = 4;
@@ -122,6 +107,33 @@ ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip,
return pkt;
}
+struct pihdr {
+ uint16_t pi_flags;
+ uint16_t pi_protocol;
+} __attribute__((packed));
+
+struct ping_pkt {
+ pihdr pi;
+ ethhdr eth;
+ ping_ip_pkt ip_pkt;
+} __attribute__((packed));
+
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip,
+ const uint8_t dstmac[ETH_ALEN],
+ const in_addr_t dstip) {
+ ping_pkt pkt = {};
+
+ pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+ memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+ memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+ pkt.eth.h_proto = htons(ETH_P_IP);
+
+ pkt.ip_pkt = CreatePingIPPacket(srcip, dstip);
+
+ return pkt;
+}
+
struct arp_pkt {
pihdr pi;
struct ethhdr eth;
@@ -274,13 +286,26 @@ TEST_F(TuntapTest, WriteToDownDevice) {
EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
}
-PosixErrorOr<FileDescriptor> OpenAndAttachTap(const std::string& dev_name,
- const in_addr_t dev_addr) {
+struct TunTapInterface {
+ FileDescriptor fd;
+ Link link;
+};
+
+PosixErrorOr<TunTapInterface> OpenAndAttachTunTap(const std::string& dev_name,
+ const in_addr_t dev_addr,
+ bool tap, bool no_pi) {
// Interface creation.
ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR));
struct ifreq ifr_set = {};
- ifr_set.ifr_flags = IFF_TAP;
+ if (tap) {
+ ifr_set.ifr_flags |= IFF_TAP;
+ } else {
+ ifr_set.ifr_flags |= IFF_TUN;
+ }
+ if (no_pi) {
+ ifr_set.ifr_flags |= IFF_NO_PI;
+ }
strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ);
if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) {
return PosixError(errno);
@@ -296,13 +321,15 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(const std::string& dev_name,
if (!IsRunningOnGvisor()) {
// FIXME(b/110961832): gVisor doesn't support setting MAC address on
// interfaces yet.
- RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA)));
+ if (tap) {
+ RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA)));
+ }
// FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP));
}
- return fd;
+ return TunTapInterface{.fd = std::move(fd), .link = std::move(link)};
}
// This test sets up a TAP device and pings kernel by sending ICMP echo request.
@@ -322,8 +349,9 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(const std::string& dev_name,
TEST_F(TuntapTest, PingKernel) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
- FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr));
+ const auto& [fd, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap(
+ kTapName, kTapIPAddr, true /* tap */, false /* no_pi */));
+
ping_pkt ping_req =
CreatePingPacket(kMacB, kTapPeerIPAddr, kMacA, kTapIPAddr);
std::string arp_rep =
@@ -365,10 +393,10 @@ TEST_F(TuntapTest, PingKernel) {
// Process ping response packet.
if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol &&
- r.ping.ip.protocol == ping_req.ip.protocol &&
- !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
- !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
- r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+ r.ping.ip_pkt.ip.protocol == ping_req.ip_pkt.ip.protocol &&
+ !memcmp(&r.ping.ip_pkt.ip.saddr, &ping_req.ip_pkt.ip.daddr, kIPLen) &&
+ !memcmp(&r.ping.ip_pkt.ip.daddr, &ping_req.ip_pkt.ip.saddr, kIPLen) &&
+ r.ping.ip_pkt.icmp.type == 0 && r.ping.ip_pkt.icmp.code == 0) {
// Ends and passes the test.
break;
}
@@ -378,8 +406,8 @@ TEST_F(TuntapTest, PingKernel) {
TEST_F(TuntapTest, SendUdpTriggersArpResolution) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
- FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr));
+ const auto& [fd, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap(
+ kTapName, kTapIPAddr, true /* tap */, false /* no_pi */));
// Send a UDP packet to remote.
int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
@@ -433,19 +461,17 @@ TEST_F(TuntapTest, TUNNoPacketInfo) {
EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, 24, &dev_ipv4_addr,
sizeof(dev_ipv4_addr)));
- ping_pkt ping_req =
- CreatePingPacket(kMacB, kTapPeerIPAddr, kMacA, kTapIPAddr);
- size_t packet_size = sizeof(ping_req) - offsetof(ping_pkt, ip);
+ ping_ip_pkt ping_req = CreatePingIPPacket(kTapPeerIPAddr, kTapIPAddr);
// Send ICMP query
- EXPECT_THAT(write(fd.get(), &ping_req.ip, packet_size),
- SyscallSucceedsWithValue(packet_size));
+ EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
// Receive loop to process inbound packets.
while (1) {
- ping_pkt ping_resp = {};
- EXPECT_THAT(read(fd.get(), &ping_resp.ip, packet_size),
- SyscallSucceedsWithValue(packet_size));
+ ping_ip_pkt ping_resp = {};
+ EXPECT_THAT(read(fd.get(), &ping_resp, sizeof(ping_req)),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
// Process ping response packet.
if (!memcmp(&ping_resp.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
@@ -465,8 +491,8 @@ TEST_F(TuntapTest, TCPBlockingConnectFailsArpResolution) {
FileDescriptor sender =
ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
- FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr));
+ const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap(
+ kTapName, kTapIPAddr, true /* tap */, false /* no_pi */));
sockaddr_in connect_addr = {
.sin_family = AF_INET,
@@ -487,8 +513,8 @@ TEST_F(TuntapTest, TCPNonBlockingConnectFailsArpResolution) {
FileDescriptor sender = ASSERT_NO_ERRNO_AND_VALUE(
Socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
- FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr));
+ const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap(
+ kTapName, kTapIPAddr, true /* tap */, false /* no_pi */));
sockaddr_in connect_addr = {
.sin_family = AF_INET,
@@ -518,8 +544,8 @@ TEST_F(TuntapTest, TCPNonBlockingConnectFailsArpResolution) {
TEST_F(TuntapTest, WriteHangBug155928773) {
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
- FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr));
+ const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap(
+ kTapName, kTapIPAddr, true /* tap */, false /* no_pi */));
int sock = socket(AF_INET, SOCK_DGRAM, 0);
ASSERT_THAT(sock, SyscallSucceeds());
@@ -534,5 +560,95 @@ TEST_F(TuntapTest, WriteHangBug155928773) {
write(sock, "hello", 5);
}
+// Test that raw packet sockets do not need/include link headers when
+// sending/receiving packets to/from pure L3 (e.g. TUN) interfaces.
+TEST_F(TuntapTest, RawPacketSocket) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ auto [tun, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap(
+ kTunName, kTapIPAddr, false /* tap */, true /* no_pi */));
+ FileDescriptor packet_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP)));
+
+ constexpr int kInfiniteTimeout = -1;
+
+ uint8_t hardware_address_length = 0;
+ if (IsRunningOnGvisor()) {
+ // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have
+ // an ethernet address.
+ hardware_address_length = ETH_ALEN;
+ }
+
+ {
+ const ping_ip_pkt ping_req = CreatePingIPPacket(kTapPeerIPAddr, kTapIPAddr);
+ ASSERT_THAT(write(tun.get(), &ping_req, sizeof(ping_req)),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
+ // Wait for the packet socket to become readable.
+ pollfd pfd = {
+ .fd = packet_sock.get(),
+ .events = POLLIN,
+ };
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout),
+ SyscallSucceedsWithValue(1));
+
+ char read_buf[sizeof(ping_req) + 1];
+ struct sockaddr_ll src;
+ socklen_t src_len = sizeof(src);
+ ASSERT_THAT(recvfrom(packet_sock.get(), read_buf, sizeof(read_buf), 0,
+ reinterpret_cast<struct sockaddr*>(&src), &src_len),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
+ EXPECT_EQ(memcmp(read_buf, &ping_req, sizeof(ping_req)), 0);
+ ASSERT_EQ(src_len, sizeof(src));
+ EXPECT_EQ(src.sll_family, AF_PACKET);
+ EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP);
+ EXPECT_EQ(src.sll_ifindex, link.index);
+ EXPECT_EQ(src.sll_pkttype, PACKET_HOST);
+ EXPECT_EQ(src.sll_halen, hardware_address_length);
+ if (IsRunningOnGvisor()) {
+ // TODO(https://gvisor.dev/issue/6531): Check this field for the right
+ // hardware type.
+ EXPECT_EQ(src.sll_hatype, 0);
+ } else {
+ EXPECT_EQ(src.sll_hatype, ARPHRD_NONE);
+ }
+ }
+
+ {
+ const struct sockaddr_ll dest = {
+ .sll_family = AF_PACKET,
+ .sll_protocol = htons(ETH_P_IP),
+ .sll_ifindex = link.index,
+ .sll_halen = hardware_address_length,
+ };
+
+ const ping_ip_pkt ping_req = CreatePingIPPacket(kTapIPAddr, kTapPeerIPAddr);
+ ASSERT_THAT(
+ sendto(packet_sock.get(), &ping_req, sizeof(ping_req), 0,
+ reinterpret_cast<const struct sockaddr*>(&dest), sizeof(dest)),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
+
+ // Loop until we receive the packet we expect - the kernel may send packets
+ // we do not care about.
+ while (true) {
+ // Wait for the TUN interface to become readable.
+ pollfd pfd = {
+ .fd = tun.get(),
+ .events = POLLIN,
+ };
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout),
+ SyscallSucceedsWithValue(1));
+
+ char read_buf[sizeof(ping_req) + 1];
+ int n = read(tun.get(), &read_buf, sizeof(read_buf));
+ ASSERT_THAT(n, SyscallSucceeds());
+ if (n == sizeof(ping_req) &&
+ memcmp(read_buf, &ping_req, sizeof(ping_req)) == 0) {
+ break;
+ }
+ }
+ }
+}
+
} // namespace testing
} // namespace gvisor