// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ipv4 contains the implementation of the ipv4 network protocol. To use // it in the networking stack, this package must be added to the project, and // activated on the stack by passing ipv4.NewProtocol() as one of the network // protocols when calling stack.New(). Then endpoints can be created by passing // ipv4.ProtocolNumber as the network protocol number when calling // Stack.NewEndpoint(). package ipv4 import ( "sync/atomic" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // ProtocolNumber is the ipv4 protocol number. ProtocolNumber = header.IPv4ProtocolNumber // MaxTotalSize is maximum size that can be encoded in the 16-bit // TotalLength field of the ipv4 header. MaxTotalSize = 0xffff // DefaultTTL is the default time-to-live value for this endpoint. DefaultTTL = 64 // buckets is the number of identifier buckets. buckets = 2048 ) type endpoint struct { nicid tcpip.NICID id stack.NetworkEndpointID prefixLen int linkEP stack.LinkEndpoint dispatcher stack.TransportDispatcher fragmentation *fragmentation.Fragmentation protocol *protocol } // NewEndpoint creates a new ipv4 endpoint. func (p *protocol) NewEndpoint(nicid tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { e := &endpoint{ nicid: nicid, id: stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address}, prefixLen: addrWithPrefix.PrefixLen, linkEP: linkEP, dispatcher: dispatcher, fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), protocol: p, } return e, nil } // DefaultTTL is the default time-to-live value for this endpoint. func (e *endpoint) DefaultTTL() uint8 { return e.protocol.DefaultTTL() } // MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus // the network layer max header length. func (e *endpoint) MTU() uint32 { return calculateMTU(e.linkEP.MTU()) } // Capabilities implements stack.NetworkEndpoint.Capabilities. func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.linkEP.Capabilities() } // NICID returns the ID of the NIC this endpoint belongs to. func (e *endpoint) NICID() tcpip.NICID { return e.nicid } // ID returns the ipv4 endpoint ID. func (e *endpoint) ID() *stack.NetworkEndpointID { return &e.id } // PrefixLen returns the ipv4 endpoint subnet prefix length in bits. func (e *endpoint) PrefixLen() int { return e.prefixLen } // MaxHeaderLength returns the maximum length needed by ipv4 headers (and // underlying protocols). func (e *endpoint) MaxHeaderLength() uint16 { return e.linkEP.MaxHeaderLength() + header.IPv4MinimumSize } // GSOMaxSize returns the maximum GSO packet size. func (e *endpoint) GSOMaxSize() uint32 { if gso, ok := e.linkEP.(stack.GSOEndpoint); ok { return gso.GSOMaxSize() } return 0 } // writePacketFragments calls e.linkEP.WritePacket with each packet fragment to // write. It assumes that the IP header is entirely in hdr but does not assume // that only the IP header is in hdr. It assumes that the input packet's stated // length matches the length of the hdr+payload. mtu includes the IP header and // options. This does not support the DontFragment IP flag. func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, mtu int) *tcpip.Error { // This packet is too big, it needs to be fragmented. ip := header.IPv4(hdr.View()) flags := ip.Flags() // Update mtu to take into account the header, which will exist in all // fragments anyway. innerMTU := mtu - int(ip.HeaderLength()) // Round the MTU down to align to 8 bytes. Then calculate the number of // fragments. Calculate fragment sizes as in RFC791. innerMTU &^= 7 n := (int(ip.PayloadLength()) + innerMTU - 1) / innerMTU outerMTU := innerMTU + int(ip.HeaderLength()) offset := ip.FragmentOffset() originalAvailableLength := hdr.AvailableLength() for i := 0; i < n; i++ { // Where possible, the first fragment that is sent has the same // hdr.UsedLength() as the input packet. The link-layer endpoint may depends // on this for looking at, eg, L4 headers. h := ip if i > 0 { hdr = buffer.NewPrependable(int(ip.HeaderLength()) + originalAvailableLength) h = header.IPv4(hdr.Prepend(int(ip.HeaderLength()))) copy(h, ip[:ip.HeaderLength()]) } if i != n-1 { h.SetTotalLength(uint16(outerMTU)) h.SetFlagsFragmentOffset(flags|header.IPv4FlagMoreFragments, offset) } else { h.SetTotalLength(uint16(h.HeaderLength()) + uint16(payload.Size())) h.SetFlagsFragmentOffset(flags, offset) } h.SetChecksum(0) h.SetChecksum(^h.CalculateChecksum()) offset += uint16(innerMTU) if i > 0 { newPayload := payload.Clone([]buffer.View{}) newPayload.CapLength(innerMTU) if err := e.linkEP.WritePacket(r, gso, hdr, newPayload, ProtocolNumber); err != nil { return err } r.Stats().IP.PacketsSent.Increment() payload.TrimFront(newPayload.Size()) continue } // Special handling for the first fragment because it comes from the hdr. if outerMTU >= hdr.UsedLength() { // This fragment can fit all of hdr and possibly some of payload, too. newPayload := payload.Clone([]buffer.View{}) newPayloadLength := outerMTU - hdr.UsedLength() newPayload.CapLength(newPayloadLength) if err := e.linkEP.WritePacket(r, gso, hdr, newPayload, ProtocolNumber); err != nil { return err } r.Stats().IP.PacketsSent.Increment() payload.TrimFront(newPayloadLength) } else { // The fragment is too small to fit all of hdr. startOfHdr := hdr startOfHdr.TrimBack(hdr.UsedLength() - outerMTU) emptyVV := buffer.NewVectorisedView(0, []buffer.View{}) if err := e.linkEP.WritePacket(r, gso, startOfHdr, emptyVV, ProtocolNumber); err != nil { return err } r.Stats().IP.PacketsSent.Increment() // Add the unused bytes of hdr into the payload that remains to be sent. restOfHdr := hdr.View()[outerMTU:] tmp := buffer.NewVectorisedView(len(restOfHdr), []buffer.View{buffer.NewViewFromBytes(restOfHdr)}) tmp.Append(payload) payload = tmp } } return nil } // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop stack.PacketLooping) *tcpip.Error { ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) length := uint16(hdr.UsedLength() + payload.Size()) id := uint32(0) if length > header.IPv4MaximumHeaderSize+8 { // Packets of 68 bytes or less are required by RFC 791 to not be // fragmented, so we only assign ids to larger packets. id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, protocol, e.protocol.hashIV)%buckets], 1) } ip.Encode(&header.IPv4Fields{ IHL: header.IPv4MinimumSize, TotalLength: length, ID: uint16(id), TTL: ttl, Protocol: uint8(protocol), SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) ip.SetChecksum(^ip.CalculateChecksum()) if loop&stack.PacketLoop != 0 { views := make([]buffer.View, 1, 1+len(payload.Views())) views[0] = hdr.View() views = append(views, payload.Views()...) vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) loopedR := r.MakeLoopedRoute() e.HandlePacket(&loopedR, vv) loopedR.Release() } if loop&stack.PacketOut == 0 { return nil } if hdr.UsedLength()+payload.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) { return e.writePacketFragments(r, gso, hdr, payload, int(e.linkEP.MTU())) } if err := e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber); err != nil { return err } r.Stats().IP.PacketsSent.Increment() return nil } // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { // The packet already has an IP header, but there are a few required // checks. ip := header.IPv4(payload.First()) if !ip.IsValid(payload.Size()) { return tcpip.ErrInvalidOptionValue } // Always set the total length. ip.SetTotalLength(uint16(payload.Size())) // Set the source address when zero. if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) { ip.SetSourceAddress(r.LocalAddress) } // Set the destination. If the packet already included a destination, // it will be part of the route. ip.SetDestinationAddress(r.RemoteAddress) // Set the packet ID when zero. if ip.ID() == 0 { id := uint32(0) if payload.Size() > header.IPv4MaximumHeaderSize+8 { // Packets of 68 bytes or less are required by RFC 791 to not be // fragmented, so we only assign ids to larger packets. id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1) } ip.SetID(uint16(id)) } // Always set the checksum. ip.SetChecksum(0) ip.SetChecksum(^ip.CalculateChecksum()) if loop&stack.PacketLoop != 0 { e.HandlePacket(r, payload) } if loop&stack.PacketOut == 0 { return nil } hdr := buffer.NewPrependableFromView(payload.ToView()) r.Stats().IP.PacketsSent.Increment() return e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) } // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { headerView := vv.First() h := header.IPv4(headerView) if !h.IsValid(vv.Size()) { r.Stats().IP.MalformedPacketsReceived.Increment() return } hlen := int(h.HeaderLength()) tlen := int(h.TotalLength()) vv.TrimFront(hlen) vv.CapLength(tlen - hlen) more := (h.Flags() & header.IPv4FlagMoreFragments) != 0 if more || h.FragmentOffset() != 0 { if vv.Size() == 0 { // Drop the packet as it's marked as a fragment but has // no payload. r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() return } // The packet is a fragment, let's try to reassemble it. last := h.FragmentOffset() + uint16(vv.Size()) - 1 // Drop the packet if the fragmentOffset is incorrect. i.e the // combination of fragmentOffset and vv.size() causes a wrap // around resulting in last being less than the offset. if last < h.FragmentOffset() { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() return } var ready bool vv, ready = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv) if !ready { return } } p := h.TransportProtocol() if p == header.ICMPv4ProtocolNumber { headerView.CapLength(hlen) e.handleICMP(r, headerView, vv) return } r.Stats().IP.PacketsDelivered.Increment() e.dispatcher.DeliverTransportPacket(r, p, headerView, vv) } // Close cleans up resources associated with the endpoint. func (e *endpoint) Close() {} type protocol struct { ids []uint32 hashIV uint32 // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful and it must be accessed // atomically. defaultTTL uint32 } // Number returns the ipv4 protocol number. func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } // MinimumPacketSize returns the minimum valid ipv4 packet size. func (p *protocol) MinimumPacketSize() int { return header.IPv4MinimumSize } // DefaultPrefixLen returns the IPv4 default prefix length. func (p *protocol) DefaultPrefixLen() int { return header.IPv4AddressSize * 8 } // ParseAddresses implements NetworkProtocol.ParseAddresses. func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv4(v) return h.SourceAddress(), h.DestinationAddress() } // SetOption implements NetworkProtocol.SetOption. func (p *protocol) SetOption(option interface{}) *tcpip.Error { switch v := option.(type) { case tcpip.DefaultTTLOption: p.SetDefaultTTL(uint8(v)) return nil default: return tcpip.ErrUnknownProtocolOption } } // Option implements NetworkProtocol.Option. func (p *protocol) Option(option interface{}) *tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) return nil default: return tcpip.ErrUnknownProtocolOption } } // SetDefaultTTL sets the default TTL for endpoints created with this protocol. func (p *protocol) SetDefaultTTL(ttl uint8) { atomic.StoreUint32(&p.defaultTTL, uint32(ttl)) } // DefaultTTL returns the default TTL for endpoints created with this protocol. func (p *protocol) DefaultTTL() uint8 { return uint8(atomic.LoadUint32(&p.defaultTTL)) } // calculateMTU calculates the network-layer payload MTU based on the link-layer // payload mtu. func calculateMTU(mtu uint32) uint32 { if mtu > MaxTotalSize { mtu = MaxTotalSize } return mtu - header.IPv4MinimumSize } // hashRoute calculates a hash value for the given route. It uses the source & // destination address, the transport protocol number, and a random initial // value (generated once on initialization) to generate the hash. func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { t := r.LocalAddress a := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 t = r.RemoteAddress b := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 return hash.Hash3Words(a, b, uint32(protocol), hashIV) } // NewProtocol returns an IPv4 network protocol. func NewProtocol() stack.NetworkProtocol { ids := make([]uint32, buckets) // Randomly initialize hashIV and the ids. r := hash.RandN32(1 + buckets) for i := range ids { ids[i] = r[i] } hashIV := r[buckets] return &protocol{ids: ids, hashIV: hashIV, defaultTTL: DefaultTTL} }