diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/tcpip/link/fdbased/packet_dispatchers.go | 256 |
1 files changed, 118 insertions, 138 deletions
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 8c3ca86d6..edab110b5 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -29,92 +29,124 @@ import ( // BufConfig defines the shape of the vectorised view used to read packets from the NIC. var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} -// readVDispatcher uses readv() system call to read inbound packets and -// dispatches them. -type readVDispatcher struct { - // fd is the file descriptor used to send and receive packets. - fd int - - // e is the endpoint this dispatcher is attached to. - e *endpoint - +type iovecBuffer struct { // views are the actual buffers that hold the packet contents. views []buffer.View // iovecs are initialized with base pointers/len of the corresponding - // entries in the views defined above, except when GSO is enabled then - // the first iovec points to a buffer for the vnet header which is - // stripped before the views are passed up the stack for further + // entries in the views defined above, except when GSO is enabled + // (skipsVnetHdr) then the first iovec points to a buffer for the vnet header + // which is stripped before the views are passed up the stack for further // processing. iovecs []syscall.Iovec + + // sizes is an array of buffer sizes for the underlying views. sizes is + // immutable. + sizes []int + + // skipsVnetHdr is true if virtioNetHdr is to skipped. + skipsVnetHdr bool } -func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { - d := &readVDispatcher{fd: fd, e: e} - d.views = make([]buffer.View, len(BufConfig)) - iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - iovLen++ +func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer { + b := &iovecBuffer{ + views: make([]buffer.View, len(sizes)), + sizes: sizes, + skipsVnetHdr: skipsVnetHdr, } - d.iovecs = make([]syscall.Iovec, iovLen) - return d, nil + niov := len(b.views) + if b.skipsVnetHdr { + niov++ + } + b.iovecs = make([]syscall.Iovec, niov) + return b } -func (d *readVDispatcher) allocateViews(bufConfig []int) { - var vnetHdr [virtioNetHdrSize]byte +func (b *iovecBuffer) nextIovecs() []syscall.Iovec { vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { + if b.skipsVnetHdr { + var vnetHdr [virtioNetHdrSize]byte // The kernel adds virtioNetHdr before each packet, but // we don't use it, so so we allocate a buffer for it, // add it in iovecs but don't add it in a view. - d.iovecs[0] = syscall.Iovec{ + b.iovecs[0] = syscall.Iovec{ Base: &vnetHdr[0], Len: uint64(virtioNetHdrSize), } vnetHdrOff++ } - for i := 0; i < len(bufConfig); i++ { - if d.views[i] != nil { + for i := range b.views { + if b.views[i] != nil { break } - b := buffer.NewView(bufConfig[i]) - d.views[i] = b - d.iovecs[i+vnetHdrOff] = syscall.Iovec{ - Base: &b[0], - Len: uint64(len(b)), + v := buffer.NewView(b.sizes[i]) + b.views[i] = v + b.iovecs[i+vnetHdrOff] = syscall.Iovec{ + Base: &v[0], + Len: uint64(len(v)), } } + return b.iovecs } -func (d *readVDispatcher) capViews(n int, buffers []int) int { +func (b *iovecBuffer) pullViews(n int) buffer.VectorisedView { + var views []buffer.View c := 0 - for i, s := range buffers { - c += s + if b.skipsVnetHdr { + c += virtioNetHdrSize if c >= n { - d.views[i].CapLength(s - (c - n)) - return i + 1 + // Nothing in the packet. + return buffer.NewVectorisedView(0, nil) + } + } + for i, v := range b.views { + c += len(v) + if c >= n { + b.views[i].CapLength(len(v) - (c - n)) + views = append([]buffer.View(nil), b.views[:i+1]...) + break } } - return len(buffers) + // Remove the first len(views) used views from the state. + for i := range views { + b.views[i] = nil + } + if b.skipsVnetHdr { + // Exclude the size of the vnet header. + n -= virtioNetHdrSize + } + return buffer.NewVectorisedView(n, views) +} + +// readVDispatcher uses readv() system call to read inbound packets and +// dispatches them. +type readVDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // buf is the iovec buffer that contains the packet contents. + buf *iovecBuffer +} + +func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &readVDispatcher{fd: fd, e: e} + skipsVnetHdr := d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 + d.buf = newIovecBuffer(BufConfig, skipsVnetHdr) + return d, nil } // dispatch reads one packet from the file descriptor and dispatches it. func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { - d.allocateViews(BufConfig) - - n, err := rawfile.BlockingReadv(d.fd, d.iovecs) + n, err := rawfile.BlockingReadv(d.fd, d.buf.nextIovecs()) if n == 0 || err != nil { return false, err } - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - // Skip virtioNetHdr which is added before each packet, it - // isn't used and it isn't in a view. - n -= virtioNetHdrSize - } - used := d.capViews(n, BufConfig) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)), + Data: d.buf.pullViews(n), }) var ( @@ -133,7 +165,12 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(d.views[0]) { + // IP version information is at the first octet, so pulling up 1 byte. + h, ok := pkt.Data.PullUp(1) + if !ok { + return true, nil + } + switch header.IPVersion(h) { case header.IPv4Version: p = header.IPv4ProtocolNumber case header.IPv6Version: @@ -145,11 +182,6 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt) - // Prepare e.views for another packet: release used views. - for i := 0; i < used; i++ { - d.views[i] = nil - } - return true, nil } @@ -162,15 +194,8 @@ type recvMMsgDispatcher struct { // e is the endpoint this dispatcher is attached to. e *endpoint - // views is an array of array of buffers that contain packet contents. - views [][]buffer.View - - // iovecs is an array of array of iovec records where each iovec base - // pointer and length are initialzed to the corresponding view above, - // except when GSO is enabled then the first iovec in each array of - // iovecs points to a buffer for the vnet header which is stripped - // before the views are passed up the stack for further processing. - iovecs [][]syscall.Iovec + // bufs is an array of iovec buffers that contain packet contents. + bufs []*iovecBuffer // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to // reference an array of iovecs in the iovecs field defined above. This @@ -187,74 +212,32 @@ const ( func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { d := &recvMMsgDispatcher{ - fd: fd, - e: e, - } - d.views = make([][]buffer.View, MaxMsgsPerRecv) - for i := range d.views { - d.views[i] = make([]buffer.View, len(BufConfig)) - } - d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv) - iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - // virtioNetHdr is prepended before each packet. - iovLen++ + fd: fd, + e: e, + bufs: make([]*iovecBuffer, MaxMsgsPerRecv), + msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), } - for i := range d.iovecs { - d.iovecs[i] = make([]syscall.Iovec, iovLen) - } - d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv) - for i := range d.msgHdrs { - d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0] - d.msgHdrs[i].Msg.Iovlen = uint64(iovLen) + skipsVnetHdr := d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 + for i := range d.bufs { + d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr) } return d, nil } -func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int { - c := 0 - for i, s := range buffers { - c += s - if c >= n { - d.views[k][i].CapLength(s - (c - n)) - return i + 1 - } - } - return len(buffers) -} - -func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) { - for k := 0; k < len(d.views); k++ { - var vnetHdr [virtioNetHdrSize]byte - vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - // The kernel adds virtioNetHdr before each packet, but - // we don't use it, so so we allocate a buffer for it, - // add it in iovecs but don't add it in a view. - d.iovecs[k][0] = syscall.Iovec{ - Base: &vnetHdr[0], - Len: uint64(virtioNetHdrSize), - } - vnetHdrOff++ - } - for i := 0; i < len(bufConfig); i++ { - if d.views[k][i] != nil { - break - } - b := buffer.NewView(bufConfig[i]) - d.views[k][i] = b - d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{ - Base: &b[0], - Len: uint64(len(b)), - } - } - } -} - // recvMMsgDispatch reads more than one packet at a time from the file // descriptor and dispatches it. func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { - d.allocateViews(BufConfig) + // Fill message headers. + for k := range d.msgHdrs { + if d.msgHdrs[k].Msg.Iovlen > 0 { + break + } + iovecs := d.bufs[k].nextIovecs() + iovLen := len(iovecs) + d.msgHdrs[k].Len = 0 + d.msgHdrs[k].Msg.Iov = &iovecs[0] + d.msgHdrs[k].Msg.Iovlen = uint64(iovLen) + } nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs) if err != nil { @@ -263,15 +246,14 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { // Process each of received packets. for k := 0; k < nMsgs; k++ { n := int(d.msgHdrs[k].Len) - if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { - n -= virtioNetHdrSize - } - used := d.capViews(k, int(n), BufConfig) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)), + Data: d.bufs[k].pullViews(n), }) + // Mark that this iovec has been processed. + d.msgHdrs[k].Msg.Iovlen = 0 + var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress @@ -288,26 +270,24 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(d.views[k][0]) { + // IP version information is at the first octet, so pulling up 1 byte. + h, ok := pkt.Data.PullUp(1) + if !ok { + // Skip this packet. + continue + } + switch header.IPVersion(h) { case header.IPv4Version: p = header.IPv4ProtocolNumber case header.IPv6Version: p = header.IPv6ProtocolNumber default: - return true, nil + // Skip this packet. + continue } } d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt) - - // Prepare e.views for another packet: release used views. - for i := 0; i < used; i++ { - d.views[k][i] = nil - } - } - - for k := 0; k < nMsgs; k++ { - d.msgHdrs[k].Len = 0 } return true, nil |