// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build linux,amd64

package fdbased

import (
	"encoding/binary"
	"fmt"
	"syscall"
	"unsafe"

	"golang.org/x/sys/unix"
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
)

const (
	// tPacketAlignment is the byte boundary that tPacketAlign rounds up to.
	tPacketAlignment = uintptr(16)

	// Bits of the frame status word read by tPacketHdr.tpStatus.

	// tpStatusKernel is the value written back to a frame to release it to
	// the kernel.
	tpStatusKernel = 0
	// tpStatusUser is set while the frame holds a packet for user space.
	tpStatusUser = 1 << 0
	// tpStatusCopy is the "copy" status bit.
	tpStatusCopy = 1 << 1
	// tpStatusLosing is the "losing" status bit.
	tpStatusLosing = 1 << 2
)

// Geometry of the packet RX ring.
//
// Every frame is overallocated by 128 bytes beyond a 64KiB packet so that the
// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding fit in front of
// the packet data.
//
// NOTE: Frames need to be aligned at 16 byte boundaries.
const (
	// tpFrameSize is the size in bytes of a single ring frame.
	tpFrameSize = 64<<10 + 128
	// tpBlockSize is the size in bytes of a ring block; each block holds
	// 128 frames.
	tpBlockSize = 128 * tpFrameSize
	// tpBlockNR is the number of blocks in the ring.
	tpBlockNR = 10
	// tpFrameNR is the total number of frames across all blocks.
	tpFrameNR = tpBlockNR * tpBlockSize / tpFrameSize
)

// tPacketAlign rounds v up to the next multiple of tPacketAlignment; values
// that are already aligned are returned unchanged. It mirrors the
// TPACKET_ALIGN macro in <linux/if_packet.h>.
func tPacketAlign(v uintptr) uintptr {
	// tPacketAlignment is a power of two, so clearing the low bits after
	// adding mask performs the round-up.
	const mask = tPacketAlignment - 1
	return (v + mask) &^ mask
}

// tPacketHdrSize is sizeof(struct tpacket_hdr) from <linux/if_packet.h> on
// amd64: 28 bytes of fields, padded to the 8-byte alignment of the leading
// unsigned long tp_status field.
const tPacketHdrSize = 32

// tPacketHdrlen is the TPACKET_HDRLEN macro defined in <linux/if_packet.h>:
//
//	TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll)
//
// The size of the kernel structure is spelled out in tPacketHdrSize because
// tPacketHdr is a []byte view: unsafe.Sizeof(tPacketHdr{}) would measure the
// Go slice header instead of struct tpacket_hdr. Only the tpacket_hdr part is
// aligned, as in the C macro.
var tPacketHdrlen = tPacketAlign(tPacketHdrSize) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{})

// tPacketReq mirrors the tpacket_req structure described in
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
//
// A pointer to it is handed to setsockopt when the RX ring is configured, so
// the field order and widths must not change.
type tPacketReq struct {
	// Size in bytes of one block, and the number of blocks.
	tpBlockSize, tpBlockNR uint32
	// Size in bytes of one frame, and the total number of frames.
	tpFrameSize, tpFrameNR uint32
}

// tPacketHdr is a byte view that begins at a tpacket_hdr structure as
// described in <linux/if_packet.h>. The accessors decode individual fields in
// place using little-endian byte order.
type tPacketHdr []byte

// Byte offsets of the tpacket_hdr fields within a tPacketHdr.
const (
	tpStatusOffset  = 0
	tpLenOffset     = 8
	tpSnapLenOffset = 12
	tpMacOffset     = 16
	tpNetOffset     = 18
	tpSecOffset     = 20
	tpUSecOffset    = 24
)

// uint16At decodes the little-endian 16-bit value that starts at byte off.
func (t tPacketHdr) uint16At(off int) uint16 {
	return binary.LittleEndian.Uint16(t[off:])
}

// uint32At decodes the little-endian 32-bit value that starts at byte off.
func (t tPacketHdr) uint32At(off int) uint32 {
	return binary.LittleEndian.Uint32(t[off:])
}

// tpStatus returns the four bytes at tpStatusOffset as the frame status word.
func (t tPacketHdr) tpStatus() uint32 {
	return t.uint32At(tpStatusOffset)
}

// setTPStatus overwrites the four bytes at tpStatusOffset with status.
func (t tPacketHdr) setTPStatus(status uint32) {
	word := t[tpStatusOffset:]
	binary.LittleEndian.PutUint32(word, status)
}

// tpLen returns the tp_len field.
func (t tPacketHdr) tpLen() uint32 {
	return t.uint32At(tpLenOffset)
}

// tpSnapLen returns the tp_snaplen field, which Payload uses as the number of
// packet bytes stored in the frame.
func (t tPacketHdr) tpSnapLen() uint32 {
	return t.uint32At(tpSnapLenOffset)
}

// tpMac returns the tp_mac field, which Payload uses as the offset of the
// packet data from the start of the header.
func (t tPacketHdr) tpMac() uint16 {
	return t.uint16At(tpMacOffset)
}

// tpNet returns the tp_net field.
func (t tPacketHdr) tpNet() uint16 {
	return t.uint16At(tpNetOffset)
}

// tpSec returns the tp_sec field.
func (t tPacketHdr) tpSec() uint32 {
	return t.uint32At(tpSecOffset)
}

// tpUSec returns the tp_usec field.
func (t tPacketHdr) tpUSec() uint32 {
	return t.uint32At(tpUSecOffset)
}

// Payload returns the tpSnapLen() bytes located tpMac() bytes into t. The
// result aliases t's memory; it is not a copy.
func (t tPacketHdr) Payload() []byte {
	start := uint32(t.tpMac())
	end := start + t.tpSnapLen()
	return t[start:end]
}

// setupPacketRXRing enables PACKET_RX_RING on e.fd using the ring geometry
// constants of this file (tpBlockSize, tpBlockNR, tpFrameSize, tpFrameNR) and
// then maps the whole ring, storing the mapping in e.ringBuffer.
//
// It returns an error if either the setsockopt call or the mmap call fails;
// e.ringBuffer is only assigned on success.
func (e *endpoint) setupPacketRXRing() error {
	var req tPacketReq
	req.tpBlockSize = uint32(tpBlockSize)
	req.tpBlockNR = uint32(tpBlockNR)
	req.tpFrameSize = uint32(tpFrameSize)
	req.tpFrameNR = uint32(tpFrameNR)

	// Ask the kernel to set up the ring described by req.
	if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&req), unsafe.Sizeof(req)); err != nil {
		return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
	}

	// Map every block of the ring in a single shared, read/write mapping.
	const ringSize = tpBlockSize * tpBlockNR
	ring, err := syscall.Mmap(e.fd, 0, ringSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
	if err != nil {
		return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", ringSize, err)
	}
	e.ringBuffer = ring
	return nil
}

// readMMappedPacket returns the next complete packet from the mmapped RX
// ring, blocking in poll until the kernel hands over the frame at
// e.ringOffset.
//
// The packet bytes are copied into a freshly allocated buffer owned by the
// caller, after which the frame is released to the kernel and e.ringOffset
// advances to the next frame, wrapping at tpFrameNR.
//
// Frames whose status carries tpStatusCopy were truncated by the kernel
// because the packet did not fit in the frame. They are released and skipped
// instead of being returned as if they were complete packets.
//
// A poll failure other than EINTR is returned as a translated *tcpip.Error
// with a nil buffer.
func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
	hdr := tPacketHdr(e.ringBuffer[e.ringOffset*tpFrameSize:])
	for {
		status := hdr.tpStatus()
		if status&tpStatusUser == 0 {
			// The kernel still owns this frame: sleep until the socket
			// reports activity and then look at the status word again.
			event := rawfile.PollEvent{
				FD:     int32(e.fd),
				Events: unix.POLLIN | unix.POLLERR,
			}
			_, errno := rawfile.BlockingPoll(&event, 1, -1)
			if errno != 0 {
				if errno == syscall.EINTR {
					continue
				}
				return nil, rawfile.TranslateErrno(errno)
			}
			continue
		}
		if status&tpStatusCopy == 0 {
			// tpStatusLosing is deliberately not checked: it reports that
			// packets were dropped earlier, and does not invalidate the
			// contents of this frame.
			break
		}
		// Truncated frame: give it back to the kernel and move on to the
		// next ring position.
		hdr.setTPStatus(tpStatusKernel)
		e.ringOffset = (e.ringOffset + 1) % tpFrameNR
		hdr = tPacketHdr(e.ringBuffer[e.ringOffset*tpFrameSize:])
	}

	// Copy out the packet from the mmapped frame to a locally owned buffer.
	pkt := make([]byte, hdr.tpSnapLen())
	copy(pkt, hdr.Payload())
	// Release packet to kernel.
	hdr.setTPStatus(tpStatusKernel)
	e.ringOffset = (e.ringOffset + 1) % tpFrameNR
	return pkt, nil
}

// packetMMapDispatch reads one packet from the mmapped ring buffer and
// dispatches it to the network stack.
//
// When e.hdrSize is non-zero the packet is parsed as an Ethernet frame to
// obtain the protocol number and link addresses. Otherwise the protocol is
// guessed from the IP version field.
//
// It returns (false, err) when reading from the ring fails. In every other
// case it returns (true, nil), including when the packet is dropped because
// it is shorter than e.hdrSize or, with no link header, is neither IPv4 nor
// IPv6.
func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
	pkt, err := e.readMMappedPacket()
	if err != nil {
		return false, err
	}

	// A packet shorter than the link header cannot be parsed or stripped
	// below without indexing out of range, so drop it the same way an
	// unrecognised packet is dropped.
	// NOTE(review): assumes a non-zero e.hdrSize covers the whole Ethernet
	// header read by header.Ethernet — confirm against where hdrSize is set.
	if len(pkt) < int(e.hdrSize) {
		return true, nil
	}

	var (
		p             tcpip.NetworkProtocolNumber
		remote, local tcpip.LinkAddress
	)
	if e.hdrSize > 0 {
		eth := header.Ethernet(pkt)
		p = eth.Type()
		remote = eth.SourceAddress()
		local = eth.DestinationAddress()
	} else {
		// We don't get any indication of what the packet is, so try to guess
		// if it's an IPv4 or IPv6 packet.
		switch header.IPVersion(pkt) {
		case header.IPv4Version:
			p = header.IPv4ProtocolNumber
		case header.IPv6Version:
			p = header.IPv6ProtocolNumber
		default:
			return true, nil
		}
	}

	// Strip the link header before handing the packet to the stack.
	pkt = pkt[e.hdrSize:]
	e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
	return true, nil
}

// setsockopt issues the raw setsockopt system call for socket fd, setting
// option name at the given level to the vallen bytes that val points to.
//
// It returns nil on success and the syscall.Errno reported by the kernel
// otherwise.
func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error {
	_, _, errno := syscall.Syscall6(
		syscall.SYS_SETSOCKOPT,
		uintptr(fd),
		uintptr(level),
		uintptr(name),
		uintptr(val),
		vallen,
		0,
	)
	if errno == 0 {
		return nil
	}
	return errno
}