// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package socket provides the interfaces that need to be provided by socket
// implementations and providers, as well as per family demultiplexing of socket
// creation.
package socket

import (
	"fmt"
	"sync/atomic"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserr"
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
)

// ControlMessages represents the union of unix control messages and tcpip
// control messages.
type ControlMessages struct {
	Unix transport.ControlMessages
	IP   tcpip.ControlMessages
}

// Socket is the interface containing socket syscalls used by the syscall layer
// to redirect them to the appropriate implementation.
type Socket interface {
	fs.FileOperations

	// Connect implements the connect(2) linux syscall.
	Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error

	// Accept implements the accept4(2) linux syscall.
	// Returns fd, real peer address length and error. Real peer address
	// length is only set if len(peer) > 0.
	Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error)

	// Bind implements the bind(2) linux syscall.
	Bind(t *kernel.Task, sockaddr []byte) *syserr.Error

	// Listen implements the listen(2) linux syscall.
	Listen(t *kernel.Task, backlog int) *syserr.Error

	// Shutdown implements the shutdown(2) linux syscall.
	Shutdown(t *kernel.Task, how int) *syserr.Error

	// GetSockOpt implements the getsockopt(2) linux syscall.
	GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error)

	// SetSockOpt implements the setsockopt(2) linux syscall.
	SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error

	// GetSockName implements the getsockname(2) linux syscall.
	//
	// addrLen is the address length to be returned to the application, not
	// necessarily the actual length of the address.
	GetSockName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error)

	// GetPeerName implements the getpeername(2) linux syscall.
	//
	// addrLen is the address length to be returned to the application, not
	// necessarily the actual length of the address.
	GetPeerName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error)

	// RecvMsg implements the recvmsg(2) linux syscall.
	//
	// senderAddrLen is the address length to be returned to the application,
	// not necessarily the actual length of the address.
	RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error)

	// SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take
	// ownership of the ControlMessage on error.
	//
	// If n > 0, err will either be nil or an error from t.Block.
	SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages ControlMessages) (n int, err *syserr.Error)

	// SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means
	// no timeout.
	SetRecvTimeout(nanoseconds int64)

	// RecvTimeout gets the current timeout (in ns) for recv operations. Zero
	// means no timeout.
	RecvTimeout() int64
}

// Provider is the interface implemented by providers of sockets for specific
// address families (e.g., AF_INET).
type Provider interface {
	// Socket creates a new socket.
	//
	// If a nil Socket _and_ a nil error is returned, it means that the
	// protocol is not supported. A non-nil error should only be returned
	// if the protocol is supported, but an error occurs during creation.
	Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error)

	// Pair creates a pair of connected sockets.
	//
	// See Socket for error information.
	Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error)
}

// families holds a map of all known address families and their providers.
var families = make(map[int][]Provider)

// RegisterProvider registers the provider of a given address family so that
// sockets of that type can be created via socket() and/or socketpair()
// syscalls.
func RegisterProvider(family int, provider Provider) {
	families[family] = append(families[family], provider)
}

// New creates a new socket with the given family, type and protocol.
func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
	for _, p := range families[family] {
		s, err := p.Socket(t, stype, protocol)
		if err != nil {
			return nil, err
		}
		if s != nil {
			return s, nil
		}
	}

	return nil, syserr.ErrAddressFamilyNotSupported
}

// Pair creates a new connected socket pair with the given family, type and
// protocol.
func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
	providers, ok := families[family]
	if !ok {
		return nil, nil, syserr.ErrAddressFamilyNotSupported
	}

	for _, p := range providers {
		s, t, err := p.Pair(t, stype, protocol)
		if err != nil {
			return nil, nil, err
		}
		if s != nil && t != nil {
			return s, t, nil
		}
	}

	return nil, nil, syserr.ErrSocketNotSupported
}

// NewDirent returns a sockfs fs.Dirent that resides on device d.
func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent {
	ino := d.NextIno()
	// There is no real filesystem backing this pipe, so we pass in a nil
	// Filesystem.
	inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{
		FSType: linux.SOCKFS_MAGIC,
		UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{
			Owner: fs.FileOwnerFromContext(ctx),
			Perms: fs.FilePermissions{
				User: fs.PermMask{Read: true, Write: true},
			},
			Links: 1,
		}),
	}), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{
		Type:      fs.Socket,
		DeviceID:  d.DeviceID(),
		InodeID:   ino,
		BlockSize: usermem.PageSize,
	})

	// Dirent name matches net/socket.c:sockfs_dname.
	return fs.NewDirent(inode, fmt.Sprintf("socket:[%d]", ino))
}

// ReceiveTimeout stores a timeout for receive calls.
//
// It is meant to be embedded into Socket implementations to help satisfy the
// interface.
//
// Care must be taken when copying ReceiveTimeout as it contains atomic
// variables.
//
// +stateify savable
type ReceiveTimeout struct {
	// ns is length of the timeout in nanoseconds.
	//
	// ns must be accessed atomically.
	ns int64
}

// SetRecvTimeout implements Socket.SetRecvTimeout.
func (rt *ReceiveTimeout) SetRecvTimeout(nanoseconds int64) {
	atomic.StoreInt64(&rt.ns, nanoseconds)
}

// RecvTimeout implements Socket.RecvTimeout.
func (rt *ReceiveTimeout) RecvTimeout() int64 {
	return atomic.LoadInt64(&rt.ns)
}

// GetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid.
// It contains names that are valid for GetSockOpt when level is SOL_SOCKET.
func GetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) {
	switch name {
	case linux.SO_ACCEPTCONN,
		linux.SO_BPF_EXTENSIONS,
		linux.SO_COOKIE,
		linux.SO_DOMAIN,
		linux.SO_ERROR,
		linux.SO_GET_FILTER,
		linux.SO_INCOMING_NAPI_ID,
		linux.SO_MEMINFO,
		linux.SO_PEERCRED,
		linux.SO_PEERGROUPS,
		linux.SO_PEERNAME,
		linux.SO_PEERSEC,
		linux.SO_PROTOCOL,
		linux.SO_SNDLOWAT,
		linux.SO_TYPE:

		t.Kernel().EmitUnimplementedEvent(t)

	default:
		emitUnimplementedEvent(t, name)
	}
}

// SetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid.
// It contains names that are valid for SetSockOpt when level is SOL_SOCKET.
func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) {
	switch name {
	case linux.SO_ATTACH_BPF,
		linux.SO_ATTACH_FILTER,
		linux.SO_ATTACH_REUSEPORT_CBPF,
		linux.SO_ATTACH_REUSEPORT_EBPF,
		linux.SO_CNX_ADVICE,
		linux.SO_DETACH_FILTER,
		linux.SO_RCVBUFFORCE,
		linux.SO_SNDBUFFORCE:

		t.Kernel().EmitUnimplementedEvent(t)

	default:
		emitUnimplementedEvent(t, name)
	}
}

// emitUnimplementedEvent emits unimplemented event if name is valid. It
// contains names that are common between Get and SetSocketOpt when level is
// SOL_SOCKET.
func emitUnimplementedEvent(t *kernel.Task, name int) {
	switch name {
	case linux.SO_BINDTODEVICE,
		linux.SO_BROADCAST,
		linux.SO_BSDCOMPAT,
		linux.SO_BUSY_POLL,
		linux.SO_DEBUG,
		linux.SO_DONTROUTE,
		linux.SO_INCOMING_CPU,
		linux.SO_KEEPALIVE,
		linux.SO_LINGER,
		linux.SO_LOCK_FILTER,
		linux.SO_MARK,
		linux.SO_MAX_PACING_RATE,
		linux.SO_NOFCS,
		linux.SO_NO_CHECK,
		linux.SO_OOBINLINE,
		linux.SO_PASSCRED,
		linux.SO_PASSSEC,
		linux.SO_PEEK_OFF,
		linux.SO_PRIORITY,
		linux.SO_RCVBUF,
		linux.SO_RCVLOWAT,
		linux.SO_RCVTIMEO,
		linux.SO_REUSEADDR,
		linux.SO_REUSEPORT,
		linux.SO_RXQ_OVFL,
		linux.SO_SELECT_ERR_QUEUE,
		linux.SO_SNDBUF,
		linux.SO_SNDTIMEO,
		linux.SO_TIMESTAMP,
		linux.SO_TIMESTAMPING,
		linux.SO_TIMESTAMPNS,
		linux.SO_TXTIME,
		linux.SO_WIFI_STATUS,
		linux.SO_ZEROCOPY:

		t.Kernel().EmitUnimplementedEvent(t)
	}
}