Diffstat (limited to 'pkg/sentry/socket')
54 files changed, 14442 insertions, 0 deletions
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD new file mode 100644 index 000000000..c40c6d673 --- /dev/null +++ b/pkg/sentry/socket/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "socket", + srcs = ["socket.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD new file mode 100644 index 000000000..ca16d0381 --- /dev/null +++ b/pkg/sentry/socket/control/BUILD @@ -0,0 +1,29 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "control", + srcs = [ + "control.go", + "control_vfs2.go", + ], + imports = [ + "gvisor.dev/gvisor/pkg/sentry/fs", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/socket", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go new file mode 100644 index 000000000..8b439a078 --- /dev/null +++ b/pkg/sentry/socket/control/control.go @@ -0,0 +1,591 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package control provides internal representations of socket control +// messages. +package control + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" +) + +const maxInt = int(^uint(0) >> 1) + +// SCMCredentials represents a SCM_CREDENTIALS socket control message. +type SCMCredentials interface { + transport.CredentialsControlMessage + + // Credentials returns properly namespaced values for the sender's pid, uid + // and gid. + Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) +} + +// LINT.IfChange + +// SCMRights represents a SCM_RIGHTS socket control message. +type SCMRights interface { + transport.RightsControlMessage + + // Files returns up to max RightsFiles. + // + // Returned files are consumed and ownership is transferred to the caller. + // Subsequent calls to Files will return the next files. 
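+	//
+	// Illustrative example (not part of the original change): for a message
+	// carrying five descriptors and a receiver that takes two at a time,
+	//
+	//	batch, trunc := rights.Files(ctx, 2) // 2 files, trunc == true (more remain)
+	//	batch, trunc = rights.Files(ctx, 2)  // 2 files, trunc == true
+	//	batch, trunc = rights.Files(ctx, 2)  // 1 file,  trunc == false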
+ Files(ctx context.Context, max int) (rf RightsFiles, truncated bool) +} + +// RightsFiles represents a SCM_RIGHTS socket control message. A reference is +// maintained for each fs.File and is release either when an FD is created or +// when the Release method is called. +// +// +stateify savable +type RightsFiles []*fs.File + +// NewSCMRights creates a new SCM_RIGHTS socket control message representation +// using local sentry FDs. +func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { + files := make(RightsFiles, 0, len(fds)) + for _, fd := range fds { + file := t.GetFile(fd) + if file == nil { + files.Release() + return nil, syserror.EBADF + } + files = append(files, file) + } + return &files, nil +} + +// Files implements SCMRights.Files. +func (fs *RightsFiles) Files(ctx context.Context, max int) (RightsFiles, bool) { + n := max + var trunc bool + if l := len(*fs); n > l { + n = l + } else if n < l { + trunc = true + } + rf := (*fs)[:n] + *fs = (*fs)[n:] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (fs *RightsFiles) Clone() transport.RightsControlMessage { + nfs := append(RightsFiles(nil), *fs...) + for _, nf := range nfs { + nf.IncRef() + } + return &nfs +} + +// Release implements transport.RightsControlMessage.Release. +func (fs *RightsFiles) Release() { + for _, f := range *fs { + f.DecRef() + } + *fs = nil +} + +// rightsFDs gets up to the specified maximum number of FDs. +func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32, bool) { + files, trunc := rights.Files(t, max) + fds := make([]int32, 0, len(files)) + for i := 0; i < max && len(files) > 0; i++ { + fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) + files[0].DecRef() + files = files[1:] + if err != nil { + t.Warningf("Error inserting FD: %v", err) + // This is what Linux does. + break + } + + fds = append(fds, int32(fd)) + } + return fds, trunc +} + +// PackRights packs as many FDs as will fit into the unused capacity of buf. +func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flags int) ([]byte, int) { + maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 + // Linux does not return any FDs if none fit. + if maxFDs <= 0 { + flags |= linux.MSG_CTRUNC + return buf, flags + } + fds, trunc := rightsFDs(t, rights, cloexec, maxFDs) + if trunc { + flags |= linux.MSG_CTRUNC + } + align := t.Arch().Width() + return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) +} + +// LINT.ThenChange(./control_vfs2.go) + +// scmCredentials represents an SCM_CREDENTIALS socket control message. +// +// +stateify savable +type scmCredentials struct { + t *kernel.Task + kuid auth.KUID + kgid auth.KGID +} + +// NewSCMCredentials creates a new SCM_CREDENTIALS socket control message +// representation. +func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SCMCredentials, error) { + tcred := t.Credentials() + kuid, err := tcred.UseUID(auth.UID(cred.UID)) + if err != nil { + return nil, err + } + kgid, err := tcred.UseGID(auth.GID(cred.GID)) + if err != nil { + return nil, err + } + if kernel.ThreadID(cred.PID) != t.ThreadGroup().ID() && !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.PIDNamespace().UserNamespace()) { + return nil, syserror.EPERM + } + return &scmCredentials{t, kuid, kgid}, nil +} + +// Equals implements transport.CredentialsControlMessage.Equals. 
+func (c *scmCredentials) Equals(oc transport.CredentialsControlMessage) bool { + if oc, _ := oc.(*scmCredentials); oc != nil && *c == *oc { + return true + } + return false +} + +func putUint64(buf []byte, n uint64) []byte { + usermem.ByteOrder.PutUint64(buf[len(buf):len(buf)+8], n) + return buf[:len(buf)+8] +} + +func putUint32(buf []byte, n uint32) []byte { + usermem.ByteOrder.PutUint32(buf[len(buf):len(buf)+4], n) + return buf[:len(buf)+4] +} + +// putCmsg writes a control message header and as much data as will fit into +// the unused capacity of a buffer. +func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) { + space := binary.AlignDown(cap(buf)-len(buf), 4) + + // We can't write to space that doesn't exist, so if we are going to align + // the available space, we must align down. + // + // align must be >= 4 and each data int32 is 4 bytes. The length of the + // header is already aligned, so if we align to the width of the data there + // are two cases: + // 1. The aligned length is less than the length of the header. The + // unaligned length was also less than the length of the header, so we + // can't write anything. + // 2. The aligned length is greater than or equal to the length of the + // header. We can write the header plus zero or more bytes of data. We can't + // write a partial int32, so the length of the message will be + // min(aligned length, header + data). + if space < linux.SizeOfControlMessageHeader { + flags |= linux.MSG_CTRUNC + return buf, flags + } + + length := 4*len(data) + linux.SizeOfControlMessageHeader + if length > space { + length = space + } + buf = putUint64(buf, uint64(length)) + buf = putUint32(buf, linux.SOL_SOCKET) + buf = putUint32(buf, msgType) + for _, d := range data { + if len(buf)+4 > cap(buf) { + flags |= linux.MSG_CTRUNC + break + } + buf = putUint32(buf, uint32(d)) + } + return alignSlice(buf, align), flags +} + +func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interface{}) []byte { + if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader { + return buf + } + ob := buf + + buf = putUint64(buf, uint64(linux.SizeOfControlMessageHeader)) + buf = putUint32(buf, msgLevel) + buf = putUint32(buf, msgType) + + hdrBuf := buf + + buf = binary.Marshal(buf, usermem.ByteOrder, data) + + // If the control message data brought us over capacity, omit it. + if cap(buf) != cap(ob) { + return hdrBuf + } + + // Update control message length to include data. + putUint64(ob, uint64(len(buf)-len(ob))) + + return alignSlice(buf, align) +} + +// Credentials implements SCMCredentials.Credentials. +func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { + // "When a process's user and group IDs are passed over a UNIX domain + // socket to a process in a different user namespace (see the description + // of SCM_CREDENTIALS in unix(7)), they are translated into the + // corresponding values as per the receiving process's user and group ID + // mappings." - user_namespaces(7) + pid := t.PIDNamespace().IDOfTask(c.t) + uid := c.kuid.In(t.UserNamespace()).OrOverflow() + gid := c.kgid.In(t.UserNamespace()).OrOverflow() + + return pid, uid, gid +} + +// PackCredentials packs the credentials in the control message (or default +// credentials if none) into a buffer. +func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int) ([]byte, int) { + align := t.Arch().Width() + + // Default credentials if none are available. 
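+	//
+	// Either way the payload mirrors Linux's struct ucred: three 32-bit
+	// values (pid, uid, gid). Illustrative sizing, assuming a 64-bit arch:
+	// the 16-byte cmsghdr plus 12 bytes of data gives a message length of
+	// 28, which alignSlice pads to 32 bytes of buffer space when room
+	// allows.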
+ pid := kernel.ThreadID(0) + uid := auth.UID(auth.NobodyKUID) + gid := auth.GID(auth.NobodyKGID) + + if creds != nil { + pid, uid, gid = creds.Credentials(t) + } + c := []int32{int32(pid), int32(uid), int32(gid)} + return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c) +} + +// alignSlice extends a slice's length (up to the capacity) to align it. +func alignSlice(buf []byte, align uint) []byte { + aligned := binary.AlignUp(len(buf), align) + if aligned > cap(buf) { + // Linux allows unaligned data if there isn't room for alignment. + // Since there isn't room for alignment, there isn't room for any + // additional messages either. + return buf + } + return buf[:aligned] +} + +// PackTimestamp packs a SO_TIMESTAMP socket control message. +func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_SOCKET, + linux.SO_TIMESTAMP, + t.Arch().Width(), + linux.NsecToTimeval(timestamp), + ) +} + +// PackInq packs a TCP_INQ socket control message. +func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_TCP, + linux.TCP_INQ, + t.Arch().Width(), + inq, + ) +} + +// PackTOS packs an IP_TOS socket control message. +func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IP, + linux.IP_TOS, + t.Arch().Width(), + tos, + ) +} + +// PackTClass packs an IPV6_TCLASS socket control message. +func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SOL_IPV6, + linux.IPV6_TCLASS, + t.Arch().Width(), + tClass, + ) +} + +// PackIPPacketInfo packs an IP_PKTINFO socket control message. +func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte { + var p linux.ControlMessageIPPacketInfo + p.NIC = int32(packetInfo.NIC) + copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr)) + copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr)) + + return putCmsgStruct( + buf, + linux.SOL_IP, + linux.IP_PKTINFO, + t.Arch().Width(), + p, + ) +} + +// PackControlMessages packs control messages into the given buffer. +// +// We skip control messages specific to Unix domain sockets. +// +// Note that some control messages may be truncated if they do not fit under +// the capacity of buf. +func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte { + if cmsgs.IP.HasTimestamp { + buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf) + } + + if cmsgs.IP.HasInq { + // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. + buf = PackInq(t, cmsgs.IP.Inq, buf) + } + + if cmsgs.IP.HasTOS { + buf = PackTOS(t, cmsgs.IP.TOS, buf) + } + + if cmsgs.IP.HasTClass { + buf = PackTClass(t, cmsgs.IP.TClass, buf) + } + + if cmsgs.IP.HasIPPacketInfo { + buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf) + } + + return buf +} + +// cmsgSpace is equivalent to CMSG_SPACE in Linux. +func cmsgSpace(t *kernel.Task, dataLen int) int { + return linux.SizeOfControlMessageHeader + binary.AlignUp(dataLen, t.Arch().Width()) +} + +// CmsgsSpace returns the number of bytes needed to fit the control messages +// represented in cmsgs. 
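+//
+// For example (illustrative, assuming a 64-bit arch): a cmsgs value with only
+// HasTimestamp set needs cmsgSpace(t, linux.SizeOfTimeval) =
+// 16 + AlignUp(16, 8) = 32 bytes, matching CMSG_SPACE(sizeof(struct timeval)).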
+func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { + space := 0 + + if cmsgs.IP.HasTimestamp { + space += cmsgSpace(t, linux.SizeOfTimeval) + } + + if cmsgs.IP.HasInq { + space += cmsgSpace(t, linux.SizeOfControlMessageInq) + } + + if cmsgs.IP.HasTOS { + space += cmsgSpace(t, linux.SizeOfControlMessageTOS) + } + + if cmsgs.IP.HasTClass { + space += cmsgSpace(t, linux.SizeOfControlMessageTClass) + } + + return space +} + +// NewIPPacketInfo returns the IPPacketInfo struct. +func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo { + var p tcpip.IPPacketInfo + p.NIC = tcpip.NICID(packetInfo.NIC) + copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:]) + copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:]) + + return p +} + +// Parse parses a raw socket control message into portable objects. +func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) { + var ( + cmsgs socket.ControlMessages + fds linux.ControlMessageRights + ) + + for i := 0; i < len(buf); { + if i+linux.SizeOfControlMessageHeader > len(buf) { + return cmsgs, syserror.EINVAL + } + + var h linux.ControlMessageHeader + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) + + if h.Length < uint64(linux.SizeOfControlMessageHeader) { + return socket.ControlMessages{}, syserror.EINVAL + } + if h.Length > uint64(len(buf)-i) { + return socket.ControlMessages{}, syserror.EINVAL + } + + i += linux.SizeOfControlMessageHeader + length := int(h.Length) - linux.SizeOfControlMessageHeader + + // The use of t.Arch().Width() is analogous to Linux's use of + // sizeof(long) in CMSG_ALIGN. + width := t.Arch().Width() + + switch h.Level { + case linux.SOL_SOCKET: + switch h.Type { + case linux.SCM_RIGHTS: + rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight) + numRights := rightsSize / linux.SizeOfControlMessageRight + + if len(fds)+numRights > linux.SCM_MAX_FD { + return socket.ControlMessages{}, syserror.EINVAL + } + + for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { + fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) + } + + i += binary.AlignUp(length, width) + + case linux.SCM_CREDENTIALS: + if length < linux.SizeOfControlMessageCredentials { + return socket.ControlMessages{}, syserror.EINVAL + } + + var creds linux.ControlMessageCredentials + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) + scmCreds, err := NewSCMCredentials(t, creds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Credentials = scmCreds + i += binary.AlignUp(length, width) + + default: + // Unknown message type. 
+ return socket.ControlMessages{}, syserror.EINVAL + } + case linux.SOL_IP: + switch h.Type { + case linux.IP_TOS: + if length < linux.SizeOfControlMessageTOS { + return socket.ControlMessages{}, syserror.EINVAL + } + cmsgs.IP.HasTOS = true + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS) + i += binary.AlignUp(length, width) + + case linux.IP_PKTINFO: + if length < linux.SizeOfControlMessageIPPacketInfo { + return socket.ControlMessages{}, syserror.EINVAL + } + + cmsgs.IP.HasIPPacketInfo = true + var packetInfo linux.ControlMessageIPPacketInfo + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo) + + cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo) + i += binary.AlignUp(length, width) + + default: + return socket.ControlMessages{}, syserror.EINVAL + } + case linux.SOL_IPV6: + switch h.Type { + case linux.IPV6_TCLASS: + if length < linux.SizeOfControlMessageTClass { + return socket.ControlMessages{}, syserror.EINVAL + } + cmsgs.IP.HasTClass = true + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass) + i += binary.AlignUp(length, width) + + default: + return socket.ControlMessages{}, syserror.EINVAL + } + default: + return socket.ControlMessages{}, syserror.EINVAL + } + } + + if cmsgs.Unix.Credentials == nil { + cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint) + } + + if len(fds) > 0 { + if kernel.VFS2Enabled { + rights, err := NewSCMRightsVFS2(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights + } else { + rights, err := NewSCMRights(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights + } + } + + return cmsgs, nil +} + +func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { + if t == nil || socketOrEndpoint == nil { + return nil + } + if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { + return MakeCreds(t) + } + return nil +} + +// MakeCreds creates default SCMCredentials. +func MakeCreds(t *kernel.Task) SCMCredentials { + if t == nil { + return nil + } + tcred := t.Credentials() + return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} +} + +// LINT.IfChange + +// New creates default control messages if needed. +func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { + return transport.ControlMessages{ + Credentials: makeCreds(t, socketOrEndpoint), + Rights: rights, + } +} + +// LINT.ThenChange(./control_vfs2.go) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go new file mode 100644 index 000000000..fd08179be --- /dev/null +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package control + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SCMRightsVFS2 represents a SCM_RIGHTS socket control message. +type SCMRightsVFS2 interface { + transport.RightsControlMessage + + // Files returns up to max RightsFiles. + // + // Returned files are consumed and ownership is transferred to the caller. + // Subsequent calls to Files will return the next files. + Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool) +} + +// RightsFiles represents a SCM_RIGHTS socket control message. A reference is +// maintained for each vfs.FileDescription and is release either when an FD is created or +// when the Release method is called. +type RightsFilesVFS2 []*vfs.FileDescription + +// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message +// representation using local sentry FDs. +func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { + files := make(RightsFilesVFS2, 0, len(fds)) + for _, fd := range fds { + file := t.GetFileVFS2(fd) + if file == nil { + files.Release() + return nil, syserror.EBADF + } + files = append(files, file) + } + return &files, nil +} + +// Files implements SCMRights.Files. +func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(*fs); n > l { + n = l + } else if n < l { + trunc = true + } + rf := (*fs)[:n] + *fs = (*fs)[n:] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { + nfs := append(RightsFilesVFS2(nil), *fs...) + for _, nf := range nfs { + nf.IncRef() + } + return &nfs +} + +// Release implements transport.RightsControlMessage.Release. +func (fs *RightsFilesVFS2) Release() { + for _, f := range *fs { + f.DecRef() + } + *fs = nil +} + +// rightsFDsVFS2 gets up to the specified maximum number of FDs. +func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) { + files, trunc := rights.Files(t, max) + fds := make([]int32, 0, len(files)) + for i := 0; i < max && len(files) > 0; i++ { + fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) + files[0].DecRef() + files = files[1:] + if err != nil { + t.Warningf("Error inserting FD: %v", err) + // This is what Linux does. + break + } + + fds = append(fds, int32(fd)) + } + return fds, trunc +} + +// PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf. +func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) { + maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 + // Linux does not return any FDs if none fit. + if maxFDs <= 0 { + flags |= linux.MSG_CTRUNC + return buf, flags + } + fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs) + if trunc { + flags |= linux.MSG_CTRUNC + } + align := t.Arch().Width() + return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) +} + +// NewVFS2 creates default control messages if needed. 
+func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages { + return transport.ControlMessages{ + Credentials: makeCreds(t, socketOrEndpoint), + Rights: rights, + } +} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD new file mode 100644 index 000000000..ff81ea6e6 --- /dev/null +++ b/pkg/sentry/socket/hostinet/BUILD @@ -0,0 +1,45 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "hostinet", + srcs = [ + "device.go", + "hostinet.go", + "save_restore.go", + "socket.go", + "socket_unsafe.go", + "socket_vfs2.go", + "sockopt_impl.go", + "stack.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/log", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/hostfd", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/vfs", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip/stack", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go new file mode 100644 index 000000000..27049d65f --- /dev/null +++ b/pkg/sentry/socket/hostinet/device.go @@ -0,0 +1,19 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import "gvisor.dev/gvisor/pkg/sentry/device" + +var socketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go new file mode 100644 index 000000000..0d6f51d2b --- /dev/null +++ b/pkg/sentry/socket/hostinet/hostinet.go @@ -0,0 +1,17 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostinet implements AF_INET and AF_INET6 sockets using the host's +// network stack. +package hostinet diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go new file mode 100644 index 000000000..1dec33897 --- /dev/null +++ b/pkg/sentry/socket/hostinet/save_restore.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +// beforeSave is invoked by stateify. +func (*socketOperations) beforeSave() { + panic("host.socketOperations is not savable") +} diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go new file mode 100644 index 000000000..a92aed2c9 --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket.go @@ -0,0 +1,713 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "fmt" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + sizeofInt32 = 4 + + // sizeofSockaddr is the size in bytes of the largest sockaddr type + // supported by this package. + sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in) + + // maxControlLen is the maximum size of a control message buffer used in a + // recvmsg or sendmsg syscall. + maxControlLen = 1024 +) + +// LINT.IfChange + +// socketOperations implements fs.FileOperations and socket.Socket for a socket +// implemented using a host socket. +type socketOperations struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + socket.SendReceiveTimeout + + family int // Read-only. + stype linux.SockType // Read-only. + protocol int // Read-only. + queue waiter.Queue + + // fd is the host socket fd. It must have O_NONBLOCK, so that operations + // will return EWOULDBLOCK instead of blocking on the host. 
This allows us to + // handle blocking behavior independently in the sentry. + fd int +} + +var _ = socket.Socket(&socketOperations{}) + +func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { + s := &socketOperations{ + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, + } + if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { + return nil, syserr.FromError(err) + } + dirent := socket.NewDirent(ctx, socketDevice) + defer dirent.DecRef() + return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil +} + +// Release implements fs.FileOperations.Release. +func (s *socketOpsCommon) Release() { + fdnotifier.RemoveFD(int32(s.fd)) + syscall.Close(s.fd) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(s.fd), mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(s.fd)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { + s.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(s.fd)) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, io, args) +} + +// Read implements fs.FileOperations.Read. +func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of dst.Addrs was unusable. + if uint64(dst.NumBytes()) != dsts.NumBytes() { + return 0, nil + } + if dsts.IsEmpty() { + return 0, nil + } + if dsts.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + n, err := syscall.Read(s.fd, dsts.Head().ToSlice()) + if err != nil { + return 0, translateIOSyscallError(err) + } + return uint64(n), nil + } + return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) + })) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of src.Addrs was unusable. + if uint64(src.NumBytes()) != srcs.NumBytes() { + return 0, nil + } + if srcs.IsEmpty() { + return 0, nil + } + if srcs.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + n, err := syscall.Write(s.fd, srcs.Head().ToSlice()) + if err != nil { + return 0, translateIOSyscallError(err) + } + return uint64(n), nil + } + return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) + })) + return int64(n), err +} + +// Connect implements socket.Socket.Connect. 
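+//
+// The host fd is always non-blocking, so a blocking connect is emulated in
+// the sentry: issue connect(2); on EINPROGRESS, wait for the socket to become
+// writable, then read SO_ERROR to learn the outcome (see the body below).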
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + if len(sockaddr) > sizeofSockaddr { + sockaddr = sockaddr[:sizeofSockaddr] + } + + _, _, errno := syscall.Syscall(syscall.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) + + if errno == 0 { + return nil + } + if errno != syscall.EINPROGRESS || !blocking { + return syserr.FromError(translateIOSyscallError(errno)) + } + + // "EINPROGRESS: The socket is nonblocking and the connection cannot be + // completed immediately. It is possible to select(2) or poll(2) for + // completion by selecting the socket for writing. After select(2) + // indicates writability, use getsockopt(2) to read the SO_ERROR option at + // level SOL-SOCKET to determine whether connect() completed successfully + // (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error + // codes listed here, explaining the reason for the failure)." - connect(2) + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + if s.Readiness(waiter.EventOut)&waiter.EventOut == 0 { + if err := t.Block(ch); err != nil { + return syserr.FromError(err) + } + } + val, err := syscall.GetsockoptInt(s.fd, syscall.SOL_SOCKET, syscall.SO_ERROR) + if err != nil { + return syserr.FromError(err) + } + if val != 0 { + return syserr.FromError(syscall.Errno(uintptr(val))) + } + return nil +} + +// Accept implements socket.Socket.Accept. +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + var peerAddr linux.SockAddr + var peerAddrBuf []byte + var peerAddrlen uint32 + var peerAddrPtr *byte + var peerAddrlenPtr *uint32 + if peerRequested { + peerAddrBuf = make([]byte, sizeofSockaddr) + peerAddrlen = uint32(len(peerAddrBuf)) + peerAddrPtr = &peerAddrBuf[0] + peerAddrlenPtr = &peerAddrlen + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOpsCommon requires it. 
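+	// SOCK_CLOEXEC is also set so the host fd does not leak across a host
+	// exec; the application-visible flags are applied when the sentry FD is
+	// installed below (CloseOnExec from SOCK_CLOEXEC, non-blocking from
+	// SOCK_NONBLOCK).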
+ fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) + if blocking { + var ch chan struct{} + for syscallErr == syserror.ErrWouldBlock { + if ch != nil { + if syscallErr = t.Block(ch); syscallErr != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + } + fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) + } + } + + if peerRequested { + peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen]) + } + if syscallErr != nil { + return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) + } + + var ( + kfd int32 + kerr error + ) + if kernel.VFS2Enabled { + f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&syscall.SOCK_NONBLOCK)) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() + + kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocketVFS2(f) + } else { + f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() + + kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocket(f) + } + + return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) +} + +// Bind implements socket.Socket.Bind. +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + if len(sockaddr) > sizeofSockaddr { + sockaddr = sockaddr[:sizeofSockaddr] + } + + _, _, errno := syscall.Syscall(syscall.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) + if errno != 0 { + return syserr.FromError(errno) + } + return nil +} + +// Listen implements socket.Socket.Listen. +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { + return syserr.FromError(syscall.Listen(s.fd, backlog)) +} + +// Shutdown implements socket.Socket.Shutdown. +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { + switch how { + case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR: + return syserr.FromError(syscall.Shutdown(s.fd, how)) + default: + return syserr.ErrInvalidArgument + } +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + if outLen < 0 { + return nil, syserr.ErrInvalidArgument + } + + // Only allow known and safe options. 
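+	// Each permitted (level, name) pair maps to a fixed option length;
+	// anything not listed keeps optlen == 0 and fails below with
+	// ENOPROTOOPT. For example, SO_RCVBUF is forwarded to the host with a
+	// 4-byte buffer, while an unlisted option is rejected without ever
+	// reaching the host.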
+ optlen := getSockOptLen(t, level, name) + switch level { + case linux.SOL_IP: + switch name { + case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO: + optlen = sizeofInt32 + } + case linux.SOL_IPV6: + switch name { + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: + optlen = sizeofInt32 + } + case linux.SOL_SOCKET: + switch name { + case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: + optlen = sizeofInt32 + case linux.SO_LINGER: + optlen = syscall.SizeofLinger + } + case linux.SOL_TCP: + switch name { + case linux.TCP_NODELAY: + optlen = sizeofInt32 + case linux.TCP_INFO: + optlen = int(linux.SizeOfTCPInfo) + } + } + + if optlen == 0 { + return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT + } + if outLen < optlen { + return nil, syserr.ErrInvalidArgument + } + + opt, err := getsockopt(s.fd, level, name, optlen) + if err != nil { + return nil, syserr.FromError(err) + } + return opt, nil +} + +// SetSockOpt implements socket.Socket.SetSockOpt. +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { + // Only allow known and safe options. + optlen := setSockOptLen(t, level, name) + switch level { + case linux.SOL_IP: + switch name { + case linux.IP_TOS, linux.IP_RECVTOS: + optlen = sizeofInt32 + case linux.IP_PKTINFO: + optlen = linux.SizeOfControlMessageIPPacketInfo + } + case linux.SOL_IPV6: + switch name { + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: + optlen = sizeofInt32 + } + case linux.SOL_SOCKET: + switch name { + case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: + optlen = sizeofInt32 + } + case linux.SOL_TCP: + switch name { + case linux.TCP_NODELAY: + optlen = sizeofInt32 + } + } + + if optlen == 0 { + // Pretend to accept socket options we don't understand. This seems + // dangerous, but it's what netstack does... + return nil + } + if len(opt) < optlen { + return syserr.ErrInvalidArgument + } + opt = opt[:optlen] + + _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(s.fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(len(opt)), 0) + if errno != 0 { + return syserr.FromError(errno) + } + return nil +} + +// RecvMsg implements socket.Socket.RecvMsg. +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { + // Only allow known and safe flags. + // + // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary + // messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the + // Socket interface's dependence on netstack. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument + } + + var senderAddr linux.SockAddr + var senderAddrBuf []byte + if senderRequested { + senderAddrBuf = make([]byte, sizeofSockaddr) + } + + var controlBuf []byte + var msgFlags int + + recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of dst.Addrs was unusable. + if uint64(dst.NumBytes()) != dsts.NumBytes() { + return 0, nil + } + if dsts.IsEmpty() { + return 0, nil + } + + // We always do a non-blocking recv*(). 
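+		// Blocking, when requested, is handled by the retry loop in
+		// RecvMsg below: it registers for waiter.EventIn, blocks (with
+		// any deadline) and re-issues this closure.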
+ sysflags := flags | syscall.MSG_DONTWAIT + + iovs := safemem.IovecsFromBlockSeq(dsts) + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + } + if len(senderAddrBuf) != 0 { + msg.Name = &senderAddrBuf[0] + msg.Namelen = uint32(len(senderAddrBuf)) + } + if controlLen > 0 { + if controlLen > maxControlLen { + controlLen = maxControlLen + } + controlBuf = make([]byte, controlLen) + msg.Control = &controlBuf[0] + msg.Controllen = controlLen + } + n, err := recvmsg(s.fd, &msg, sysflags) + if err != nil { + return 0, err + } + senderAddrBuf = senderAddrBuf[:msg.Namelen] + msgFlags = int(msg.Flags) + controlLen = uint64(msg.Controllen) + return n, nil + }) + + var ch chan struct{} + n, err := dst.CopyOutFrom(t, recvmsgToBlocks) + if flags&syscall.MSG_DONTWAIT == 0 { + for err == syserror.ErrWouldBlock { + // We only expect blocking to come from the actual syscall, in which + // case it can't have returned any data. + if n != 0 { + panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err)) + } + if ch != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + } + n, err = dst.CopyOutFrom(t, recvmsgToBlocks) + } + } + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + + if senderRequested { + senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf) + } + + unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf[:controlLen]) + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + + controlMessages := socket.ControlMessages{} + for _, unixCmsg := range unixControlMessages { + switch unixCmsg.Header.Level { + case syscall.SOL_IP: + switch unixCmsg.Header.Type { + case syscall.IP_TOS: + controlMessages.IP.HasTOS = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS) + + case syscall.IP_PKTINFO: + controlMessages.IP.HasIPPacketInfo = true + var packetInfo linux.ControlMessageIPPacketInfo + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo) + controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo) + } + + case syscall.SOL_IPV6: + switch unixCmsg.Header.Type { + case syscall.IPV6_TCLASS: + controlMessages.IP.HasTClass = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], usermem.ByteOrder, &controlMessages.IP.TClass) + } + } + } + + return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), controlMessages, nil +} + +// SendMsg implements socket.Socket.SendMsg. +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { + // Only allow known and safe flags. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + space := uint64(control.CmsgsSpace(t, controlMessages)) + if space > maxControlLen { + space = maxControlLen + } + controlBuf := make([]byte, 0, space) + // PackControlMessages will append up to space bytes to controlBuf. 
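+	// controlBuf has length 0 and capacity space, so the Pack* helpers
+	// append in place and silently drop any message that would not fit
+	// within that capacity.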
+ controlBuf = control.PackControlMessages(t, controlMessages, controlBuf) + + sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of src.Addrs was unusable. + if uint64(src.NumBytes()) != srcs.NumBytes() { + return 0, nil + } + if srcs.IsEmpty() && len(controlBuf) == 0 { + return 0, nil + } + + // We always do a non-blocking send*(). + sysflags := flags | syscall.MSG_DONTWAIT + + if srcs.NumBlocks() == 1 && len(controlBuf) == 0 { + // Skip allocating []syscall.Iovec. + src := srcs.Head() + n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil + } + + iovs := safemem.IovecsFromBlockSeq(srcs) + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + } + if len(to) != 0 { + msg.Name = &to[0] + msg.Namelen = uint32(len(to)) + } + if len(controlBuf) != 0 { + msg.Control = &controlBuf[0] + msg.Controllen = uint64(len(controlBuf)) + } + return sendmsg(s.fd, &msg, sysflags) + }) + + var ch chan struct{} + n, err := src.CopyInTo(t, sendmsgFromBlocks) + if flags&syscall.MSG_DONTWAIT == 0 { + for err == syserror.ErrWouldBlock { + // We only expect blocking to come from the actual syscall, in which + // case it can't have returned any data. + if n != 0 { + panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err)) + } + if ch != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + } + n, err = src.CopyInTo(t, sendmsgFromBlocks) + } + } + + return int(n), syserr.FromError(err) +} + +func translateIOSyscallError(err error) error { + if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +// State implements socket.Socket.State. +func (s *socketOpsCommon) State() uint32 { + info := linux.TCPInfo{} + buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo) + if err != nil { + if err != syscall.ENOPROTOOPT { + log.Warningf("Failed to get TCP socket info from %+v: %v", s, err) + } + // For non-TCP sockets, silently ignore the failure. + return 0 + } + if len(buf) != linux.SizeOfTCPInfo { + // Unmarshal below will panic if getsockopt returns a buffer of + // unexpected size. + log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo) + return 0 + } + + binary.Unmarshal(buf, usermem.ByteOrder, &info) + return uint32(info.State) +} + +// Type implements socket.Socket.Type. +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { + return s.family, s.stype, s.protocol +} + +type socketProvider struct { + family int +} + +// Socket implements socket.Provider.Socket. +func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. 
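+	// SOCK_TYPE_MASK strips SOCK_NONBLOCK and SOCK_CLOEXEC first, so a
+	// request like socket(AF_INET, SOCK_STREAM|SOCK_NONBLOCK, 0) still
+	// matches the SOCK_STREAM case below; the non-blocking flag itself is
+	// honored when the file is created at the end of this function.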
+ stype := stypeflags & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} + +// LINT.ThenChange(./socket_vfs2.go) + +func init() { + for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { + socket.RegisterProvider(family, &socketProvider{family}) + socket.RegisterProviderVFS2(family, &socketProviderVFS2{}) + } +} diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go new file mode 100644 index 000000000..3f420c2ec --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -0,0 +1,139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func firstBytePtr(bs []byte) unsafe.Pointer { + if bs == nil { + return nil + } + return unsafe.Pointer(&bs[0]) +} + +// Preconditions: len(dsts) != 0. +func readv(fd int, dsts []syscall.Iovec) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&dsts[0])), uintptr(len(dsts))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +// Preconditions: len(srcs) != 0. 
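+// (The precondition matters because the wrapper passes &srcs[0] straight to
+// the writev(2) syscall; an empty slice would panic before reaching the host.)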
+func writev(fd int, srcs []syscall.Iovec) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&srcs[0])), uintptr(len(srcs))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := uintptr(args[1].Int()); cmd { + case syscall.TIOCINQ, syscall.TIOCOUTQ: + var val int32 + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { + return 0, translateIOSyscallError(errno) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], uint32(val)) + _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func accept4(fd int, addr *byte, addrlen *uint32, flags int) (int, error) { + afd, _, errno := syscall.Syscall6(syscall.SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(addr)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return int(afd), nil +} + +func getsockopt(fd int, level, name int, optlen int) ([]byte, error) { + opt := make([]byte, optlen) + optlen32 := int32(len(opt)) + _, _, errno := syscall.Syscall6(syscall.SYS_GETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(unsafe.Pointer(&optlen32)), 0) + if errno != 0 { + return nil, errno + } + return opt[:optlen32], nil +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { + addr := make([]byte, sizeofSockaddr) + addrlen := uint32(len(addr)) + _, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) + if errno != 0 { + return nil, 0, syserr.FromError(errno) + } + return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil +} + +// GetPeerName implements socket.Socket.GetPeerName. 
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { + addr := make([]byte, sizeofSockaddr) + addrlen := uint32(len(addr)) + _, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) + if errno != 0 { + return nil, 0, syserr.FromError(errno) + } + return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil +} + +func recvfrom(fd int, dst []byte, flags int, from *[]byte) (uint64, error) { + fromLen := uint32(len(*from)) + n, _, errno := syscall.Syscall6(syscall.SYS_RECVFROM, uintptr(fd), uintptr(firstBytePtr(dst)), uintptr(len(dst)), uintptr(flags), uintptr(firstBytePtr(*from)), uintptr(unsafe.Pointer(&fromLen))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + *from = (*from)[:fromLen] + return uint64(n), nil +} + +func recvmsg(fd int, msg *syscall.Msghdr, flags int) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +func sendmsg(fd int, msg *syscall.Msghdr, flags int) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go new file mode 100644 index 000000000..8f192c62f --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -0,0 +1,202 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +type socketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + // We store metadata for hostinet sockets internally. Technically, we should + // access metadata (e.g. through stat, chmod) on the host for correctness, + // but this is not very useful for inet socket fds, which do not belong to a + // concrete file anyway. 
+ vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&socketVFS2{}) + +func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + + s := &socketVFS2{ + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, + } + s.LockFD.Init(&vfs.FileLocks{}) + if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { + return nil, syserr.FromError(err) + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, uio, args) +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) + return int64(n), err +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) + return int64(n), err +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + +type socketProviderVFS2 struct { + family int +} + +// Socket implements socket.ProviderVFS2.Socket. +func (p *socketProviderVFS2) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. + stype := stypeflags & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newVFS2Socket(t, p.family, stype, protocol, fd, uint32(stypeflags&syscall.SOCK_NONBLOCK)) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProviderVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go new file mode 100644 index 000000000..8a783712e --- /dev/null +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +func getSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} + +func setSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go new file mode 100644 index 000000000..a48082631 --- /dev/null +++ b/pkg/sentry/socket/hostinet/stack.go @@ -0,0 +1,459 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "reflect" + "strconv" + "strings" + "syscall" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +var defaultRecvBufSize = inet.TCPBufferSize{ + Min: 4096, + Default: 87380, + Max: 6291456, +} + +var defaultSendBufSize = inet.TCPBufferSize{ + Min: 4096, + Default: 16384, + Max: 4194304, +} + +// Stack implements inet.Stack for host sockets. +type Stack struct { + // Stack is immutable. + interfaces map[int32]inet.Interface + interfaceAddrs map[int32][]inet.InterfaceAddr + routes []inet.Route + supportsIPv6 bool + tcpRecvBufSize inet.TCPBufferSize + tcpSendBufSize inet.TCPBufferSize + tcpSACKEnabled bool + netDevFile *os.File + netSNMPFile *os.File +} + +// NewStack returns an empty Stack containing no configuration. +func NewStack() *Stack { + return &Stack{ + interfaces: make(map[int32]inet.Interface), + interfaceAddrs: make(map[int32][]inet.InterfaceAddr), + } +} + +// Configure sets up the stack using the current state of the host network. +func (s *Stack) Configure() error { + if err := addHostInterfaces(s); err != nil { + return err + } + + if err := addHostRoutes(s); err != nil { + return err + } + + if _, err := os.Stat("/proc/net/if_inet6"); err == nil { + s.supportsIPv6 = true + } + + s.tcpRecvBufSize = defaultRecvBufSize + if tcpRMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_rmem"); err == nil { + s.tcpRecvBufSize = tcpRMem + } else { + log.Warningf("Failed to read TCP receive buffer size, using default values") + } + + s.tcpSendBufSize = defaultSendBufSize + if tcpWMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_wmem"); err == nil { + s.tcpSendBufSize = tcpWMem + } else { + log.Warningf("Failed to read TCP send buffer size, using default values") + } + + // SACK is important for performance and even compatibility, assume it's + // enabled if we can't find the actual value. + s.tcpSACKEnabled = true + if sack, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_sack"); err == nil { + s.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" + } else { + log.Warningf("Failed to read if TCP SACK if enabled, setting to true") + } + + if f, err := os.Open("/proc/net/dev"); err != nil { + log.Warningf("Failed to open /proc/net/dev: %v", err) + } else { + s.netDevFile = f + } + + if f, err := os.Open("/proc/net/snmp"); err != nil { + log.Warningf("Failed to open /proc/net/snmp: %v", err) + } else { + s.netSNMPFile = f + } + + return nil +} + +// ExtractHostInterfaces will populate an interface map and +// interfaceAddrs map with the results of the equivalent +// netlink messages. 
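Note: Configure above prefers values read from procfs but falls back to hard-coded defaults whenever a read or parse fails. A stand-alone sketch of that fallback pattern, not part of this change, using plain strconv/strings parsing rather than the sentry's usermem helpers (the proc path is the same one read by readTCPBufferSizeFile):

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

type tcpBufferSize struct{ Min, Default, Max int }

// readTriple parses a "min default max" procfs file such as
// /proc/sys/net/ipv4/tcp_rmem, returning fallback if the read or the parse
// fails, mirroring Configure's defensive behaviour.
func readTriple(path string, fallback tcpBufferSize) tcpBufferSize {
	data, err := os.ReadFile(path)
	if err != nil {
		return fallback
	}
	fields := strings.Fields(string(data))
	if len(fields) != 3 {
		return fallback
	}
	var vals [3]int
	for i, f := range fields {
		v, err := strconv.Atoi(f)
		if err != nil {
			return fallback
		}
		vals[i] = v
	}
	return tcpBufferSize{Min: vals[0], Default: vals[1], Max: vals[2]}
}

func main() {
	defaults := tcpBufferSize{Min: 4096, Default: 87380, Max: 6291456}
	fmt.Printf("%+v\n", readTriple("/proc/sys/net/ipv4/tcp_rmem", defaults))
}

tcp_wmem follows the same shape, and the tcp_sack read reduces to a single "is the trimmed value not 0" check.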
+func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.NetlinkMessage, interfaces map[int32]inet.Interface, interfaceAddrs map[int32][]inet.InterfaceAddr) error { + for _, link := range links { + if link.Header.Type != syscall.RTM_NEWLINK { + continue + } + if len(link.Data) < syscall.SizeofIfInfomsg { + return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid data length (%d bytes, expected at least %d bytes)", len(link.Data), syscall.SizeofIfInfomsg) + } + var ifinfo syscall.IfInfomsg + binary.Unmarshal(link.Data[:syscall.SizeofIfInfomsg], usermem.ByteOrder, &ifinfo) + inetIF := inet.Interface{ + DeviceType: ifinfo.Type, + Flags: ifinfo.Flags, + } + // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the + // syscall.NetlinkMessage.Header.Type and skip the struct ifinfomsg + // accordingly. + attrs, err := syscall.ParseNetlinkRouteAttr(&link) + if err != nil { + return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid rtattrs: %v", err) + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.IFLA_ADDRESS: + inetIF.Addr = attr.Value + case syscall.IFLA_IFNAME: + inetIF.Name = string(attr.Value[:len(attr.Value)-1]) + } + } + interfaces[ifinfo.Index] = inetIF + } + + for _, addr := range addrs { + if addr.Header.Type != syscall.RTM_NEWADDR { + continue + } + if len(addr.Data) < syscall.SizeofIfAddrmsg { + return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid data length (%d bytes, expected at least %d bytes)", len(addr.Data), syscall.SizeofIfAddrmsg) + } + var ifaddr syscall.IfAddrmsg + binary.Unmarshal(addr.Data[:syscall.SizeofIfAddrmsg], usermem.ByteOrder, &ifaddr) + inetAddr := inet.InterfaceAddr{ + Family: ifaddr.Family, + PrefixLen: ifaddr.Prefixlen, + Flags: ifaddr.Flags, + } + attrs, err := syscall.ParseNetlinkRouteAttr(&addr) + if err != nil { + return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid rtattrs: %v", err) + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.IFA_ADDRESS: + inetAddr.Addr = attr.Value + } + } + interfaceAddrs[int32(ifaddr.Index)] = append(interfaceAddrs[int32(ifaddr.Index)], inetAddr) + } + + return nil +} + +// ExtractHostRoutes populates the given routes slice with the data from the +// host route table. +func ExtractHostRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error) { + var routes []inet.Route + for _, routeMsg := range routeMsgs { + if routeMsg.Header.Type != syscall.RTM_NEWROUTE { + continue + } + + var ifRoute syscall.RtMsg + binary.Unmarshal(routeMsg.Data[:syscall.SizeofRtMsg], usermem.ByteOrder, &ifRoute) + inetRoute := inet.Route{ + Family: ifRoute.Family, + DstLen: ifRoute.Dst_len, + SrcLen: ifRoute.Src_len, + TOS: ifRoute.Tos, + Table: ifRoute.Table, + Protocol: ifRoute.Protocol, + Scope: ifRoute.Scope, + Type: ifRoute.Type, + Flags: ifRoute.Flags, + } + + // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the + // syscall.NetlinkMessage.Header.Type and skip the struct rtmsg + // accordingly. 
+ attrs, err := syscall.ParseNetlinkRouteAttr(&routeMsg) + if err != nil { + return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid rtattrs: %v", err) + } + + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.RTA_DST: + inetRoute.DstAddr = attr.Value + case syscall.RTA_SRC: + inetRoute.SrcAddr = attr.Value + case syscall.RTA_GATEWAY: + inetRoute.GatewayAddr = attr.Value + case syscall.RTA_OIF: + expected := int(binary.Size(inetRoute.OutputInterface)) + if len(attr.Value) != expected { + return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid attribute data length (%d bytes, expected %d bytes)", len(attr.Value), expected) + } + binary.Unmarshal(attr.Value, usermem.ByteOrder, &inetRoute.OutputInterface) + } + } + + routes = append(routes, inetRoute) + } + + return routes, nil +} + +func addHostInterfaces(s *Stack) error { + links, err := doNetlinkRouteRequest(syscall.RTM_GETLINK) + if err != nil { + return fmt.Errorf("RTM_GETLINK failed: %v", err) + } + + addrs, err := doNetlinkRouteRequest(syscall.RTM_GETADDR) + if err != nil { + return fmt.Errorf("RTM_GETADDR failed: %v", err) + } + + return ExtractHostInterfaces(links, addrs, s.interfaces, s.interfaceAddrs) +} + +func addHostRoutes(s *Stack) error { + routes, err := doNetlinkRouteRequest(syscall.RTM_GETROUTE) + if err != nil { + return fmt.Errorf("RTM_GETROUTE failed: %v", err) + } + + s.routes, err = ExtractHostRoutes(routes) + if err != nil { + return err + } + + return nil +} + +func doNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) { + data, err := syscall.NetlinkRIB(req, syscall.AF_UNSPEC) + if err != nil { + return nil, err + } + return syscall.ParseNetlinkMessage(data) +} + +func readTCPBufferSizeFile(filename string) (inet.TCPBufferSize, error) { + contents, err := ioutil.ReadFile(filename) + if err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, err) + } + ioseq := usermem.BytesIOSequence(contents) + fields := make([]int32, 3) + if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) + } + return inet.TCPBufferSize{ + Min: int(fields[0]), + Default: int(fields[1]), + Max: int(fields[2]), + }, nil +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + interfaces := make(map[int32]inet.Interface) + for k, v := range s.interfaces { + interfaces[k] = v + } + return interfaces +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. +func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + addrs := make(map[int32][]inet.InterfaceAddr) + for k, v := range s.interfaceAddrs { + addrs[k] = append([]inet.InterfaceAddr(nil), v...) + } + return addrs +} + +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + return syserror.EACCES +} + +// SupportsIPv6 implements inet.Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return s.supportsIPv6 +} + +// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. +func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { + return s.tcpRecvBufSize, nil +} + +// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. 
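Note: doNetlinkRouteRequest and the Extract* helpers above are thin wrappers over the standard library's netlink support. As a rough stand-alone analogue (Linux-only, not part of this change), the same RTM_GETLINK round trip can be exercised like this, pulling out interface names the way ExtractHostInterfaces reads IFLA_IFNAME:

package main

import (
	"fmt"
	"strings"
	"syscall"
)

func main() {
	// The same request doNetlinkRouteRequest issues for links.
	data, err := syscall.NetlinkRIB(syscall.RTM_GETLINK, syscall.AF_UNSPEC)
	if err != nil {
		panic(err)
	}
	msgs, err := syscall.ParseNetlinkMessage(data)
	if err != nil {
		panic(err)
	}
	for i := range msgs {
		if msgs[i].Header.Type != syscall.RTM_NEWLINK {
			continue
		}
		// ParseNetlinkRouteAttr skips the leading struct ifinfomsg for us.
		attrs, err := syscall.ParseNetlinkRouteAttr(&msgs[i])
		if err != nil {
			panic(err)
		}
		for _, attr := range attrs {
			if attr.Attr.Type == syscall.IFLA_IFNAME {
				fmt.Println("link:", strings.TrimRight(string(attr.Value), "\x00"))
			}
		}
	}
}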
+func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { + return syserror.EACCES +} + +// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. +func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { + return s.tcpSendBufSize, nil +} + +// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. +func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { + return syserror.EACCES +} + +// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. +func (s *Stack) TCPSACKEnabled() (bool, error) { + return s.tcpSACKEnabled, nil +} + +// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. +func (s *Stack) SetTCPSACKEnabled(enabled bool) error { + return syserror.EACCES +} + +// getLine reads one line from proc file, with specified prefix. +// The last argument, withHeader, specifies if it contains line header. +func getLine(f *os.File, prefix string, withHeader bool) string { + data := make([]byte, 4096) + + if _, err := f.Seek(0, 0); err != nil { + return "" + } + + if _, err := io.ReadFull(f, data); err != io.ErrUnexpectedEOF { + return "" + } + + prefix = prefix + ":" + lines := strings.Split(string(data), "\n") + for _, l := range lines { + l = strings.TrimSpace(l) + if strings.HasPrefix(l, prefix) { + if withHeader { + withHeader = false + continue + } + return l + } + } + return "" +} + +func toSlice(i interface{}) []uint64 { + v := reflect.Indirect(reflect.ValueOf(i)) + return v.Slice(0, v.Len()).Interface().([]uint64) +} + +// Statistics implements inet.Stack.Statistics. +func (s *Stack) Statistics(stat interface{}, arg string) error { + var ( + snmpTCP bool + rawLine string + sliceStat []uint64 + ) + + switch stat.(type) { + case *inet.StatDev: + if s.netDevFile == nil { + return fmt.Errorf("/proc/net/dev is not opened for hostinet") + } + rawLine = getLine(s.netDevFile, arg, false /* with no header */) + case *inet.StatSNMPIP, *inet.StatSNMPICMP, *inet.StatSNMPICMPMSG, *inet.StatSNMPTCP, *inet.StatSNMPUDP, *inet.StatSNMPUDPLite: + if s.netSNMPFile == nil { + return fmt.Errorf("/proc/net/snmp is not opened for hostinet") + } + rawLine = getLine(s.netSNMPFile, arg, true) + default: + return syserr.ErrEndpointOperation.ToError() + } + + if rawLine == "" { + return fmt.Errorf("Failed to get raw line") + } + + parts := strings.SplitN(rawLine, ":", 2) + if len(parts) != 2 { + return fmt.Errorf("Failed to get prefix from: %q", rawLine) + } + + sliceStat = toSlice(stat) + fields := strings.Fields(strings.TrimSpace(parts[1])) + if len(fields) != len(sliceStat) { + return fmt.Errorf("Failed to parse fields: %q", rawLine) + } + if _, ok := stat.(*inet.StatSNMPTCP); ok { + snmpTCP = true + } + for i := 0; i < len(sliceStat); i++ { + var err error + if snmpTCP && i == 3 { + var tmp int64 + // MaxConn field is signed, RFC 2012. + tmp, err = strconv.ParseInt(fields[i], 10, 64) + sliceStat[i] = uint64(tmp) // Convert back to int before use. + } else { + sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64) + } + if err != nil { + return fmt.Errorf("Failed to parse field %d from: %q, %v", i, rawLine, err) + } + } + + return nil +} + +// RouteTable implements inet.Stack.RouteTable. +func (s *Stack) RouteTable() []inet.Route { + return append([]inet.Route(nil), s.routes...) +} + +// Resume implements inet.Stack.Resume. +func (s *Stack) Resume() {} + +// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. 
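Note: getLine and Statistics above parse /proc/net/dev and /proc/net/snmp by prefix; in the snmp file the first line with a given prefix is a column header and the second carries the values, and the MaxConn column is the one signed field. A rough stand-alone sketch of that parsing (Linux-only, not part of this change):

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// tcpValueLine returns the second line starting with "Tcp:"; the first such
// line is the column header, which is why getLine takes a withHeader flag.
func tcpValueLine(data string) string {
	sawHeader := false
	for _, l := range strings.Split(data, "\n") {
		l = strings.TrimSpace(l)
		if !strings.HasPrefix(l, "Tcp:") {
			continue
		}
		if !sawHeader {
			sawHeader = true
			continue
		}
		return l
	}
	return ""
}

func main() {
	data, err := os.ReadFile("/proc/net/snmp")
	if err != nil {
		panic(err)
	}
	fields := strings.Fields(strings.TrimPrefix(tcpValueLine(string(data)), "Tcp:"))
	for i, f := range fields {
		if i == 3 {
			// MaxConn is signed (RFC 2012); every other counter is unsigned.
			v, _ := strconv.ParseInt(f, 10, 64)
			fmt.Println(i, v)
			continue
		}
		v, _ := strconv.ParseUint(f, 10, 64)
		fmt.Println(i, v)
	}
}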
+func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } + +// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } + +// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD new file mode 100644 index 000000000..721094bbf --- /dev/null +++ b/pkg/sentry/socket/netfilter/BUILD @@ -0,0 +1,29 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "netfilter", + srcs = [ + "extensions.go", + "netfilter.go", + "owner_matcher.go", + "targets.go", + "tcp_matcher.go", + "udp_matcher.go", + ], + # This target depends on netstack and should only be used by epsocket, + # which is allowed to depend on netstack. + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/log", + "//pkg/sentry/kernel", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go new file mode 100644 index 000000000..0336a32d8 --- /dev/null +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -0,0 +1,95 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +// TODO(gvisor.dev/issue/170): The following per-matcher params should be +// supported: +// - Table name +// - Match size +// - User size +// - Hooks +// - Proto +// - Family + +// matchMaker knows how to (un)marshal the matcher named name(). +type matchMaker interface { + // name is the matcher name as stored in the xt_entry_match struct. + name() string + + // marshal converts from an stack.Matcher to an ABI struct. + marshal(matcher stack.Matcher) []byte + + // unmarshal converts from the ABI matcher struct to an + // stack.Matcher. + unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) +} + +// matchMakers maps the name of supported matchers to the matchMaker that +// marshals and unmarshals it. It is immutable after package initialization. +var matchMakers = map[string]matchMaker{} + +// registermatchMaker should be called by match extensions to register them +// with the netfilter package. 
+func registerMatchMaker(mm matchMaker) { + if _, ok := matchMakers[mm.name()]; ok { + panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name())) + } + matchMakers[mm.name()] = mm +} + +func marshalMatcher(matcher stack.Matcher) []byte { + matchMaker, ok := matchMakers[matcher.Name()] + if !ok { + panic(fmt.Sprintf("Unknown matcher of type %T.", matcher)) + } + return matchMaker.marshal(matcher) +} + +// marshalEntryMatch creates a marshalled XTEntryMatch with the given name and +// data appended at the end. +func marshalEntryMatch(name string, data []byte) []byte { + nflog("marshaling matcher %q", name) + + // We have to pad this struct size to a multiple of 8 bytes. + size := binary.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8) + matcher := linux.KernelXTEntryMatch{ + XTEntryMatch: linux.XTEntryMatch{ + MatchSize: uint16(size), + }, + Data: data, + } + copy(matcher.Name[:], name) + + buf := make([]byte, 0, size) + buf = binary.Marshal(buf, usermem.ByteOrder, matcher) + return append(buf, make([]byte, size-len(buf))...) +} + +func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { + matchMaker, ok := matchMakers[match.Name.String()] + if !ok { + return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) + } + return matchMaker.unmarshal(buf, filter) +} diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go new file mode 100644 index 000000000..f7abe77d3 --- /dev/null +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -0,0 +1,761 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package netfilter helps the sentry interact with netstack's netfilter +// capabilities. +package netfilter + +import ( + "bytes" + "errors" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +// errorTargetName is used to mark targets as error targets. Error targets +// shouldn't be reached - an error has occurred if we fall through to one. +const errorTargetName = "ERROR" + +// redirectTargetName is used to mark targets as redirect targets. Redirect +// targets should be reached for only NAT and Mangle tables. These targets will +// change the destination port/destination IP for packets. +const redirectTargetName = "REDIRECT" + +// enableLogging controls whether to log the (de)serialization of netfilter +// structs between userspace and netstack. These logs are useful when +// developing iptables, but can pollute sentry logs otherwise. +const enableLogging = false + +// emptyFilter is for comparison with a rule's filters to determine whether it +// is also empty. It is immutable. 
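Note: marshalEntryMatch pads every serialized match out to an 8-byte boundary via binary.AlignUp, which is why the matcher unmarshal helpers later tolerate buffers larger than the struct they decode. A small sketch of that padding arithmetic (the header size below is only a stand-in for linux.SizeOfXTEntryMatch, not a value taken from this change):

package main

import "fmt"

// alignUp rounds length up to the next multiple of align (a power of two),
// the same padding marshalEntryMatch applies so each serialized
// xt_entry_match blob stays 8-byte aligned.
func alignUp(length, align uint) uint {
	return (length + align - 1) &^ (align - 1)
}

func main() {
	const headerSize = 32 // stand-in for linux.SizeOfXTEntryMatch
	for _, dataLen := range []uint{0, 1, 7, 8, 12} {
		unpadded := headerSize + dataLen
		fmt.Printf("data=%2d unpadded=%2d padded=%2d\n", dataLen, unpadded, alignUp(unpadded, 8))
	}
}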
+var emptyFilter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00", +} + +// nflog logs messages related to the writing and reading of iptables. +func nflog(format string, args ...interface{}) { + if enableLogging && log.IsLogging(log.Debug) { + log.Debugf("netfilter: "+format, args...) + } +} + +// GetInfo returns information about iptables. +func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { + // Read in the struct and table name. + var info linux.IPTGetinfo + if _, err := t.CopyIn(outPtr, &info); err != nil { + return linux.IPTGetinfo{}, syserr.FromError(err) + } + + _, info, err := convertNetstackToBinary(stack, info.Name) + if err != nil { + nflog("couldn't convert iptables: %v", err) + return linux.IPTGetinfo{}, syserr.ErrInvalidArgument + } + + nflog("returning info: %+v", info) + return info, nil +} + +// GetEntries returns netstack's iptables rules encoded for the iptables tool. +func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { + // Read in the struct and table name. + var userEntries linux.IPTGetEntries + if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + nflog("couldn't copy in entries %q", userEntries.Name) + return linux.KernelIPTGetEntries{}, syserr.FromError(err) + } + + // Convert netstack's iptables rules to something that the iptables + // tool can understand. + entries, _, err := convertNetstackToBinary(stack, userEntries.Name) + if err != nil { + nflog("couldn't read entries: %v", err) + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + } + if binary.Size(entries) > uintptr(outLen) { + nflog("insufficient GetEntries output size: %d", uintptr(outLen)) + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + } + + return entries, nil +} + +// convertNetstackToBinary converts the iptables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a bit, reading some offsets, +// jumping to those offsets, parsing again, etc. +func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) { + table, ok := stack.IPTables().GetTable(tablename.String()) + if !ok { + return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) + } + + var entries linux.KernelIPTGetEntries + var info linux.IPTGetinfo + info.ValidHooks = table.ValidHooks() + + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(tablename) { + return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) + } + copy(info.Name[:], tablename[:]) + copy(entries.Name[:], tablename[:]) + + for ruleIdx, rule := range table.Rules { + nflog("convert to binary: current offset: %d", entries.Size) + + // Is this a chain entry point? + for hook, hookRuleIdx := range table.BuiltinChains { + if hookRuleIdx == ruleIdx { + nflog("convert to binary: found hook %d at offset %d", hook, entries.Size) + info.HookEntry[hook] = entries.Size + } + } + // Is this a chain underflow point? 
+ for underflow, underflowRuleIdx := range table.Underflows { + if underflowRuleIdx == ruleIdx { + nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size) + info.Underflow[underflow] = entries.Size + } + } + + // Each rule corresponds to an entry. + entry := linux.KernelIPTEntry{ + IPTEntry: linux.IPTEntry{ + IP: linux.IPTIP{ + Protocol: uint16(rule.Filter.Protocol), + }, + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst) + copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src) + copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask) + copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + if rule.Filter.DstInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP + } + if rule.Filter.SrcInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP + } + if rule.Filter.OutputInterfaceInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT + } + + for _, matcher := range rule.Matchers { + // Serialize the matcher and add it to the + // entry. + serialized := marshalMatcher(matcher) + nflog("convert to binary: matcher serialized as: %v", serialized) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.NextOffset += uint16(len(serialized)) + entry.TargetOffset += uint16(len(serialized)) + } + + // Serialize and append the target. + serialized := marshalTarget(rule.Target) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.NextOffset += uint16(len(serialized)) + + nflog("convert to binary: adding entry: %+v", entry) + + entries.Size += uint32(entry.NextOffset) + entries.Entrytable = append(entries.Entrytable, entry) + info.NumEntries++ + } + + nflog("convert to binary: finished with an marshalled size of %d", info.Size) + info.Size = entries.Size + return entries, info, nil +} + +func marshalTarget(target stack.Target) []byte { + switch tg := target.(type) { + case stack.AcceptTarget: + return marshalStandardTarget(stack.RuleAccept) + case stack.DropTarget: + return marshalStandardTarget(stack.RuleDrop) + case stack.ErrorTarget: + return marshalErrorTarget(errorTargetName) + case stack.UserChainTarget: + return marshalErrorTarget(tg.Name) + case stack.ReturnTarget: + return marshalStandardTarget(stack.RuleReturn) + case stack.RedirectTarget: + return marshalRedirectTarget(tg) + case JumpTarget: + return marshalJumpTarget(tg) + default: + panic(fmt.Errorf("unknown target of type %T", target)) + } +} + +func marshalStandardTarget(verdict stack.RuleVerdict) []byte { + nflog("convert to binary: marshalling standard target") + + // The target's name will be the empty string. 
+ target := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + Verdict: translateFromStandardVerdict(verdict), + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +func marshalErrorTarget(errorName string) []byte { + // This is an error target named error + target := linux.XTErrorTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTErrorTarget, + }, + } + copy(target.Name[:], errorName) + copy(target.Target.Name[:], errorTargetName) + + ret := make([]byte, 0, linux.SizeOfXTErrorTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +func marshalRedirectTarget(rt stack.RedirectTarget) []byte { + // This is a redirect target named redirect + target := linux.XTRedirectTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTRedirectTarget, + }, + } + copy(target.Target.Name[:], redirectTargetName) + + ret := make([]byte, 0, linux.SizeOfXTRedirectTarget) + target.NfRange.RangeSize = 1 + if rt.RangeProtoSpecified { + target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + } + // Convert port from little endian to big endian. + port := make([]byte, 2) + binary.LittleEndian.PutUint16(port, rt.MinPort) + target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port) + binary.LittleEndian.PutUint16(port, rt.MaxPort) + target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +func marshalJumpTarget(jt JumpTarget) []byte { + nflog("convert to binary: marshalling jump target") + + // The target's name will be the empty string. + target := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + // Verdict is overloaded by the ABI. When positive, it holds + // the jump offset from the start of the table. + Verdict: int32(jt.Offset), + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +// translateFromStandardVerdict translates verdicts the same way as the iptables +// tool. +func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 { + switch verdict { + case stack.RuleAccept: + return -linux.NF_ACCEPT - 1 + case stack.RuleDrop: + return -linux.NF_DROP - 1 + case stack.RuleReturn: + return linux.NF_RETURN + default: + // TODO(gvisor.dev/issue/170): Support Jump. + panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) + } +} + +// translateToStandardTarget translates from the value in a +// linux.XTStandardTarget to an stack.Verdict. +func translateToStandardTarget(val int32) (stack.Target, error) { + // TODO(gvisor.dev/issue/170): Support other verdicts. + switch val { + case -linux.NF_ACCEPT - 1: + return stack.AcceptTarget{}, nil + case -linux.NF_DROP - 1: + return stack.DropTarget{}, nil + case -linux.NF_QUEUE - 1: + return nil, errors.New("unsupported iptables verdict QUEUE") + case linux.NF_RETURN: + return stack.ReturnTarget{}, nil + default: + return nil, fmt.Errorf("unknown iptables verdict %d", val) + } +} + +// SetEntries sets iptables rules for a single table. See +// net/ipv4/netfilter/ip_tables.c:translate_table for reference. +func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { + // Get the basic rules data (struct ipt_replace). 
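Note: the standard-target helpers above overload a single int32 verdict field: built-in verdicts are stored as -(kernel verdict)-1, while non-negative values are byte offsets used for jumps. A minimal round trip of that encoding (stand-alone sketch with the NF_ACCEPT value inlined for illustration, not part of this change):

package main

import "fmt"

// encodeBuiltin mirrors translateFromStandardVerdict: a built-in verdict v is
// stored on the wire as -v-1, leaving non-negative values free to mean
// "jump to this byte offset".
func encodeBuiltin(verdict int32) int32 { return -verdict - 1 }

// decode mirrors the branch in parseTarget: negative means built-in verdict,
// non-negative means jump offset.
func decode(wire int32) (value int32, isJump bool) {
	if wire >= 0 {
		return wire, true
	}
	return -wire - 1, false
}

func main() {
	const nfAccept = 1 // NF_ACCEPT in the kernel ABI
	wire := encodeBuiltin(nfAccept)
	v, jump := decode(wire)
	fmt.Printf("wire=%d verdict=%d jump=%v\n", wire, v, jump) // wire=-2 verdict=1 jump=false

	v, jump = decode(240)
	fmt.Printf("offset=%d jump=%v\n", v, jump) // offset=240 jump=true
}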
+ if len(optVal) < linux.SizeOfIPTReplace { + nflog("optVal has insufficient size for replace %d", len(optVal)) + return syserr.ErrInvalidArgument + } + var replace linux.IPTReplace + replaceBuf := optVal[:linux.SizeOfIPTReplace] + optVal = optVal[linux.SizeOfIPTReplace:] + binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace) + + // TODO(gvisor.dev/issue/170): Support other tables. + var table stack.Table + switch replace.Name.String() { + case stack.TablenameFilter: + table = stack.EmptyFilterTable() + case stack.TablenameNat: + table = stack.EmptyNatTable() + default: + nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) + return syserr.ErrInvalidArgument + } + + nflog("set entries: setting entries in table %q", replace.Name.String()) + + // Convert input into a list of rules and their offsets. + var offset uint32 + // offsets maps rule byte offsets to their position in table.Rules. + offsets := map[uint32]int{} + for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { + nflog("set entries: processing entry at offset %d", offset) + + // Get the struct ipt_entry. + if len(optVal) < linux.SizeOfIPTEntry { + nflog("optVal has insufficient size for entry %d", len(optVal)) + return syserr.ErrInvalidArgument + } + var entry linux.IPTEntry + buf := optVal[:linux.SizeOfIPTEntry] + binary.Unmarshal(buf, usermem.ByteOrder, &entry) + initialOptValLen := len(optVal) + optVal = optVal[linux.SizeOfIPTEntry:] + + if entry.TargetOffset < linux.SizeOfIPTEntry { + nflog("entry has too-small target offset %d", entry.TargetOffset) + return syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): We should support more IPTIP + // filtering fields. + filter, err := filterFromIPTIP(entry.IP) + if err != nil { + nflog("bad iptip: %v", err) + return syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Matchers and targets can specify + // that they only work for certain protocols, hooks, tables. + // Get matchers. + matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry + if len(optVal) < int(matchersSize) { + nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + return syserr.ErrInvalidArgument + } + matchers, err := parseMatchers(filter, optVal[:matchersSize]) + if err != nil { + nflog("failed to parse matchers: %v", err) + return syserr.ErrInvalidArgument + } + optVal = optVal[matchersSize:] + + // Get the target of the rule. + targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return syserr.ErrInvalidArgument + } + target, err := parseTarget(filter, optVal[:targetSize]) + if err != nil { + nflog("failed to parse target: %v", err) + return syserr.ErrInvalidArgument + } + optVal = optVal[targetSize:] + + table.Rules = append(table.Rules, stack.Rule{ + Filter: filter, + Target: target, + Matchers: matchers, + }) + offsets[offset] = int(entryIdx) + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return syserr.ErrInvalidArgument + } + } + + // Go through the list of supported hooks for this table and, for each + // one, set the rule it corresponds to. 
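Note: the parsing loop above identifies rules by their byte offset into the blob and records an offset-to-index map; hook entry points, underflows and jump verdicts all arrive as byte offsets and are resolved through that map. A toy version of the bookkeeping with made-up entry sizes:

package main

import "fmt"

func main() {
	// Each ipt_entry carries its own total length in next_offset, and rules
	// are referred to on the wire by their byte offset from the start of the
	// blob. SetEntries builds an offset-to-index map so hook entry points,
	// underflows and jump verdicts can be resolved to rule indices.
	nextOffsets := []uint32{112, 176, 112, 152} // made-up entry sizes
	offsets := map[uint32]int{}
	var off uint32
	for i, n := range nextOffsets {
		offsets[off] = i
		off += n
	}
	jumpOffset := uint32(288) // a standard-target verdict >= 0
	if idx, ok := offsets[jumpOffset]; ok {
		fmt.Printf("jump to byte offset %d resolves to rule #%d\n", jumpOffset, idx)
	} else {
		fmt.Println("no rule starts at that offset: invalid jump")
	}
}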
+ for hook, _ := range replace.HookEntry { + if table.ValidHooks()&(1<<hook) != 0 { + hk := hookFromLinux(hook) + for offset, ruleIdx := range offsets { + if offset == replace.HookEntry[hook] { + table.BuiltinChains[hk] = ruleIdx + } + if offset == replace.Underflow[hook] { + if !validUnderflow(table.Rules[ruleIdx]) { + nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx) + return syserr.ErrInvalidArgument + } + table.Underflows[hk] = ruleIdx + } + } + if ruleIdx := table.BuiltinChains[hk]; ruleIdx == stack.HookUnset { + nflog("hook %v is unset.", hk) + return syserr.ErrInvalidArgument + } + if ruleIdx := table.Underflows[hk]; ruleIdx == stack.HookUnset { + nflog("underflow %v is unset.", hk) + return syserr.ErrInvalidArgument + } + } + } + + // Add the user chains. + for ruleIdx, rule := range table.Rules { + target, ok := rule.Target.(stack.UserChainTarget) + if !ok { + continue + } + + // We found a user chain. Before inserting it into the table, + // check that: + // - There's some other rule after it. + // - There are no matchers. + if ruleIdx == len(table.Rules)-1 { + nflog("user chain must have a rule or default policy") + return syserr.ErrInvalidArgument + } + if len(table.Rules[ruleIdx].Matchers) != 0 { + nflog("user chain's first node must have no matchers") + return syserr.ErrInvalidArgument + } + table.UserChains[target.Name] = ruleIdx + 1 + } + + // Set each jump to point to the appropriate rule. Right now they hold byte + // offsets. + for ruleIdx, rule := range table.Rules { + jump, ok := rule.Target.(JumpTarget) + if !ok { + continue + } + + // Find the rule corresponding to the jump rule offset. + jumpTo, ok := offsets[jump.Offset] + if !ok { + nflog("failed to find a rule to jump to") + return syserr.ErrInvalidArgument + } + jump.RuleNum = jumpTo + rule.Target = jump + table.Rules[ruleIdx] = rule + } + + // TODO(gvisor.dev/issue/170): Support other chains. + // Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now, + // make sure all other chains point to ACCEPT rules. + for hook, ruleIdx := range table.BuiltinChains { + if hook == stack.Forward || hook == stack.Postrouting { + if !isUnconditionalAccept(table.Rules[ruleIdx]) { + nflog("hook %d is unsupported.", hook) + return syserr.ErrInvalidArgument + } + } + } + + // TODO(gvisor.dev/issue/170): Check the following conditions: + // - There are no loops. + // - There are no chains without an unconditional final rule. + // - There are no chains without an unconditional underflow rule. + + stk.IPTables().ReplaceTable(replace.Name.String(), table) + + return nil +} + +// parseMatchers parses 0 or more matchers from optVal. optVal should contain +// only the matchers. +func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { + nflog("set entries: parsing matchers of size %d", len(optVal)) + var matchers []stack.Matcher + for len(optVal) > 0 { + nflog("set entries: optVal has len %d", len(optVal)) + + // Get the XTEntryMatch. + if len(optVal) < linux.SizeOfXTEntryMatch { + return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal)) + } + var match linux.XTEntryMatch + buf := optVal[:linux.SizeOfXTEntryMatch] + binary.Unmarshal(buf, usermem.ByteOrder, &match) + nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match) + + // Check some invariants. 
+ if match.MatchSize < linux.SizeOfXTEntryMatch { + + return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) + } + if len(optVal) < int(match.MatchSize) { + return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal)) + } + + // Parse the specific matcher. + matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) + if err != nil { + return nil, fmt.Errorf("failed to create matcher: %v", err) + } + matchers = append(matchers, matcher) + + // TODO(gvisor.dev/issue/170): Check the revision field. + optVal = optVal[match.MatchSize:] + } + + if len(optVal) != 0 { + return nil, errors.New("optVal should be exhausted after parsing matchers") + } + + return matchers, nil +} + +// parseTarget parses a target from optVal. optVal should contain only the +// target. +func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) { + nflog("set entries: parsing target of size %d", len(optVal)) + if len(optVal) < linux.SizeOfXTEntryTarget { + return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal)) + } + var target linux.XTEntryTarget + buf := optVal[:linux.SizeOfXTEntryTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &target) + switch target.Name.String() { + case "": + // Standard target. + if len(optVal) != linux.SizeOfXTStandardTarget { + return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal)) + } + var standardTarget linux.XTStandardTarget + buf = optVal[:linux.SizeOfXTStandardTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) + + if standardTarget.Verdict < 0 { + // A Verdict < 0 indicates a non-jump verdict. + return translateToStandardTarget(standardTarget.Verdict) + } + // A verdict >= 0 indicates a jump. + return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil + + case errorTargetName: + // Error target. + if len(optVal) != linux.SizeOfXTErrorTarget { + return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal)) + } + var errorTarget linux.XTErrorTarget + buf = optVal[:linux.SizeOfXTErrorTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget) + + // Error targets are used in 2 cases: + // * An actual error case. These rules have an error + // named errorTargetName. The last entry of the table + // is usually an error case to catch any packets that + // somehow fall through every rule. + // * To mark the start of a user defined chain. These + // rules have an error with the name of the chain. + switch name := errorTarget.Name.String(); name { + case errorTargetName: + nflog("set entries: error target") + return stack.ErrorTarget{}, nil + default: + // User defined chain. + nflog("set entries: user-defined target %q", name) + return stack.UserChainTarget{Name: name}, nil + } + + case redirectTargetName: + // Redirect target. + if len(optVal) < linux.SizeOfXTRedirectTarget { + return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal)) + } + + if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + + var redirectTarget linux.XTRedirectTarget + buf = optVal[:linux.SizeOfXTRedirectTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget) + + // Copy linux.XTRedirectTarget to stack.RedirectTarget. 
+ var target stack.RedirectTarget + nfRange := redirectTarget.NfRange + + // RangeSize should be 1. + if nfRange.RangeSize != 1 { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + + // TODO(gvisor.dev/issue/170): Check if the flags are valid. + // Also check if we need to map ports or IP. + // For now, redirect target only supports destination port change. + // Port range and IP range are not supported yet. + if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + target.RangeProtoSpecified = true + + target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) + target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:]) + + // TODO(gvisor.dev/issue/170): Port range is not supported yet. + if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + + // Convert port from big endian to little endian. + port := make([]byte, 2) + binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort) + target.MinPort = binary.LittleEndian.Uint16(port) + + binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort) + target.MaxPort = binary.LittleEndian.Uint16(port) + return target, nil + } + + // Unknown target. + return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) +} + +func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { + if containsUnsupportedFields(iptip) { + return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + } + if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) + } + if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) + } + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + + return stack.IPHeaderFilter{ + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, + }, nil +} + +func containsUnsupportedFields(iptip linux.IPTIP) bool { + // The following features are supported: + // - Protocol + // - Dst and DstMask + // - Src and SrcMask + // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. + var emptyInterface = [linux.IFNAMSIZ]byte{} + // Disable any supported inverse flags. 
+ inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || + iptip.InputInterfaceMask != emptyInterface || + iptip.Flags != 0 || + iptip.InverseFlags&^inverseMask != 0 +} + +func validUnderflow(rule stack.Rule) bool { + if len(rule.Matchers) != 0 { + return false + } + if rule.Filter != emptyFilter { + return false + } + switch rule.Target.(type) { + case stack.AcceptTarget, stack.DropTarget: + return true + default: + return false + } +} + +func isUnconditionalAccept(rule stack.Rule) bool { + if !validUnderflow(rule) { + return false + } + _, ok := rule.Target.(stack.AcceptTarget) + return ok +} + +func hookFromLinux(hook int) stack.Hook { + switch hook { + case linux.NF_INET_PRE_ROUTING: + return stack.Prerouting + case linux.NF_INET_LOCAL_IN: + return stack.Input + case linux.NF_INET_FORWARD: + return stack.Forward + case linux.NF_INET_LOCAL_OUT: + return stack.Output + case linux.NF_INET_POST_ROUTING: + return stack.Postrouting + } + panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) +} diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go new file mode 100644 index 000000000..1b4e0ad79 --- /dev/null +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -0,0 +1,149 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameOwner = "owner" + +func init() { + registerMatchMaker(ownerMarshaler{}) +} + +// ownerMarshaler implements matchMaker for owner matching. +type ownerMarshaler struct{} + +// name implements matchMaker.name. +func (ownerMarshaler) name() string { + return matcherNameOwner +} + +// marshal implements matchMaker.marshal. +func (ownerMarshaler) marshal(mr stack.Matcher) []byte { + matcher := mr.(*OwnerMatcher) + iptOwnerInfo := linux.IPTOwnerInfo{ + UID: matcher.uid, + GID: matcher.gid, + } + + // Support for UID and GID match. + if matcher.matchUID { + iptOwnerInfo.Match = linux.XT_OWNER_UID + if matcher.invertUID { + iptOwnerInfo.Invert = linux.XT_OWNER_UID + } + } + if matcher.matchGID { + iptOwnerInfo.Match |= linux.XT_OWNER_GID + if matcher.invertGID { + iptOwnerInfo.Invert |= linux.XT_OWNER_GID + } + } + + buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo) + return marshalEntryMatch(matcherNameOwner, binary.Marshal(buf, usermem.ByteOrder, iptOwnerInfo)) +} + +// unmarshal implements matchMaker.unmarshal. +func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { + if len(buf) < linux.SizeOfIPTOwnerInfo { + return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. 
+ var matchData linux.IPTOwnerInfo + binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData) + + var owner OwnerMatcher + owner.uid = matchData.UID + owner.gid = matchData.GID + + // Check flags. + if matchData.Match&linux.XT_OWNER_UID != 0 { + owner.matchUID = true + if matchData.Invert&linux.XT_OWNER_UID != 0 { + owner.invertUID = true + } + } + if matchData.Match&linux.XT_OWNER_GID != 0 { + owner.matchGID = true + if matchData.Invert&linux.XT_OWNER_GID != 0 { + owner.invertGID = true + } + } + + return &owner, nil +} + +type OwnerMatcher struct { + uid uint32 + gid uint32 + matchUID bool + matchGID bool + invertUID bool + invertGID bool +} + +// Name implements Matcher.Name. +func (*OwnerMatcher) Name() string { + return matcherNameOwner +} + +// Match implements Matcher.Match. +func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { + // Support only for OUTPUT chain. + // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. + if hook != stack.Output { + return false, true + } + + // If the packet owner is not set, drop the packet. + if pkt.Owner == nil { + return false, true + } + + var matches bool + // Check for UID match. + if om.matchUID { + if pkt.Owner.UID() == om.uid { + matches = true + } + if matches == om.invertUID { + return false, false + } + } + + // Check for GID match. + if om.matchGID { + matches = false + if pkt.Owner.GID() == om.gid { + matches = true + } + if matches == om.invertGID { + return false, false + } + } + + return true, false +} diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go new file mode 100644 index 000000000..b91ba3ab3 --- /dev/null +++ b/pkg/sentry/socket/netfilter/targets.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// JumpTarget implements stack.Target. +type JumpTarget struct { + // Offset is the byte offset of the rule to jump to. It is used for + // marshaling and unmarshaling. + Offset uint32 + + // RuleNum is the rule to jump to. + RuleNum int +} + +// Action implements stack.Target.Action. +func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { + return stack.RuleJump, jt.RuleNum +} diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go new file mode 100644 index 000000000..4f98ee2d5 --- /dev/null +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -0,0 +1,130 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameTCP = "tcp" + +func init() { + registerMatchMaker(tcpMarshaler{}) +} + +// tcpMarshaler implements matchMaker for TCP matching. +type tcpMarshaler struct{} + +// name implements matchMaker.name. +func (tcpMarshaler) name() string { + return matcherNameTCP +} + +// marshal implements matchMaker.marshal. +func (tcpMarshaler) marshal(mr stack.Matcher) []byte { + matcher := mr.(*TCPMatcher) + xttcp := linux.XTTCP{ + SourcePortStart: matcher.sourcePortStart, + SourcePortEnd: matcher.sourcePortEnd, + DestinationPortStart: matcher.destinationPortStart, + DestinationPortEnd: matcher.destinationPortEnd, + } + buf := make([]byte, 0, linux.SizeOfXTTCP) + return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp)) +} + +// unmarshal implements matchMaker.unmarshal. +func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { + if len(buf) < linux.SizeOfXTTCP { + return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. + var matchData linux.XTTCP + binary.Unmarshal(buf[:linux.SizeOfXTTCP], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed XTTCP: %+v", matchData) + + if matchData.Option != 0 || + matchData.FlagMask != 0 || + matchData.FlagCompare != 0 || + matchData.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported TCP matcher flags set") + } + + if filter.Protocol != header.TCPProtocolNumber { + return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber) + } + + return &TCPMatcher{ + sourcePortStart: matchData.SourcePortStart, + sourcePortEnd: matchData.SourcePortEnd, + destinationPortStart: matchData.DestinationPortStart, + destinationPortEnd: matchData.DestinationPortEnd, + }, nil +} + +// TCPMatcher matches TCP packets and their headers. It implements Matcher. +type TCPMatcher struct { + sourcePortStart uint16 + sourcePortEnd uint16 + destinationPortStart uint16 + destinationPortEnd uint16 +} + +// Name implements Matcher.Name. +func (*TCPMatcher) Name() string { + return matcherNameTCP +} + +// Match implements Matcher.Match. +func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } + + tcpHeader := header.TCP(pkt.TransportHeader) + if len(tcpHeader) < header.TCPMinimumSize { + // There's no valid TCP header here, so we drop the packet immediately. 
+ return false, true + } + + // Check whether the source and destination ports are within the + // matching range. + if sourcePort := tcpHeader.SourcePort(); sourcePort < tm.sourcePortStart || tm.sourcePortEnd < sourcePort { + return false, false + } + if destinationPort := tcpHeader.DestinationPort(); destinationPort < tm.destinationPortStart || tm.destinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go new file mode 100644 index 000000000..3f20fc891 --- /dev/null +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -0,0 +1,129 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameUDP = "udp" + +func init() { + registerMatchMaker(udpMarshaler{}) +} + +// udpMarshaler implements matchMaker for UDP matching. +type udpMarshaler struct{} + +// name implements matchMaker.name. +func (udpMarshaler) name() string { + return matcherNameUDP +} + +// marshal implements matchMaker.marshal. +func (udpMarshaler) marshal(mr stack.Matcher) []byte { + matcher := mr.(*UDPMatcher) + xtudp := linux.XTUDP{ + SourcePortStart: matcher.sourcePortStart, + SourcePortEnd: matcher.sourcePortEnd, + DestinationPortStart: matcher.destinationPortStart, + DestinationPortEnd: matcher.destinationPortEnd, + } + buf := make([]byte, 0, linux.SizeOfXTUDP) + return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp)) +} + +// unmarshal implements matchMaker.unmarshal. +func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { + if len(buf) < linux.SizeOfXTUDP { + return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may exceed what's + // strictly necessary to hold matchData. + var matchData linux.XTUDP + binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed XTUDP: %+v", matchData) + + if matchData.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported UDP matcher inverse flags set") + } + + if filter.Protocol != header.UDPProtocolNumber { + return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) + } + + return &UDPMatcher{ + sourcePortStart: matchData.SourcePortStart, + sourcePortEnd: matchData.SourcePortEnd, + destinationPortStart: matchData.DestinationPortStart, + destinationPortEnd: matchData.DestinationPortEnd, + }, nil +} + +// UDPMatcher matches UDP packets and their headers. It implements Matcher. +type UDPMatcher struct { + sourcePortStart uint16 + sourcePortEnd uint16 + destinationPortStart uint16 + destinationPortEnd uint16 +} + +// Name implements Matcher.Name. 
+func (*UDPMatcher) Name() string { + return matcherNameUDP +} + +// Match implements Matcher.Match. +func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the stack.Check codepath as matchers are added. + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false + } + + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } + + udpHeader := header.UDP(pkt.TransportHeader) + if len(udpHeader) < header.UDPMinimumSize { + // There's no valid UDP header here, so we drop the packet immediately. + return false, true + } + + // Check whether the source and destination ports are within the + // matching range. + if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort { + return false, false + } + if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD new file mode 100644 index 000000000..d5ca3ac56 --- /dev/null +++ b/pkg/sentry/socket/netlink/BUILD @@ -0,0 +1,52 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "netlink", + srcs = [ + "message.go", + "provider.go", + "provider_vfs2.go", + "socket.go", + "socket_vfs2.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "netlink_test", + size = "small", + srcs = [ + "message_test.go", + ], + deps = [ + ":netlink", + "//pkg/abi/linux", + ], +) diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go new file mode 100644 index 000000000..0899c61d1 --- /dev/null +++ b/pkg/sentry/socket/netlink/message.go @@ -0,0 +1,281 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "fmt" + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/usermem" +) + +// alignPad returns the length of padding required for alignment. +// +// Preconditions: align is a power of two.
+func alignPad(length int, align uint) int { + return binary.AlignUp(length, align) - length +} + +// Message contains a complete serialized netlink message. +type Message struct { + hdr linux.NetlinkMessageHeader + buf []byte +} + +// NewMessage creates a new Message containing the passed header. +// +// The header length will be updated by Finalize. +func NewMessage(hdr linux.NetlinkMessageHeader) *Message { + return &Message{ + hdr: hdr, + buf: binary.Marshal(nil, usermem.ByteOrder, hdr), + } +} + +// ParseMessage parses the first message seen at buf, returning the rest of the +// buffer. If the message is malformed, ok is false. For the last message, the +// padding check is loose: if there isn't enough padding, the whole buf is +// consumed and ok is set to true. +func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) { + b := BytesView(buf) + + hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize) + if !ok { + return + } + var hdr linux.NetlinkMessageHeader + binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr) + + // Msg portion. + totalMsgLen := int(hdr.Length) + _, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize) + if !ok { + return + } + + // Padding. + numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO) + // Linux permits the last message to be unaligned; just consume all of it. + // Ref: net/netlink/af_netlink.c:netlink_rcv_skb + if numPad > len(b) { + numPad = len(b) + } + _, ok = b.Extract(numPad) + if !ok { + return + } + + return &Message{ + hdr: hdr, + buf: buf[:totalMsgLen], + }, []byte(b), true +} + +// Header returns the header of this message. +func (m *Message) Header() linux.NetlinkMessageHeader { + return m.hdr +} + +// GetData unmarshals the payload message header from this netlink message, and +// returns the attributes portion. +func (m *Message) GetData(msg interface{}) (AttrsView, bool) { + b := BytesView(m.buf) + + _, ok := b.Extract(linux.NetlinkMessageHeaderSize) + if !ok { + return nil, false + } + + size := int(binary.Size(msg)) + msgBytes, ok := b.Extract(size) + if !ok { + return nil, false + } + binary.Unmarshal(msgBytes, usermem.ByteOrder, msg) + + numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO) + // Linux permits the last message to be unaligned; just consume all of it. + // Ref: net/netlink/af_netlink.c:netlink_rcv_skb + if numPad > len(b) { + numPad = len(b) + } + _, ok = b.Extract(numPad) + if !ok { + return nil, false + } + + return AttrsView(b), true +} + +// Finalize returns the []byte containing the entire message, with the total +// length set in the message header. The Message must not be modified after +// calling Finalize. +func (m *Message) Finalize() []byte { + // Update length, which is the first 4 bytes of the header. + usermem.ByteOrder.PutUint32(m.buf, uint32(len(m.buf))) + + // Align the message. Note that the message length in the header (set + // above) is the useful length of the message, not the total aligned + // length. See net/netlink/af_netlink.c:__nlmsg_put. + aligned := binary.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO) + m.putZeros(aligned - len(m.buf)) + return m.buf +} + +// putZeros adds n zeros to the message. +func (m *Message) putZeros(n int) { + for n > 0 { + m.buf = append(m.buf, 0) + n-- + } +} + +// Put serializes v into the message. +func (m *Message) Put(v interface{}) { + m.buf = binary.Marshal(m.buf, usermem.ByteOrder, v) +} + +// PutAttr adds v to the message as a netlink attribute.
+// +// Preconditions: The serialized attribute (linux.NetlinkAttrHeaderSize + +// binary.Size(v) fits in math.MaxUint16 bytes. +func (m *Message) PutAttr(atype uint16, v interface{}) { + l := linux.NetlinkAttrHeaderSize + int(binary.Size(v)) + if l > math.MaxUint16 { + panic(fmt.Sprintf("attribute too large: %d", l)) + } + + m.Put(linux.NetlinkAttrHeader{ + Type: atype, + Length: uint16(l), + }) + m.Put(v) + + // Align the attribute. + aligned := binary.AlignUp(l, linux.NLA_ALIGNTO) + m.putZeros(aligned - l) +} + +// PutAttrString adds s to the message as a netlink attribute. +func (m *Message) PutAttrString(atype uint16, s string) { + l := linux.NetlinkAttrHeaderSize + len(s) + 1 + m.Put(linux.NetlinkAttrHeader{ + Type: atype, + Length: uint16(l), + }) + + // String + NUL-termination. + m.Put([]byte(s)) + m.putZeros(1) + + // Align the attribute. + aligned := binary.AlignUp(l, linux.NLA_ALIGNTO) + m.putZeros(aligned - l) +} + +// MessageSet contains a series of netlink messages. +type MessageSet struct { + // Multi indicates that this a multi-part message, to be terminated by + // NLMSG_DONE. NLMSG_DONE is sent even if the set contains only one + // Message. + // + // If Multi is set, all added messages will have NLM_F_MULTI set. + Multi bool + + // PortID is the destination port for all messages. + PortID int32 + + // Seq is the sequence counter for all messages in the set. + Seq uint32 + + // Messages contains the messages in the set. + Messages []*Message +} + +// NewMessageSet creates a new MessageSet. +// +// portID is the destination port to set as PortID in all messages. +// +// seq is the sequence counter to set as seq in all messages in the set. +func NewMessageSet(portID int32, seq uint32) *MessageSet { + return &MessageSet{ + PortID: portID, + Seq: seq, + } +} + +// AddMessage adds a new message to the set and returns it for further +// additions. +// +// The passed header will have Seq, PortID and the multi flag set +// automatically. +func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { + hdr.Seq = ms.Seq + hdr.PortID = uint32(ms.PortID) + if ms.Multi { + hdr.Flags |= linux.NLM_F_MULTI + } + + m := NewMessage(hdr) + ms.Messages = append(ms.Messages, m) + return m +} + +// AttrsView is a view into the attributes portion of a netlink message. +type AttrsView []byte + +// Empty returns whether there is no attribute left in v. +func (v AttrsView) Empty() bool { + return len(v) == 0 +} + +// ParseFirst parses first netlink attribute at the beginning of v. +func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) { + b := BytesView(v) + + hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize) + if !ok { + return + } + binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr) + + value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize) + if !ok { + return + } + + _, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO)) + if !ok { + return + } + + return hdr, value, AttrsView(b), ok +} + +// BytesView supports extracting data from a byte slice with bounds checking. +type BytesView []byte + +// Extract removes the first n bytes from v and returns it. If n is out of +// bounds, it returns false. 
+func (v *BytesView) Extract(n int) ([]byte, bool) { + if n < 0 || n > len(*v) { + return nil, false + } + extracted := (*v)[:n] + *v = (*v)[n:] + return extracted, true +} diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go new file mode 100644 index 000000000..ef13d9386 --- /dev/null +++ b/pkg/sentry/socket/netlink/message_test.go @@ -0,0 +1,312 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package message_test + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink" +) + +type dummyNetlinkMsg struct { + Foo uint16 +} + +func TestParseMessage(t *testing.T) { + tests := []struct { + desc string + input []byte + + header linux.NetlinkMessageHeader + dataMsg *dummyNetlinkMsg + restLen int + ok bool + }{ + { + desc: "valid", + input: []byte{ + 0x14, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + header: linux.NetlinkMessageHeader{ + Length: 20, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "valid with next message", + input: []byte{ + 0x14, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + 0xFF, // Next message (rest) + }, + header: linux.NetlinkMessageHeader{ + Length: 20, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 1, + ok: true, + }, + { + desc: "valid for last message without padding", + input: []byte{ + 0x12, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, // Data message + }, + header: linux.NetlinkMessageHeader{ + Length: 18, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "valid for last message not to be aligned", + input: []byte{ + 0x13, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, // Data message + 0x00, // Excessive 1 byte permitted at end + }, + header: linux.NetlinkMessageHeader{ + Length: 19, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "header.Length too short", + input: []byte{ + 0x04, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + ok: false, + }, + { + desc: 
"header.Length too long", + input: []byte{ + 0xFF, 0xFF, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + ok: false, + }, + { + desc: "header incomplete", + input: []byte{ + 0x04, 0x00, 0x00, 0x00, // Length + }, + ok: false, + }, + { + desc: "empty message", + input: []byte{}, + ok: false, + }, + } + for _, test := range tests { + msg, rest, ok := netlink.ParseMessage(test.input) + if ok != test.ok { + t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok) + continue + } + if !test.ok { + continue + } + if !reflect.DeepEqual(msg.Header(), test.header) { + t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header) + } + + dataMsg := &dummyNetlinkMsg{} + _, dataOk := msg.GetData(dataMsg) + if !dataOk { + t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk) + } else if !reflect.DeepEqual(dataMsg, test.dataMsg) { + t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg) + } + + if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) { + t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want) + } + } +} + +func TestAttrView(t *testing.T) { + tests := []struct { + desc string + input []byte + + // Outputs for ParseFirst. + hdr linux.NetlinkAttrHeader + value []byte + restLen int + ok bool + + // Outputs for Empty. + isEmpty bool + }{ + { + desc: "valid", + input: []byte{ + 0x06, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding + }, + hdr: linux.NetlinkAttrHeader{ + Length: 6, + Type: 1, + }, + value: []byte{0x30, 0x31}, + restLen: 0, + ok: true, + isEmpty: false, + }, + { + desc: "at alignment", + input: []byte{ + 0x08, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + hdr: linux.NetlinkAttrHeader{ + Length: 8, + Type: 1, + }, + value: []byte{0x30, 0x31, 0x32, 0x33}, + restLen: 0, + ok: true, + isEmpty: false, + }, + { + desc: "at alignment with rest data", + input: []byte{ + 0x08, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + 0xFF, 0xFE, // Rest data + }, + hdr: linux.NetlinkAttrHeader{ + Length: 8, + Type: 1, + }, + value: []byte{0x30, 0x31, 0x32, 0x33}, + restLen: 2, + ok: true, + isEmpty: false, + }, + { + desc: "hdr.Length too long", + input: []byte{ + 0xFF, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + ok: false, + isEmpty: false, + }, + { + desc: "hdr.Length too short", + input: []byte{ + 0x01, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + ok: false, + isEmpty: false, + }, + { + desc: "empty", + input: []byte{}, + ok: false, + isEmpty: true, + }, + } + for _, test := range tests { + attrs := netlink.AttrsView(test.input) + + // Test ParseFirst(). + hdr, value, rest, ok := attrs.ParseFirst() + if ok != test.ok { + t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok) + } else if test.ok { + if !reflect.DeepEqual(hdr, test.hdr) { + t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr) + } + if !bytes.Equal(value, test.value) { + t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value) + } + if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) { + t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest) + } + } + + // Test Empty(). 
+ if got, want := attrs.Empty(), test.isEmpty; got != want { + t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want) + } + } +} diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD new file mode 100644 index 000000000..3a22923d8 --- /dev/null +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "port", + srcs = ["port.go"], + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sync"], +) + +go_test( + name = "port_test", + srcs = ["port_test.go"], + library = ":port", +) diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go new file mode 100644 index 000000000..2cd3afc22 --- /dev/null +++ b/pkg/sentry/socket/netlink/port/port.go @@ -0,0 +1,117 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package port provides port ID allocation for netlink sockets. +// +// A netlink port is any int32 value. Positive ports are typically equivalent +// to the PID of the binding process. If that port is unavailable, negative +// ports are searched to find a free port that will not conflict with other +// PIDS. +package port + +import ( + "fmt" + "math" + "math/rand" + + "gvisor.dev/gvisor/pkg/sync" +) + +// maxPorts is a sanity limit on the maximum number of ports to allocate per +// protocol. +const maxPorts = 10000 + +// Manager allocates netlink port IDs. +// +// +stateify savable +type Manager struct { + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // ports contains a map of allocated ports for each protocol. + ports map[int]map[int32]struct{} +} + +// New creates a new Manager. +func New() *Manager { + return &Manager{ + ports: make(map[int]map[int32]struct{}), + } +} + +// Allocate reserves a new port ID for protocol. hint will be taken if +// available. +func (m *Manager) Allocate(protocol int, hint int32) (int32, bool) { + m.mu.Lock() + defer m.mu.Unlock() + + proto, ok := m.ports[protocol] + if !ok { + proto = make(map[int32]struct{}) + // Port 0 is reserved for the kernel. + proto[0] = struct{}{} + m.ports[protocol] = proto + } + + if len(proto) >= maxPorts { + return 0, false + } + + if _, ok := proto[hint]; !ok { + // Hint is available, reserve it. + proto[hint] = struct{}{} + return hint, true + } + + // Search for any free port in [math.MinInt32, -4096). The positive + // port space is left open for pid-based allocations. This behavior is + // consistent with Linux. + start := int32(math.MinInt32 + rand.Int63n(math.MaxInt32-4096+1)) + curr := start + for { + if _, ok := proto[curr]; !ok { + proto[curr] = struct{}{} + return curr, true + } + + curr-- + if curr >= -4096 { + curr = -4097 + } + if curr == start { + // Nothing found. We should always find a free port + // because maxPorts < -4096 - MinInt32. 
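+ // At most maxPorts (10000) ports are ever allocated per protocol, + // while the negative search space below -4096 holds roughly + // 2^31 - 4096 values, so the wrap-around scan must hit a free + // port before it returns to start.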
+ panic(fmt.Sprintf("No free port found in %+v", proto)) + } + } +} + +// Release frees the specified port for protocol. +// +// Preconditions: port is already allocated. +func (m *Manager) Release(protocol int, port int32) { + m.mu.Lock() + defer m.mu.Unlock() + + proto, ok := m.ports[protocol] + if !ok { + panic(fmt.Sprintf("Released port %d for protocol %d which has no allocations", port, protocol)) + } + + if _, ok := proto[port]; !ok { + panic(fmt.Sprintf("Released port %d for protocol %d is not allocated", port, protocol)) + } + + delete(proto, port) +} diff --git a/pkg/sentry/socket/netlink/port/port_test.go b/pkg/sentry/socket/netlink/port/port_test.go new file mode 100644 index 000000000..516f6cd6c --- /dev/null +++ b/pkg/sentry/socket/netlink/port/port_test.go @@ -0,0 +1,82 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package port + +import ( + "testing" +) + +func TestAllocateHint(t *testing.T) { + m := New() + + // We can get the hint port. + p, ok := m.Allocate(0, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p != 1 { + t.Errorf("m.Allocate(0, 1) got %d want 1", p) + } + + // Hint is taken. + p, ok = m.Allocate(0, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p == 1 { + t.Errorf("m.Allocate(0, 1) got 1 want anything else") + } + + // Hint is available for a different protocol. + p, ok = m.Allocate(1, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p != 1 { + t.Errorf("m.Allocate(1, 1) got %d want 1", p) + } + + m.Release(0, 1) + + // Hint is available again after release. + p, ok = m.Allocate(0, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p != 1 { + t.Errorf("m.Allocate(0, 1) got %d want 1", p) + } +} + +func TestAllocateExhausted(t *testing.T) { + m := New() + + // Fill all ports (0 is already reserved). + for i := int32(1); i < maxPorts; i++ { + p, ok := m.Allocate(0, i) + if !ok { + t.Fatalf("m.Allocate got !ok want ok") + } + if p != i { + t.Fatalf("m.Allocate(0, %d) got %d want %d", i, p, i) + } + } + + // Now no more can be allocated. + p, ok := m.Allocate(0, 1) + if ok { + t.Errorf("m.Allocate got %d, ok want !ok", p) + } +} diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go new file mode 100644 index 000000000..0d45e5053 --- /dev/null +++ b/pkg/sentry/socket/netlink/provider.go @@ -0,0 +1,116 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package netlink + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/syserr" +) + +// Protocol is the implementation of a netlink socket protocol. +type Protocol interface { + // Protocol returns the Linux netlink protocol value. + Protocol() int + + // CanSend returns true if this protocol may ever send messages. + // + // TODO(gvisor.dev/issue/1119): This is a workaround to allow + // advertising support for otherwise unimplemented features on sockets + // that will never send messages, thus making those features no-ops. + CanSend() bool + + // ProcessMessage processes a single message from userspace. + // + // If err == nil, any messages added to ms will be sent back to the + // other end of the socket. Setting ms.Multi will cause an NLMSG_DONE + // message to be sent even if ms contains no messages. + ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error +} + +// Provider is a function that creates a new Protocol for a specific netlink +// protocol. +// +// Note that this is distinct from socket.Provider, which is used for all +// socket families. +type Provider func(t *kernel.Task) (Protocol, *syserr.Error) + +// protocols holds a map of all known address protocols and their provider. +var protocols = make(map[int]Provider) + +// RegisterProvider registers the provider of a given address protocol so that +// netlink sockets of that type can be created via socket(2). +// +// Preconditions: May only be called before any netlink sockets are created. +func RegisterProvider(protocol int, provider Provider) { + if p, ok := protocols[protocol]; ok { + panic(fmt.Sprintf("Netlink protocol %d already provided by %+v", protocol, p)) + } + + protocols[protocol] = provider +} + +// LINT.IfChange + +// socketProvider implements socket.Provider. +type socketProvider struct { +} + +// Socket implements socket.Provider.Socket. +func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // Netlink sockets must be specified as datagram or raw, but they + // behave the same regardless of type. + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { + return nil, syserr.ErrSocketNotSupported + } + + provider, ok := protocols[protocol] + if !ok { + return nil, syserr.ErrProtocolNotSupported + } + + p, err := provider(t) + if err != nil { + return nil, err + } + + s, err := NewSocket(t, stype, p) + if err != nil { + return nil, err + } + + d := socket.NewDirent(t, netlinkSocketDevice) + defer d.DecRef() + return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil +} + +// Pair implements socket.Provider.Pair by returning an error. +func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { + // Netlink sockets never supports creating socket pairs. + return nil, nil, syserr.ErrNotSupported +} + +// LINT.ThenChange(./provider_vfs2.go) + +// init registers the socket provider. 
+func init() { + socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{}) + socket.RegisterProviderVFS2(linux.AF_NETLINK, &socketProviderVFS2{}) +} diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go new file mode 100644 index 000000000..bb205be0d --- /dev/null +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -0,0 +1,69 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" +) + +// socketProviderVFS2 implements socket.Provider. +type socketProviderVFS2 struct { +} + +// Socket implements socket.Provider.Socket. +func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Netlink sockets must be specified as datagram or raw, but they + // behave the same regardless of type. + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { + return nil, syserr.ErrSocketNotSupported + } + + provider, ok := protocols[protocol] + if !ok { + return nil, syserr.ErrProtocolNotSupported + } + + p, err := provider(t) + if err != nil { + return nil, err + } + + s, err := NewVFS2(t, stype, p) + if err != nil { + return nil, err + } + + vfsfd := &s.vfsfd + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Pair implements socket.Provider.Pair by returning an error. +func (*socketProviderVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Netlink sockets never supports creating socket pairs. + return nil, nil, syserr.ErrNotSupported +} diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD new file mode 100644 index 000000000..93127398d --- /dev/null +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "route", + srcs = [ + "protocol.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/socket/netlink", + "//pkg/syserr", + ], +) diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go new file mode 100644 index 000000000..c84d8bd7c --- /dev/null +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -0,0 +1,498 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package route provides a NETLINK_ROUTE socket protocol. +package route + +import ( + "bytes" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + "gvisor.dev/gvisor/pkg/syserr" +) + +// commandKind describes the operational class of a message type. +// +// The route message types use the lower 2 bits of the type to describe class +// of command. +type commandKind int + +const ( + kindNew commandKind = 0x0 + kindDel = 0x1 + kindGet = 0x2 + kindSet = 0x3 +) + +func typeKind(typ uint16) commandKind { + return commandKind(typ & 0x3) +} + +// Protocol implements netlink.Protocol. +// +// +stateify savable +type Protocol struct{} + +var _ netlink.Protocol = (*Protocol)(nil) + +// NewProtocol creates a NETLINK_ROUTE netlink.Protocol. +func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { + return &Protocol{}, nil +} + +// Protocol implements netlink.Protocol.Protocol. +func (p *Protocol) Protocol() int { + return linux.NETLINK_ROUTE +} + +// CanSend implements netlink.Protocol.CanSend. +func (p *Protocol) CanSend() bool { + return true +} + +// dumpLinks handles RTM_GETLINK dump requests. +func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an + // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some + // userspace applications (including glibc) still include rtgenmsg. + // Linux has a workaround based on the total message length. + // + // We don't bother to check for either, since we don't support any + // extra attributes that may be included anyways. + // + // The message may also contain netlink attribute IFLA_EXT_MASK, which + // we don't support. + + // The RTM_GETLINK dump response is a set of messages each containing + // an InterfaceInfoMessage followed by a set of netlink attributes. + + // We always send back an NLMSG_DONE. + ms.Multi = true + + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. + return nil + } + + for idx, i := range stack.Interfaces() { + addNewLinkMessage(ms, idx, i) + } + + return nil +} + +// getLinks handles RTM_GETLINK requests. +func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. + return nil + } + + // Parse message. + var ifi linux.InterfaceInfoMessage + attrs, ok := msg.GetData(&ifi) + if !ok { + return syserr.ErrInvalidArgument + } + + // Parse attributes. 
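+ // The attribute list is walked with AttrsView.ParseFirst; only IFLA_IFNAME + // is honored, and its trailing NUL byte is stripped so the interface can + // be looked up by name as well as by index below.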
+ var byName []byte + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument + } + attrs = rest + + switch ahdr.Type { + case linux.IFLA_IFNAME: + if len(value) < 1 { + return syserr.ErrInvalidArgument + } + byName = value[:len(value)-1] + + // TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK. + } + } + + found := false + for idx, i := range stack.Interfaces() { + switch { + case ifi.Index > 0: + if idx != ifi.Index { + continue + } + case byName != nil: + if string(byName) != i.Name { + continue + } + default: + // Criteria not specified. + return syserr.ErrInvalidArgument + } + + addNewLinkMessage(ms, idx, i) + found = true + break + } + if !found { + return syserr.ErrNoDevice + } + return nil +} + +// addNewLinkMessage appends RTM_NEWLINK message for the given interface into +// the message set. +func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWLINK, + }) + + m.Put(linux.InterfaceInfoMessage{ + Family: linux.AF_UNSPEC, + Type: i.DeviceType, + Index: idx, + Flags: i.Flags, + }) + + m.PutAttrString(linux.IFLA_IFNAME, i.Name) + m.PutAttr(linux.IFLA_MTU, i.MTU) + + mac := make([]byte, 6) + brd := mac + if len(i.Addr) > 0 { + mac = i.Addr + brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) + } + m.PutAttr(linux.IFLA_ADDRESS, mac) + m.PutAttr(linux.IFLA_BROADCAST, brd) + + // TODO(gvisor.dev/issue/578): There are many more attributes. +} + +// dumpAddrs handles RTM_GETADDR dump requests. +func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + // RTM_GETADDR dump requests need not contain anything more than the + // netlink header and 1 byte protocol family common to all + // NETLINK_ROUTE requests. + // + // TODO(b/68878065): Filter output by passed protocol family. + + // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each + // containing an InterfaceAddrMessage followed by a set of netlink + // attributes. + + // We always send back an NLMSG_DONE. + ms.Multi = true + + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. + return nil + } + + for id, as := range stack.InterfaceAddrs() { + for _, a := range as { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWADDR, + }) + + m.Put(linux.InterfaceAddrMessage{ + Family: a.Family, + PrefixLen: a.PrefixLen, + Index: uint32(id), + }) + + m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr)) + m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) + + // TODO(gvisor.dev/issue/578): There are many more attributes. + } + } + + return nil +} + +// commonPrefixLen reports the length of the longest IP address prefix. +// This is a simplied version from Golang's src/net/addrselect.go. +func commonPrefixLen(a, b []byte) (cpl int) { + for len(a) > 0 { + if a[0] == b[0] { + cpl += 8 + a = a[1:] + b = b[1:] + continue + } + bits := 8 + ab, bb := a[0], b[0] + for { + ab >>= 1 + bb >>= 1 + bits-- + if ab == bb { + cpl += bits + return + } + } + } + return +} + +// fillRoute returns the Route using LPM algorithm. Refer to Linux's +// net/ipv4/route.c:rt_fill_info(). +func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) { + family := uint8(linux.AF_INET) + if len(addr) != 4 { + family = linux.AF_INET6 + } + + idx := -1 // Index of the Route rule to be returned. + idxDef := -1 // Index of the default route rule. + prefix := 0 // Current longest prefix. 
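+ // Walk every route of the requested family and keep the one whose + // destination prefix is the longest match for addr; a gateway route with + // DstLen == 0 is remembered separately as the default-route fallback.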
+ for i, route := range routes { + if route.Family != family { + continue + } + + if len(route.GatewayAddr) > 0 && route.DstLen == 0 { + idxDef = i + continue + } + + cpl := commonPrefixLen(addr, route.DstAddr) + if cpl < int(route.DstLen) { + continue + } + cpl = int(route.DstLen) + if cpl > prefix { + idx = i + prefix = cpl + } + } + if idx == -1 { + idx = idxDef + } + if idx == -1 { + return inet.Route{}, syserr.ErrNoRoute + } + + route := routes[idx] + if family == linux.AF_INET { + route.DstLen = 32 + } else { + route.DstLen = 128 + } + route.DstAddr = addr + route.Flags |= linux.RTM_F_CLONED // This route is cloned. + return route, nil +} + +// parseForDestination parses a message as format of RouteMessage-RtAttr-dst. +func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) { + var rtMsg linux.RouteMessage + attrs, ok := msg.GetData(&rtMsg) + if !ok { + return nil, syserr.ErrInvalidArgument + } + // iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See + // commit bc234301af12. Note we don't check this flag for backward + // compatibility. + if rtMsg.Flags != 0 && rtMsg.Flags != linux.RTM_F_LOOKUP_TABLE { + return nil, syserr.ErrNotSupported + } + + // Expect first attribute is RTA_DST. + if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST { + return value, nil + } + return nil, syserr.ErrInvalidArgument +} + +// dumpRoutes handles RTM_GETROUTE requests. +func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + // RTM_GETROUTE dump requests need not contain anything more than the + // netlink header and 1 byte protocol family common to all + // NETLINK_ROUTE requests. + + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network routes. + return nil + } + + hdr := msg.Header() + routeTables := stack.RouteTable() + + if hdr.Flags == linux.NLM_F_REQUEST { + dst, err := parseForDestination(msg) + if err != nil { + return err + } + route, err := fillRoute(routeTables, dst) + if err != nil { + // TODO(gvisor.dev/issue/1237): return NLMSG_ERROR with ENETUNREACH. + return syserr.ErrNotSupported + } + routeTables = append([]inet.Route{}, route) + } else if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { + // We always send back an NLMSG_DONE. + ms.Multi = true + } else { + // TODO(b/68878065): Only above cases are supported. + return syserr.ErrNotSupported + } + + for _, rt := range routeTables { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWROUTE, + }) + + m.Put(linux.RouteMessage{ + Family: rt.Family, + DstLen: rt.DstLen, + SrcLen: rt.SrcLen, + TOS: rt.TOS, + + // Always return the main table since we don't have multiple + // routing tables. + Table: linux.RT_TABLE_MAIN, + Protocol: rt.Protocol, + Scope: rt.Scope, + Type: rt.Type, + + Flags: rt.Flags, + }) + + m.PutAttr(254, []byte{123}) + if rt.DstLen > 0 { + m.PutAttr(linux.RTA_DST, rt.DstAddr) + } + if rt.SrcLen > 0 { + m.PutAttr(linux.RTA_SRC, rt.SrcAddr) + } + if rt.OutputInterface != 0 { + m.PutAttr(linux.RTA_OIF, rt.OutputInterface) + } + if len(rt.GatewayAddr) > 0 { + m.PutAttr(linux.RTA_GATEWAY, rt.GatewayAddr) + } + + // TODO(gvisor.dev/issue/578): There are many more attributes. + } + + return nil +} + +// newAddr handles RTM_NEWADDR requests. +func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network stack. 
+ return syserr.ErrProtocolNotSupported + } + + var ifa linux.InterfaceAddrMessage + attrs, ok := msg.GetData(&ifa) + if !ok { + return syserr.ErrInvalidArgument + } + + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument + } + attrs = rest + + switch ahdr.Type { + case linux.IFA_LOCAL: + err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ + Family: ifa.Family, + PrefixLen: ifa.PrefixLen, + Flags: ifa.Flags, + Addr: value, + }) + if err == syscall.EEXIST { + flags := msg.Header().Flags + if flags&linux.NLM_F_EXCL != 0 { + return syserr.ErrExists + } + } else if err != nil { + return syserr.ErrInvalidArgument + } + } + } + return nil +} + +// ProcessMessage implements netlink.Protocol.ProcessMessage. +func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + hdr := msg.Header() + + // All messages start with a 1 byte protocol family. + var family uint8 + if _, ok := msg.GetData(&family); !ok { + // Linux ignores messages missing the protocol family. See + // net/core/rtnetlink.c:rtnetlink_rcv_msg. + return nil + } + + // Non-GET message types require CAP_NET_ADMIN. + if typeKind(hdr.Type) != kindGet { + creds := auth.CredentialsFromContext(ctx) + if !creds.HasCapability(linux.CAP_NET_ADMIN) { + return syserr.ErrPermissionDenied + } + } + + if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { + // TODO(b/68878065): Only the dump variant of the types below are + // supported. + switch hdr.Type { + case linux.RTM_GETLINK: + return p.dumpLinks(ctx, msg, ms) + case linux.RTM_GETADDR: + return p.dumpAddrs(ctx, msg, ms) + case linux.RTM_GETROUTE: + return p.dumpRoutes(ctx, msg, ms) + default: + return syserr.ErrNotSupported + } + } else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST { + switch hdr.Type { + case linux.RTM_GETLINK: + return p.getLink(ctx, msg, ms) + case linux.RTM_GETROUTE: + return p.dumpRoutes(ctx, msg, ms) + case linux.RTM_NEWADDR: + return p.newAddr(ctx, msg, ms) + default: + return syserr.ErrNotSupported + } + } + return syserr.ErrNotSupported +} + +// init registers the NETLINK_ROUTE provider. +func init() { + netlink.RegisterProvider(linux.NETLINK_ROUTE, NewProtocol) +} diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go new file mode 100644 index 000000000..81f34c5a2 --- /dev/null +++ b/pkg/sentry/socket/netlink/socket.go @@ -0,0 +1,780 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package netlink provides core functionality for netlink sockets. 
+package netlink + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/device" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const sizeOfInt32 int = 4 + +const ( + // minBufferSize is the smallest size of a send buffer. + minSendBufferSize = 4 << 10 // 4096 bytes. + + // defaultSendBufferSize is the default size for the send buffer. + defaultSendBufferSize = 16 * 1024 + + // maxBufferSize is the largest size a send buffer can grow to. + maxSendBufferSize = 4 << 20 // 4MB +) + +var errNoFilter = syserr.New("no filter attached", linux.ENOENT) + +// netlinkSocketDevice is the netlink socket virtual device. +var netlinkSocketDevice = device.NewAnonDevice() + +// LINT.IfChange + +// Socket is the base socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// Socket implements socket.Socket and transport.Credentialer. +// +// +stateify savable +type Socket struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + socket.SendReceiveTimeout + + // ports provides netlink port allocation. + ports *port.Manager + + // protocol is the netlink protocol implementation. + protocol Protocol + + // skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for + // netlink sockets. + skType linux.SockType + + // ep is a datagram unix endpoint used to buffer messages sent from the + // kernel to userspace. RecvMsg reads messages from this endpoint. + ep transport.Endpoint + + // connection is the kernel's connection to ep, used to write messages + // sent to userspace. + connection transport.ConnectedEndpoint + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // bound indicates that portid is valid. + bound bool + + // portID is the port ID allocated for this socket. + portID int32 + + // sendBufferSize is the send buffer "size". We don't actually have a + // fixed buffer but only consume this many bytes. + sendBufferSize uint32 + + // passcred indicates if this socket wants SCM credentials. + passcred bool + + // filter indicates that this socket has a BPF filter "installed". + // + // TODO(gvisor.dev/issue/1119): We don't actually support filtering, + // this is just bookkeeping for tracking add/remove. + filter bool +} + +var _ socket.Socket = (*Socket)(nil) +var _ transport.Credentialer = (*Socket)(nil) + +// NewSocket creates a new Socket. 
+func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := transport.NewConnectionless(t) + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { + ep.Close() + return nil, err + } + + // Create a connection from which the kernel can write messages. + connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) + if err != nil { + ep.Close() + return nil, err + } + + return &Socket{ + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, + }, nil +} + +// Release implements fs.FileOperations.Release. +func (s *socketOpsCommon) Release() { + s.connection.Release() + s.ep.Close() + + if s.bound { + s.ports.Release(s.protocol.Protocol(), s.portID) + } +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { + // ep holds messages to be read and thus handles EventIn readiness. + ready := s.ep.Readiness(mask) + + if mask&waiter.EventOut == waiter.EventOut { + // sendMsg handles messages synchronously and is thus always + // ready for writing. + ready |= waiter.EventOut + } + + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.ep.EventRegister(e, mask) + // Writable readiness never changes, so no registration is needed. +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { + s.ep.EventUnregister(e) +} + +// Passcred implements transport.Credentialer.Passcred. +func (s *socketOpsCommon) Passcred() bool { + s.mu.Lock() + passcred := s.passcred + s.mu.Unlock() + return passcred +} + +// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. +func (s *socketOpsCommon) ConnectedPasscred() bool { + // This socket is connected to the kernel, which doesn't need creds. + // + // This is arbitrary, as ConnectedPasscred on this type has no callers. + return false +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { + // TODO(b/68878065): no ioctls supported. + return 0, syserror.ENOTTY +} + +// ExtractSockAddr extracts the SockAddrNetlink from b. +func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { + if len(b) < linux.SockAddrNetlinkSize { + return nil, syserr.ErrBadAddress + } + + var sa linux.SockAddrNetlink + binary.Unmarshal(b[:linux.SockAddrNetlinkSize], usermem.ByteOrder, &sa) + + if sa.Family != linux.AF_NETLINK { + return nil, syserr.ErrInvalidArgument + } + + return &sa, nil +} + +// bindPort binds this socket to a port, preferring 'port' if it is available. +// +// port of 0 defaults to the ThreadGroup ID. +// +// Preconditions: mu is held. +func (s *socketOpsCommon) bindPort(t *kernel.Task, port int32) *syserr.Error { + if s.bound { + // Re-binding is only allowed if the port doesn't change. 
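+ // A second bind(2) to a different port therefore fails with EINVAL, + // while re-binding to the port already held is a no-op.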
+ if port != s.portID { + return syserr.ErrInvalidArgument + } + + return nil + } + + if port == 0 { + port = int32(t.ThreadGroup().ID()) + } + port, ok := s.ports.Allocate(s.protocol.Protocol(), port) + if !ok { + return syserr.ErrBusy + } + + s.portID = port + s.bound = true + return nil +} + +// Bind implements socket.Socket.Bind. +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + a, err := ExtractSockAddr(sockaddr) + if err != nil { + return err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + return s.bindPort(t, int32(a.PortID)) +} + +// Connect implements socket.Socket.Connect. +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + a, err := ExtractSockAddr(sockaddr) + if err != nil { + return err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + if a.PortID == 0 { + // Netlink sockets default to connected to the kernel, but + // connecting anyways automatically binds if not already bound. + if !s.bound { + // Pass port 0 to get an auto-selected port ID. + return s.bindPort(t, 0) + } + return nil + } + + // We don't support non-kernel destination ports. Linux returns EPERM + // if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so + // we emulate that. + return syserr.ErrPermissionDenied +} + +// Accept implements socket.Socket.Accept. +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Netlink sockets never support accept. + return 0, nil, 0, syserr.ErrNotSupported +} + +// Listen implements socket.Socket.Listen. +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { + // Netlink sockets never support listen. + return syserr.ErrNotSupported +} + +// Shutdown implements socket.Socket.Shutdown. +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { + // Netlink sockets never support shutdown. + return syserr.ErrNotSupported +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + switch level { + case linux.SOL_SOCKET: + switch name { + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + s.mu.Lock() + defer s.mu.Unlock() + return int32(s.sendBufferSize), nil + + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + // We don't have limit on receiving size. + return int32(math.MaxInt32), nil + + case linux.SO_PASSCRED: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + var passcred int32 + if s.Passcred() { + passcred = 1 + } + return passcred, nil + + default: + socket.GetSockOptEmitUnimplementedEvent(t, name) + } + + case linux.SOL_NETLINK: + switch name { + case linux.NETLINK_BROADCAST_ERROR, + linux.NETLINK_CAP_ACK, + linux.NETLINK_DUMP_STRICT_CHK, + linux.NETLINK_EXT_ACK, + linux.NETLINK_LIST_MEMBERSHIPS, + linux.NETLINK_NO_ENOBUFS, + linux.NETLINK_PKTINFO: + + t.Kernel().EmitUnimplementedEvent(t) + } + } + // TODO(b/68878065): other sockopts are not supported. + return nil, syserr.ErrProtocolNotAvailable +} + +// SetSockOpt implements socket.Socket.SetSockOpt. 
+func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
+	switch level {
+	case linux.SOL_SOCKET:
+		switch name {
+		case linux.SO_SNDBUF:
+			if len(opt) < sizeOfInt32 {
+				return syserr.ErrInvalidArgument
+			}
+			size := usermem.ByteOrder.Uint32(opt)
+			if size < minSendBufferSize {
+				size = minSendBufferSize
+			} else if size > maxSendBufferSize {
+				size = maxSendBufferSize
+			}
+			s.mu.Lock()
+			s.sendBufferSize = size
+			s.mu.Unlock()
+			return nil
+
+		case linux.SO_RCVBUF:
+			if len(opt) < sizeOfInt32 {
+				return syserr.ErrInvalidArgument
+			}
+			// We don't impose a limit on the receive buffer size, so
+			// just accept anything as valid for compatibility.
+			return nil
+
+		case linux.SO_PASSCRED:
+			if len(opt) < sizeOfInt32 {
+				return syserr.ErrInvalidArgument
+			}
+			passcred := usermem.ByteOrder.Uint32(opt)
+
+			s.mu.Lock()
+			s.passcred = passcred != 0
+			s.mu.Unlock()
+			return nil
+
+		case linux.SO_ATTACH_FILTER:
+			// TODO(gvisor.dev/issue/1119): We don't actually
+			// support filtering. If this socket can't ever send
+			// messages, then there is nothing to filter and we can
+			// advertise support. Otherwise, be conservative and
+			// return an error.
+			if s.protocol.CanSend() {
+				socket.SetSockOptEmitUnimplementedEvent(t, name)
+				return syserr.ErrProtocolNotAvailable
+			}
+
+			s.mu.Lock()
+			s.filter = true
+			s.mu.Unlock()
+			return nil
+
+		case linux.SO_DETACH_FILTER:
+			// TODO(gvisor.dev/issue/1119): See above.
+			if s.protocol.CanSend() {
+				socket.SetSockOptEmitUnimplementedEvent(t, name)
+				return syserr.ErrProtocolNotAvailable
+			}
+
+			s.mu.Lock()
+			filter := s.filter
+			s.filter = false
+			s.mu.Unlock()
+
+			if !filter {
+				return errNoFilter
+			}
+
+			return nil
+
+		default:
+			socket.SetSockOptEmitUnimplementedEvent(t, name)
+		}
+
+	case linux.SOL_NETLINK:
+		switch name {
+		case linux.NETLINK_ADD_MEMBERSHIP,
+			linux.NETLINK_BROADCAST_ERROR,
+			linux.NETLINK_CAP_ACK,
+			linux.NETLINK_DROP_MEMBERSHIP,
+			linux.NETLINK_DUMP_STRICT_CHK,
+			linux.NETLINK_EXT_ACK,
+			linux.NETLINK_LISTEN_ALL_NSID,
+			linux.NETLINK_NO_ENOBUFS,
+			linux.NETLINK_PKTINFO:
+
+			t.Kernel().EmitUnimplementedEvent(t)
+		}
+
+	}
+	// TODO(b/68878065): other sockopts are not supported.
+	return syserr.ErrProtocolNotAvailable
+}
+
+// GetSockName implements socket.Socket.GetSockName.
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	sa := &linux.SockAddrNetlink{
+		Family: linux.AF_NETLINK,
+		PortID: uint32(s.portID),
+	}
+	return sa, uint32(binary.Size(sa)), nil
+}
+
+// GetPeerName implements socket.Socket.GetPeerName.
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+	sa := &linux.SockAddrNetlink{
+		Family: linux.AF_NETLINK,
+		// TODO(b/68878065): Support non-kernel peers. For now the peer
+		// must be the kernel.
+		PortID: 0,
+	}
+	return sa, uint32(binary.Size(sa)), nil
+}
+
+// RecvMsg implements socket.Socket.RecvMsg.
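+//
+// A single datagram is read from the kernel-side connection. If no message
+// is available and MSG_DONTWAIT is not set, the caller blocks until a
+// message arrives or the deadline expires.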
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { + from := &linux.SockAddrNetlink{ + Family: linux.AF_NETLINK, + PortID: 0, + } + fromLen := uint32(binary.Size(from)) + + trunc := flags&linux.MSG_TRUNC != 0 + + r := unix.EndpointReader{ + Ctx: t, + Endpoint: s.ep, + Peek: flags&linux.MSG_PEEK != 0, + } + + doRead := func() (int64, error) { + return dst.CopyOutFrom(t, &r) + } + + // If MSG_TRUNC is set with a zero byte destination then we still need + // to read the message and discard it, or in the case where MSG_PEEK is + // set, leave it be. In both cases the full message length must be + // returned. + if trunc && dst.Addrs.NumBytes() == 0 { + doRead = func() (int64, error) { + err := r.Truncate() + // Always return zero for bytes read since the destination size is + // zero. + return 0, err + } + } + + if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + var mflags int + if n < int64(r.MsgSize) { + mflags |= linux.MSG_TRUNC + } + if trunc { + n = int64(r.MsgSize) + } + return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) + } + + // We'll have to block. Register for notification and keep trying to + // receive all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + if n, err := doRead(); err != syserror.ErrWouldBlock { + var mflags int + if n < int64(r.MsgSize) { + mflags |= linux.MSG_TRUNC + } + if trunc { + n = int64(r.MsgSize) + } + return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + } + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + } +} + +// Read implements fs.FileOperations.Read. +func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// kernelSCM implements control.SCMCredentials with credentials that represent +// the kernel itself rather than a Task. +// +// +stateify savable +type kernelSCM struct{} + +// Equals implements transport.CredentialsControlMessage.Equals. +func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool { + _, ok := oc.(kernelSCM) + return ok +} + +// Credentials implements control.SCMCredentials.Credentials. +func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { + return 0, auth.RootUID, auth.RootGID +} + +// kernelCreds is the concrete version of kernelSCM used in all creds. +var kernelCreds = &kernelSCM{} + +// sendResponse sends the response messages in ms back to userspace. +func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { + // Linux combines multiple netlink messages into a single datagram. + bufs := make([][]byte, 0, len(ms.Messages)) + for _, m := range ms.Messages { + bufs = append(bufs, m.Finalize()) + } + + // All messages are from the kernel. + cms := transport.ControlMessages{ + Credentials: kernelCreds, + } + + if len(bufs) > 0 { + // RecvMsg never receives the address, so we don't need to send + // one. 
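+		// Datagrams sent on the connection land in the receive queue of
+		// the endpoint bound at socket creation, which is where RecvMsg
+		// reads them.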
+		_, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{})
+		// If the buffer is full, we simply drop messages, just like
+		// Linux.
+		if err != nil && err != syserr.ErrWouldBlock {
+			return err
+		}
+		if notify {
+			s.connection.SendNotify()
+		}
+	}
+
+	// N.B. multi-part messages should still send NLMSG_DONE even if
+	// MessageSet contains no messages.
+	//
+	// N.B. NLMSG_DONE is always sent in a different datagram. See
+	// net/netlink/af_netlink.c:netlink_dump.
+	if ms.Multi {
+		m := NewMessage(linux.NetlinkMessageHeader{
+			Type:   linux.NLMSG_DONE,
+			Flags:  linux.NLM_F_MULTI,
+			Seq:    ms.Seq,
+			PortID: uint32(ms.PortID),
+		})
+
+		// Add the dump_done_errno payload.
+		m.Put(int64(0))
+
+		_, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
+		if err != nil && err != syserr.ErrWouldBlock {
+			return err
+		}
+		if notify {
+			s.connection.SendNotify()
+		}
+	}
+
+	return nil
+}
+
+func dumpErrorMessage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  int32(-err.ToLinux().Number()),
+		Header: hdr,
+	})
+}
+
+func dumpAckMessage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  0,
+		Header: hdr,
+	})
+}
+
+// processMessages handles each message in buf, passing it to the protocol
+// handler for final handling.
+func (s *socketOpsCommon) processMessages(ctx context.Context, buf []byte) *syserr.Error {
+	for len(buf) > 0 {
+		msg, rest, ok := ParseMessage(buf)
+		if !ok {
+			// Linux ignores messages that are too short. See
+			// net/netlink/af_netlink.c:netlink_rcv_skb.
+			break
+		}
+		buf = rest
+		hdr := msg.Header()
+
+		// Ignore control messages.
+		if hdr.Type < linux.NLMSG_MIN_TYPE {
+			continue
+		}
+
+		ms := NewMessageSet(s.portID, hdr.Seq)
+		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
+			dumpErrorMessage(hdr, ms, err)
+		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
+			dumpAckMessage(hdr, ms)
+		}
+
+		if err := s.sendResponse(ctx, ms); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// sendMsg is the core of message send, used for SendMsg and Write.
+func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+	dstPort := int32(0)
+
+	if len(to) != 0 {
+		a, err := ExtractSockAddr(to)
+		if err != nil {
+			return 0, err
+		}
+
+		// No support for multicast groups yet.
+		if a.Groups != 0 {
+			return 0, syserr.ErrPermissionDenied
+		}
+
+		dstPort = int32(a.PortID)
+	}
+
+	if dstPort != 0 {
+		// Non-kernel destinations not supported yet. Treat as if
+		// NL_CFG_F_NONROOT_SEND is not set.
+		return 0, syserr.ErrPermissionDenied
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// For simplicity, and consistency with Linux, we copy in the entire
+	// message up front.
+	if src.NumBytes() > int64(s.sendBufferSize) {
+		return 0, syserr.ErrMessageTooLong
+	}
+
+	buf := make([]byte, src.NumBytes())
+	n, err := src.CopyIn(ctx, buf)
+	if err != nil {
+		// Don't partially consume messages.
+		return 0, syserr.FromError(err)
+	}
+
+	if err := s.processMessages(ctx, buf); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
+// SendMsg implements socket.Socket.SendMsg.
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { + return s.sendMsg(t, src, to, flags, controlMessages) +} + +// Write implements fs.FileOperations.Write. +func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) + return int64(n), err.ToError() +} + +// State implements socket.Socket.State. +func (s *socketOpsCommon) State() uint32 { + return s.ep.State() +} + +// Type implements socket.Socket.Type. +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { + return linux.AF_NETLINK, s.skType, s.protocol.Protocol() +} + +// LINT.ThenChange(./socket_vfs2.go) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go new file mode 100644 index 000000000..dbcd8b49a --- /dev/null +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -0,0 +1,152 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 is the base VFS2 socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + + socketOpsCommon +} + +var _ socket.SocketVFS2 = (*SocketVFS2)(nil) +var _ transport.Credentialer = (*SocketVFS2)(nil) + +// NewVFS2 creates a new SocketVFS2. +func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketVFS2, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := transport.NewConnectionless(t) + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { + ep.Close() + return nil, err + } + + // Create a connection from which the kernel can write messages. 
+ connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) + if err != nil { + ep.Close() + return nil, err + } + + fd := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, + } + fd.LockFD.Init(&vfs.FileLocks{}) + return fd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { + // TODO(b/68878065): no ioctls supported. + return 0, syserror.ENOTTY +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) + return int64(n), err.ToError() +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD new file mode 100644 index 000000000..b6434923c --- /dev/null +++ b/pkg/sentry/socket/netlink/uevent/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "uevent", + srcs = ["protocol.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/kernel", + "//pkg/sentry/socket/netlink", + "//pkg/syserr", + ], +) diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go new file mode 100644 index 000000000..029ba21b5 --- /dev/null +++ b/pkg/sentry/socket/netlink/uevent/protocol.go @@ -0,0 +1,60 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package uevent provides a NETLINK_KOBJECT_UEVENT socket protocol. +// +// NETLINK_KOBJECT_UEVENT sockets send udev-style device events. gVisor does +// not support any device events, so these sockets never send any messages. +package uevent + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + "gvisor.dev/gvisor/pkg/syserr" +) + +// Protocol implements netlink.Protocol. +// +// +stateify savable +type Protocol struct{} + +var _ netlink.Protocol = (*Protocol)(nil) + +// NewProtocol creates a NETLINK_KOBJECT_UEVENT netlink.Protocol. +func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { + return &Protocol{}, nil +} + +// Protocol implements netlink.Protocol.Protocol. +func (p *Protocol) Protocol() int { + return linux.NETLINK_KOBJECT_UEVENT +} + +// CanSend implements netlink.Protocol.CanSend. +func (p *Protocol) CanSend() bool { + return false +} + +// ProcessMessage implements netlink.Protocol.ProcessMessage. +func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + // Silently ignore all messages. + return nil +} + +// init registers the NETLINK_KOBJECT_UEVENT provider. 
+func init() { + netlink.RegisterProvider(linux.NETLINK_KOBJECT_UEVENT, NewProtocol) +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD new file mode 100644 index 000000000..ea6ebd0e2 --- /dev/null +++ b/pkg/sentry/socket/netstack/BUILD @@ -0,0 +1,56 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "netstack", + srcs = [ + "device.go", + "netstack.go", + "netstack_vfs2.go", + "provider.go", + "provider_vfs2.go", + "save_restore.go", + "stack.go", + ], + visibility = [ + "//pkg/sentry:internal", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/binary", + "//pkg/context", + "//pkg/log", + "//pkg/metric", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/netfilter", + "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/socket/netstack/device.go b/pkg/sentry/socket/netstack/device.go new file mode 100644 index 000000000..fbeb89fb8 --- /dev/null +++ b/pkg/sentry/socket/netstack/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import "gvisor.dev/gvisor/pkg/sentry/device" + +// netstackDevice is the endpoint socket virtual device. +var netstackDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go new file mode 100644 index 000000000..3b248a953 --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack.go @@ -0,0 +1,3143 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package netstack provides an implementation of the socket.Socket interface +// that is backed by a tcpip.Endpoint. 
+// +// It does not depend on any particular endpoint implementation, and thus can +// be used to expose certain endpoints to the sentry while leaving others out, +// for example, TCP endpoints and Unix-domain endpoints. +// +// Lock ordering: netstack => mm: ioSequencePayload copies user memory inside +// tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during +// this operation. +package netstack + +import ( + "bytes" + "io" + "math" + "reflect" + "sync/atomic" + "syscall" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func mustCreateMetric(name, description string) *tcpip.StatCounter { + var cm tcpip.StatCounter + metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, cm.Value) + return &cm +} + +func mustCreateGauge(name, description string) *tcpip.StatCounter { + var cm tcpip.StatCounter + metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, cm.Value) + return &cm +} + +// Metrics contains metrics exported by netstack. 
+var Metrics = tcpip.Stats{ + UnknownProtocolRcvdPackets: mustCreateMetric("/netstack/unknown_protocol_received_packets", "Number of packets received by netstack that were for an unknown or unsupported protocol."), + MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."), + DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."), + ICMP: tcpip.ICMPStats{ + V4PacketsSent: tcpip.ICMPv4SentPacketStats{ + ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ + Echo: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."), + }, + Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."), + }, + V4PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ + ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ + Echo: mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 destination unreachable packets received by netstack."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."), + Timestamp: 
mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."), + }, + Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."), + }, + V6PacketsSent: tcpip.ICMPv6SentPacketStats{ + ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."), + PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."), + }, + Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."), + }, + V6PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ + ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."), + PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received 
by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."), + }, + Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."), + }, + }, + IP: tcpip.IPStats{ + PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), + InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), + InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Total number of IP packets received with an unknown or invalid source address."), + PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), + PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), + OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), + MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), + MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), + }, + TCP: tcpip.TCPStats{ + ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), + PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), + CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), + CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), + EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), + EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), + EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established 
connection was reset because of keep-alive time out."), + ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), + ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), + ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), + ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), + ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."), + FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), + ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), + InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), + SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), + SegmentSendErrors: mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), + ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), + ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), + Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), + FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), + SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), + SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), + FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), + Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), + ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), + }, + UDP: tcpip.UDPStats{ + PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), + UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), + ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), + MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), + PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), + PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), + ChecksumErrors: 
mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), + }, +} + +// DefaultTTL is linux's default TTL. All network protocols in all stacks used +// with this package must have this value set as their default TTL. +const DefaultTTL = 64 + +const sizeOfInt32 int = 4 + +var errStackType = syserr.New("expected but did not receive a netstack.Stack", linux.EINVAL) + +// ntohs converts a 16-bit number from network byte order to host byte order. It +// assumes that the host is little endian. +func ntohs(v uint16) uint16 { + return v<<8 | v>>8 +} + +// htons converts a 16-bit number from host byte order to network byte order. It +// assumes that the host is little endian. +func htons(v uint16) uint16 { + return ntohs(v) +} + +// commonEndpoint represents the intersection of a tcpip.Endpoint and a +// transport.Endpoint. +type commonEndpoint interface { + // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and + // transport.Endpoint.GetLocalAddress. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and + // transport.Endpoint.GetRemoteAddress. + GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + + // Readiness implements tcpip.Endpoint.Readiness and + // transport.Endpoint.Readiness. + Readiness(mask waiter.EventMask) waiter.EventMask + + // SetSockOpt implements tcpip.Endpoint.SetSockOpt and + // transport.Endpoint.SetSockOpt. + SetSockOpt(interface{}) *tcpip.Error + + // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and + // transport.Endpoint.SetSockOptBool. + SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error + + // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and + // transport.Endpoint.SetSockOptInt. + SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error + + // GetSockOpt implements tcpip.Endpoint.GetSockOpt and + // transport.Endpoint.GetSockOpt. + GetSockOpt(interface{}) *tcpip.Error + + // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and + // transport.Endpoint.GetSockOpt. + GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) + + // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and + // transport.Endpoint.GetSockOpt. + GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) +} + +// LINT.IfChange + +// SocketOperations encapsulates all the state needed to represent a network stack +// endpoint in the kernel context. +// +// +stateify savable +type SocketOperations struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + socket.SendReceiveTimeout + *waiter.Queue + + family int + Endpoint tcpip.Endpoint + skType linux.SockType + protocol int + + // readViewHasData is 1 iff readView has data to be read, 0 otherwise. + // Must be accessed using atomic operations. It must only be written + // with readMu held but can be read without holding readMu. The latter + // is required to avoid deadlocks in epoll Readiness checks. + readViewHasData uint32 + + // readMu protects access to the below fields. + readMu sync.Mutex `state:"nosave"` + // readView contains the remaining payload from the last packet. 
+	readView buffer.View
+	// readCM holds control message information for the last packet read
+	// from Endpoint.
+	readCM tcpip.ControlMessages
+	sender tcpip.FullAddress
+
+	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
+	// of returned messages can be returned via control messages. When
+	// false, the same timestamp is instead stored and can be read via the
+	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
+	sockOptTimestamp bool
+	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
+	// set. It is protected by readMu.
+	timestampValid bool
+	// timestampNS holds the timestamp to use with SIOCGSTAMP. It is only
+	// valid when timestampValid is true. It is protected by readMu.
+	timestampNS int64
+
+	// sockOptInq corresponds to TCP_INQ. It is implemented at this level
+	// because it takes into account data from readView.
+	sockOptInq bool
+}
+
+// New creates a new endpoint socket.
+func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
+	if skType == linux.SOCK_STREAM {
+		if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+	}
+
+	dirent := socket.NewDirent(t, netstackDevice)
+	defer dirent.DecRef()
+	return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{
+		socketOpsCommon: socketOpsCommon{
+			Queue:    queue,
+			family:   family,
+			Endpoint: endpoint,
+			skType:   skType,
+			protocol: protocol,
+		},
+	}), nil
+}
+
+var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{}))
+var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{}))
+var sockAddrLinkSize = int(binary.Size(linux.SockAddrLink{}))
+
+// bytesToIPAddress converts an IPv4 or IPv6 address from the user to the
+// netstack representation, mapping the ANY (all-zeroes) address to the empty
+// string that netstack uses to represent the wildcard address.
+func bytesToIPAddress(addr []byte) tcpip.Address {
+	if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) {
+		return ""
+	}
+	return tcpip.Address(addr)
+}
+
+// AddressAndFamily reads a sockaddr struct from the given address and
+// converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
+// AF_INET6, and AF_PACKET addresses.
+//
+// AddressAndFamily returns an address and its family.
+func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
+	// Make sure we have at least 2 bytes for the address family.
+	if len(addr) < 2 {
+		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
+	}
+
+	// Get the rest of the fields based on the address family.
+	switch family := usermem.ByteOrder.Uint16(addr); family {
+	case linux.AF_UNIX:
+		path := addr[2:]
+		if len(path) > linux.UnixPathMax {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+		// Drop the terminating NUL (if one exists) and everything after
+		// it for filesystem (non-abstract) addresses.
+ if len(path) > 0 && path[0] != 0 { + if n := bytes.IndexByte(path[1:], 0); n >= 0 { + path = path[:n+1] + } + } + return tcpip.FullAddress{ + Addr: tcpip.Address(path), + }, family, nil + + case linux.AF_INET: + var a linux.SockAddrInet + if len(addr) < sockAddrInetSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) + + out := tcpip.FullAddress{ + Addr: bytesToIPAddress(a.Addr[:]), + Port: ntohs(a.Port), + } + return out, family, nil + + case linux.AF_INET6: + var a linux.SockAddrInet6 + if len(addr) < sockAddrInet6Size { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) + + out := tcpip.FullAddress{ + Addr: bytesToIPAddress(a.Addr[:]), + Port: ntohs(a.Port), + } + if isLinkLocal(out.Addr) { + out.NIC = tcpip.NICID(a.Scope_id) + } + return out, family, nil + + case linux.AF_PACKET: + var a linux.SockAddrLink + if len(addr) < sockAddrLinkSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a) + if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + + // TODO(b/129292371): Return protocol too. + return tcpip.FullAddress{ + NIC: tcpip.NICID(a.InterfaceIndex), + Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + }, family, nil + + case linux.AF_UNSPEC: + return tcpip.FullAddress{}, family, nil + + default: + return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported + } +} + +func (s *socketOpsCommon) isPacketBased() bool { + return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW +} + +// fetchReadView updates the readView field of the socket if it's currently +// empty. It assumes that the socket is locked. +// +// Precondition: s.readMu must be held. +func (s *socketOpsCommon) fetchReadView() *syserr.Error { + if len(s.readView) > 0 { + return nil + } + s.readView = nil + s.sender = tcpip.FullAddress{} + + v, cms, err := s.Endpoint.Read(&s.sender) + if err != nil { + atomic.StoreUint32(&s.readViewHasData, 0) + return syserr.TranslateNetstackError(err) + } + + s.readView = v + s.readCM = cms + atomic.StoreUint32(&s.readViewHasData, 1) + + return nil +} + +// Release implements fs.FileOperations.Release. +func (s *socketOpsCommon) Release() { + s.Endpoint.Close() +} + +// Read implements fs.FileOperations.Read. +func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + if err == syserr.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + if err != nil { + return 0, err.ToError() + } + return int64(n), nil +} + +// WriteTo implements fs.FileOperations.WriteTo. +func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { + s.readMu.Lock() + + // Copy as much data as possible. + done := int64(0) + for count > 0 { + // This may return a blocking error. + if err := s.fetchReadView(); err != nil { + s.readMu.Unlock() + return done, err.ToError() + } + + // Write to the underlying file. 
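+		// A short write is fine: the written bytes are trimmed from
+		// readView below, and the unwritten remainder stays in readView
+		// for the next iteration or a later Read.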
+		n, err := dst.Write(s.readView)
+		done += int64(n)
+		count -= int64(n)
+		if dup {
+			// That's all we support for dup. This is generally
+			// sufficient for the Linux system calls that use it;
+			// the expectation is that the caller will now call read
+			// to actually remove these bytes from the socket.
+			break
+		}
+
+		// Drop that part of the view.
+		s.readView.TrimFront(n)
+		if err != nil {
+			s.readMu.Unlock()
+			return done, err
+		}
+	}
+
+	s.readMu.Unlock()
+	return done, nil
+}
+
+// ioSequencePayload implements tcpip.Payload.
+//
+// It copies user memory bytes on demand based on the requested size.
+type ioSequencePayload struct {
+	ctx context.Context
+	src usermem.IOSequence
+}
+
+// FullPayload implements tcpip.Payloader.FullPayload.
+func (i *ioSequencePayload) FullPayload() ([]byte, *tcpip.Error) {
+	return i.Payload(int(i.src.NumBytes()))
+}
+
+// Payload implements tcpip.Payloader.Payload.
+func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
+	if max := int(i.src.NumBytes()); size > max {
+		size = max
+	}
+	v := buffer.NewView(size)
+	if _, err := i.src.CopyIn(i.ctx, v); err != nil {
+		return nil, tcpip.ErrBadAddress
+	}
+	return v, nil
+}
+
+// DropFirst drops the first n bytes from the underlying src.
+func (i *ioSequencePayload) DropFirst(n int) {
+	i.src = i.src.DropFirst(int(n))
+}
+
+// Write implements fs.FileOperations.Write.
+func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+	f := &ioSequencePayload{ctx: ctx, src: src}
+	n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{})
+	if err == tcpip.ErrWouldBlock {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	if resCh != nil {
+		if err := amutex.Block(ctx, resCh); err != nil {
+			return 0, err
+		}
+		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{})
+	}
+
+	if err != nil {
+		return 0, syserr.TranslateNetstackError(err).ToError()
+	}
+
+	if int64(n) < src.NumBytes() {
+		return int64(n), syserror.ErrWouldBlock
+	}
+
+	return int64(n), nil
+}
+
+// readerPayload implements tcpip.Payloader.
+//
+// It allocates a view and reads from a reader on-demand, based on available
+// capacity in the endpoint.
+type readerPayload struct {
+	ctx   context.Context
+	r     io.Reader
+	count int64
+	err   error
+}
+
+// FullPayload implements tcpip.Payloader.FullPayload.
+func (r *readerPayload) FullPayload() ([]byte, *tcpip.Error) {
+	return r.Payload(int(r.count))
+}
+
+// Payload implements tcpip.Payloader.Payload.
+func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) {
+	if size > int(r.count) {
+		size = int(r.count)
+	}
+	v := buffer.NewView(size)
+	n, err := r.r.Read(v)
+	if n > 0 {
+		// We ignore the error here. It may re-occur on subsequent
+		// reads, but for now we can enqueue some amount of data.
+		r.count -= int64(n)
+		return v[:n], nil
+	}
+	if err == syserror.ErrWouldBlock {
+		return nil, tcpip.ErrWouldBlock
+	} else if err != nil {
+		r.err = err // Save for propagation.
+		return nil, tcpip.ErrBadAddress
+	}
+
+	// There is no data and no error. Return an error, which will propagate
+	// r.err, which will be nil. This is the desired result: (0, nil).
+	return nil, tcpip.ErrBadAddress
+}
+
+// ReadFrom implements fs.FileOperations.ReadFrom.
+func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
+	f := &readerPayload{ctx: ctx, r: r, count: count}
+	n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{
+		// Reads may be destructive but should be very fast,
+		// so we can't release the lock while copying data.
+		Atomic: true,
+	})
+	if err == tcpip.ErrWouldBlock {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	if resCh != nil {
+		if err := amutex.Block(ctx, resCh); err != nil {
+			return 0, err
+		}
+		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{
+			Atomic: true, // See above.
+		})
+	}
+	if err == tcpip.ErrWouldBlock {
+		return n, syserror.ErrWouldBlock
+	} else if err != nil {
+		return int64(n), f.err // Propagate error.
+	}
+
+	return int64(n), nil
+}
+
+// Readiness returns a mask of ready events for socket s.
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
+	r := s.Endpoint.Readiness(mask)
+
+	// Check our cached value iff the caller asked for readability and the
+	// endpoint itself is currently not readable.
+	if (mask & ^r & waiter.EventIn) != 0 {
+		if atomic.LoadUint32(&s.readViewHasData) == 1 {
+			r |= waiter.EventIn
+		}
+	}
+
+	return r
+}
+
+func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error {
+	if family == uint16(s.family) {
+		return nil
+	}
+	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
+		v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption)
+		if err != nil {
+			return syserr.TranslateNetstackError(err)
+		}
+		if !v {
+			return nil
+		}
+	}
+	return syserr.ErrInvalidArgument
+}
+
+// mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
+// receiver's family is AF_INET6.
+//
+// This is a hack to work around the fact that both IPv4 and IPv6 ANY are
+// represented by the empty string.
+//
+// TODO(gvisor.dev/issue/1556): remove this function.
+func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
+	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
+		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
+	}
+	return addr
+}
+
+// Connect implements the linux syscall connect(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+	addr, family, err := AddressAndFamily(sockaddr)
+	if err != nil {
+		return err
+	}
+
+	if family == linux.AF_UNSPEC {
+		err := s.Endpoint.Disconnect()
+		if err == tcpip.ErrNotSupported {
+			return syserr.ErrAddressFamilyNotSupported
+		}
+		return syserr.TranslateNetstackError(err)
+	}
+
+	if err := s.checkFamily(family, false /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
+
+	// Always return right away in the non-blocking case.
+	if !blocking {
+		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
+	}
+
+	// Register for notification when the endpoint becomes writable, then
+	// initiate the connection.
+	e, ch := waiter.NewChannelEntry(nil)
+	s.EventRegister(&e, waiter.EventOut)
+	defer s.EventUnregister(&e)
+
+	if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
+		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
+			// TCP, unlike UDP, returns EADDRNOTAVAIL when it can't
+			// find an available local ephemeral port.
+			if err == tcpip.ErrNoPortAvailable {
+				return syserr.ErrAddressNotAvailable
+			}
+		}
+
+		return syserr.TranslateNetstackError(err)
+	}
+
+	// It's pending, so we have to wait for a notification, and fetch the
+	// result once the wait completes.
+	if err := t.Block(ch); err != nil {
+		return syserr.FromError(err)
+	}
+
+	// Call Connect() again after blocking to find connect's result.
+	return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
+}
+
+// Bind implements the linux syscall bind(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
+	}
+
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// AddressAndFamily performs an address length check that is not
+	// needed for an AF_PACKET bind, so handle it separately here.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
+	}
+
+	// Issue the bind request to the endpoint.
+	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
+}
+
+// Listen implements the linux syscall listen(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
+	return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog))
+}
+
+// blockingAccept implements a blocking version of accept(2), that is, if no
+// connections are ready to be accepted, it will block until one becomes ready.
+func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
+	// Register for notifications.
+	e, ch := waiter.NewChannelEntry(nil)
+	s.EventRegister(&e, waiter.EventIn)
+	defer s.EventUnregister(&e)
+
+	// Try to accept the connection again; if it fails, then wait until we
+	// get a notification.
+	for {
+		if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock {
+			return ep, wq, syserr.TranslateNetstackError(err)
+		}
+
+		if err := t.Block(ch); err != nil {
+			return nil, nil, syserr.FromError(err)
+		}
+	}
+}
+
+// Accept implements the linux syscall accept(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+	// Issue the accept request to get the new endpoint.
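+	// If no connection is pending, Accept returns ErrWouldBlock; blocking
+	// sockets then wait for one in blockingAccept below.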
+ ep, wq, terr := s.Endpoint.Accept() + if terr != nil { + if terr != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(terr) + } + + var err *syserr.Error + ep, wq, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + flags := ns.Flags() + flags.NonBlocking = true + ns.SetFlags(flags.Settable()) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer and write it to peer slice. + var err *syserr.Error + addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + + t.Kernel().RecordSocket(ns) + + return fd, addr, addrLen, syserr.FromError(e) +} + +// ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. +func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { + var f tcpip.ShutdownFlags + switch how { + case linux.SHUT_RD: + f = tcpip.ShutdownRead + case linux.SHUT_WR: + f = tcpip.ShutdownWrite + case linux.SHUT_RDWR: + f = tcpip.ShutdownRead | tcpip.ShutdownWrite + default: + return 0, syserr.ErrInvalidArgument + } + return f, nil +} + +// Shutdown implements the linux syscall shutdown(2) for sockets backed by +// tcpip.Endpoint. +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { + f, err := ConvertShutdown(how) + if err != nil { + return err + } + + // Issue shutdown request. + return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketOperations rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. 
+ if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptTimestamp { + val = 1 + } + return val, nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptInq { + val = 1 + } + return val, nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) + if err != nil { + return nil, err + } + return info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return entries, nil + + } + } + + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) +} + +// GetSockOpt can be used to implement the linux syscall getsockopt(2) for +// sockets backed by a commonEndpoint. +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { + switch level { + case linux.SOL_SOCKET: + return getSockOptSocket(t, s, ep, family, skType, name, outLen) + + case linux.SOL_TCP: + return getSockOptTCP(t, ep, name, outLen) + + case linux.SOL_IPV6: + return getSockOptIPv6(t, ep, name, outLen) + + case linux.SOL_IP: + return getSockOptIP(t, ep, name, outLen, family) + + case linux.SOL_UDP, + linux.SOL_ICMPV6, + linux.SOL_RAW, + linux.SOL_PACKET: + + t.Kernel().EmitUnimplementedEvent(t) + } + + return nil, syserr.ErrProtocolNotAvailable +} + +func boolToInt32(v bool) int32 { + if v { + return 1 + } + return 0 +} + +// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { + // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. + switch name { + case linux.SO_ERROR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + // Get the last error and convert it. 
+ err := ep.GetSockOpt(tcpip.ErrorOption{}) + if err == nil { + return int32(0), nil + } + return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil + + case linux.SO_PEERCRED: + if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { + return nil, syserr.ErrInvalidArgument + } + + tcred := t.Credentials() + return syscall.Ucred{ + Pid: int32(t.ThreadGroup().ID()), + Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), + Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), + }, nil + + case linux.SO_PASSCRED: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.PasscredOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + size, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if size > math.MaxInt32 { + size = math.MaxInt32 + } + + return int32(size), nil + + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + size, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if size > math.MaxInt32 { + size = math.MaxInt32 + } + + return int32(size), nil + + case linux.SO_REUSEADDR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.SO_REUSEPORT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReusePortOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.SO_BINDTODEVICE: + var v tcpip.BindToDeviceOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + if v == 0 { + return []byte{}, nil + } + if outLen < linux.IFNAMSIZ { + return nil, syserr.ErrInvalidArgument + } + s := t.NetworkContext() + if s == nil { + return nil, syserr.ErrNoDevice + } + nic, ok := s.Interfaces()[int32(v)] + if !ok { + // The NICID no longer indicates a valid interface, probably because that + // interface was removed. + return nil, syserr.ErrUnknownDevice + } + return append([]byte(nic.Name), 0), nil + + case linux.SO_BROADCAST: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.BroadcastOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.SO_KEEPALIVE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.SO_LINGER: + if outLen < linux.SizeOfLinger { + return nil, syserr.ErrInvalidArgument + } + return linux.Linger{}, nil + + case linux.SO_SNDTIMEO: + // TODO(igudger): Linux allows shorter lengths for partial results. + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } + + return linux.NsecToTimeval(s.SendTimeout()), nil + + case linux.SO_RCVTIMEO: + // TODO(igudger): Linux allows shorter lengths for partial results. 
+ if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } + + return linux.NsecToTimeval(s.RecvTimeout()), nil + + case linux.SO_OOBINLINE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OutOfBandInlineOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.SO_NO_CHECK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.NoChecksumOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + default: + socket.GetSockOptEmitUnimplementedEvent(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptTCP implements GetSockOpt when level is SOL_TCP. +func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.TCP_NODELAY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.DelayOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(!v), nil + + case linux.TCP_CORK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.CorkOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.TCP_QUICKACK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.QuickAckOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.TCP_MAXSEG: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.MaxSegOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.TCP_KEEPIDLE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.KeepaliveIdleOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + + case linux.TCP_KEEPINTVL: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.KeepaliveIntervalOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + + case linux.TCP_KEEPCNT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.TCP_USER_TIMEOUT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPUserTimeoutOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Millisecond), nil + + case linux.TCP_INFO: + var v tcpip.TCPInfoOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // TODO(b/64800844): Translate fields once they are added to + // tcpip.TCPInfoOption. + info := linux.TCPInfo{} + + // Linux truncates the output binary to outLen. 
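+ // For instance, a caller passing an option length of 4 gets back only
+ // the first 4 bytes of the (currently zero-filled) linux.TCPInfo.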
+ ib := binary.Marshal(nil, usermem.ByteOrder, &info) + if len(ib) > outLen { + ib = ib[:outLen] + } + + return ib, nil + + case linux.TCP_CC_INFO, + linux.TCP_NOTSENT_LOWAT, + linux.TCP_ZEROCOPY_RECEIVE: + + t.Kernel().EmitUnimplementedEvent(t) + + case linux.TCP_CONGESTION: + if outLen <= 0 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.CongestionControlOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // We match linux behaviour here where it returns the lower of + // TCP_CA_NAME_MAX bytes or the value of the option length. + // + // This is Linux's net/tcp.h TCP_CA_NAME_MAX. + const tcpCANameMax = 16 + + toCopy := tcpCANameMax + if outLen < tcpCANameMax { + toCopy = outLen + } + b := make([]byte, toCopy) + copy(b, v) + return b, nil + + case linux.TCP_LINGER2: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPLingerTimeoutOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + + case linux.TCP_DEFER_ACCEPT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPDeferAcceptOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + + case linux.TCP_SYNCNT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.TCP_WINDOW_CLAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + default: + emitUnimplementedEventTCP(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. +func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.IPV6_V6ONLY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.V6OnlyOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.IPV6_PATHMTU: + t.Kernel().EmitUnimplementedEvent(t) + + case linux.IPV6_TCLASS: + // Length handling for parity with Linux. + if outLen == 0 { + return make([]byte, 0), nil + } + v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + uintv := uint32(v) + // Linux truncates the output binary to outLen. + ib := binary.Marshal(nil, usermem.ByteOrder, &uintv) + // Handle cases where outLen is lesser than sizeOfInt32. + if len(ib) > outLen { + ib = ib[:outLen] + } + return ib, nil + + case linux.IPV6_RECVTCLASS: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + default: + emitUnimplementedEventIPv6(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptIP implements GetSockOpt when level is SOL_IP. 
+func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) { + switch name { + case linux.IP_TTL: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TTLOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // Fill in the default value, if needed. + if v == 0 { + v = DefaultTTL + } + + return int32(v), nil + + case linux.IP_MULTICAST_TTL: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.IP_MULTICAST_IF: + if outLen < len(linux.InetAddr{}) { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MulticastInterfaceOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) + + return a.(*linux.SockAddrInet).Addr, nil + + case linux.IP_MULTICAST_LOOP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.IP_TOS: + // Length handling for parity with Linux. + if outLen == 0 { + return []byte(nil), nil + } + v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + if outLen < sizeOfInt32 { + return uint8(v), nil + } + return int32(v), nil + + case linux.IP_RECVTOS: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + case linux.IP_PKTINFO: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + + default: + emitUnimplementedEventIP(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketOperations rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. 
+ if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + } + } + + return SetSockOpt(t, s, s.Endpoint, level, name, optVal) +} + +// SetSockOpt can be used to implement the linux syscall setsockopt(2) for +// sockets backed by a commonEndpoint. +func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { + switch level { + case linux.SOL_SOCKET: + return setSockOptSocket(t, s, ep, name, optVal) + + case linux.SOL_TCP: + return setSockOptTCP(t, ep, name, optVal) + + case linux.SOL_IPV6: + return setSockOptIPv6(t, ep, name, optVal) + + case linux.SOL_IP: + return setSockOptIP(t, ep, name, optVal) + + case linux.SOL_UDP, + linux.SOL_ICMPV6, + linux.SOL_RAW, + linux.SOL_PACKET: + + t.Kernel().EmitUnimplementedEvent(t) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 
+func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.SO_SNDBUF: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.SendBufferSizeOption, int(v))) + + case linux.SO_RCVBUF: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, int(v))) + + case linux.SO_REUSEADDR: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0)) + + case linux.SO_REUSEPORT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0)) + + case linux.SO_BINDTODEVICE: + n := bytes.IndexByte(optVal, 0) + if n == -1 { + n = len(optVal) + } + name := string(optVal[:n]) + if name == "" { + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0))) + } + s := t.NetworkContext() + if s == nil { + return syserr.ErrNoDevice + } + for nicID, nic := range s.Interfaces() { + if nic.Name == name { + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID))) + } + } + return syserr.ErrUnknownDevice + + case linux.SO_BROADCAST: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.BroadcastOption, v != 0)) + + case linux.SO_PASSCRED: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0)) + + case linux.SO_KEEPALIVE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0)) + + case linux.SO_SNDTIMEO: + if len(optVal) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument + } + + var v linux.Timeval + binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { + return syserr.ErrDomain + } + s.SetSendTimeout(v.ToNsecCapped()) + return nil + + case linux.SO_RCVTIMEO: + if len(optVal) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument + } + + var v linux.Timeval + binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { + return syserr.ErrDomain + } + s.SetRecvTimeout(v.ToNsecCapped()) + return nil + + case linux.SO_OOBINLINE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + + if v == 0 { + socket.SetSockOptEmitUnimplementedEvent(t, name) + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v))) + + case linux.SO_NO_CHECK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0)) + + case linux.SO_LINGER: + if len(optVal) 
< linux.SizeOfLinger { + return syserr.ErrInvalidArgument + } + + var v linux.Linger + binary.Unmarshal(optVal[:linux.SizeOfLinger], usermem.ByteOrder, &v) + + if v != (linux.Linger{}) { + socket.SetSockOptEmitUnimplementedEvent(t, name) + } + + return nil + + default: + socket.SetSockOptEmitUnimplementedEvent(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptTCP implements SetSockOpt when level is SOL_TCP. +func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.TCP_NODELAY: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0)) + + case linux.TCP_CORK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0)) + + case linux.TCP_QUICKACK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0)) + + case linux.TCP_MAXSEG: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) + + case linux.TCP_KEEPIDLE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + if v < 1 || v > linux.MAX_TCP_KEEPIDLE { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)))) + + case linux.TCP_KEEPINTVL: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + if v < 1 || v > linux.MAX_TCP_KEEPINTVL { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + + case linux.TCP_KEEPCNT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + if v < 1 || v > linux.MAX_TCP_KEEPCNT { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) + + case linux.TCP_USER_TIMEOUT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < 0 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)))) + + case linux.TCP_CONGESTION: + v := tcpip.CongestionControlOption(optVal) + if err := ep.SetSockOpt(v); err != nil { + return syserr.TranslateNetstackError(err) + } + return nil + + case linux.TCP_LINGER2: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)))) + + case linux.TCP_DEFER_ACCEPT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < 0 { + v = 0 + } + return 
syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + + case linux.TCP_SYNCNT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) + + case linux.TCP_WINDOW_CLAMP: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) + + case linux.TCP_REPAIR_OPTIONS: + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEventTCP(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. +func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.IPV6_V6ONLY: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.V6OnlyOption, v != 0)) + + case linux.IPV6_ADD_MEMBERSHIP, + linux.IPV6_DROP_MEMBERSHIP, + linux.IPV6_IPSEC_POLICY, + linux.IPV6_JOIN_ANYCAST, + linux.IPV6_LEAVE_ANYCAST, + // TODO(b/148887420): Add support for IPV6_PKTINFO. + linux.IPV6_PKTINFO, + linux.IPV6_ROUTER_ALERT, + linux.IPV6_XFRM_POLICY, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_JOIN_GROUP, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_UNBLOCK_SOURCE: + + t.Kernel().EmitUnimplementedEvent(t) + + case linux.IPV6_TCLASS: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < -1 || v > 255 { + return syserr.ErrInvalidArgument + } + if v == -1 { + v = 0 + } + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) + + case linux.IPV6_RECVTCLASS: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0)) + + default: + emitUnimplementedEventIPv6(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +var ( + inetMulticastRequestSize = int(binary.Size(linux.InetMulticastRequest{})) + inetMulticastRequestWithNICSize = int(binary.Size(linux.InetMulticastRequestWithNIC{})) +) + +// copyInMulticastRequest copies in a variable-size multicast request. The +// kernel determines which structure was passed by its length. IP_MULTICAST_IF +// supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and +// IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, +// allowAddr controls whether in_addr is accepted or rejected. 
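+//
+// As a rough illustration, the layouts being distinguished here are (sizes per
+// the abi/linux structs used to compute the lengths below; the C names are
+// given only for orientation):
+//
+//	in_addr                                  4 bytes (interface address only)
+//	ip_mreq  (InetMulticastRequest)          8 bytes (multicast + interface address)
+//	ip_mreqn (InetMulticastRequestWithNIC)  12 bytes (addresses + interface index)
+//
+// A 4-byte optVal is therefore accepted only when allowAddr is true
+// (IP_MULTICAST_IF) and is interpreted as the interface address.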
+func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { + if len(optVal) < len(linux.InetAddr{}) { + return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument + } + + if len(optVal) < inetMulticastRequestSize { + if !allowAddr { + return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument + } + + var req linux.InetMulticastRequestWithNIC + copy(req.InterfaceAddr[:], optVal) + return req, nil + } + + if len(optVal) >= inetMulticastRequestWithNICSize { + var req linux.InetMulticastRequestWithNIC + binary.Unmarshal(optVal[:inetMulticastRequestWithNICSize], usermem.ByteOrder, &req) + return req, nil + } + + var req linux.InetMulticastRequestWithNIC + binary.Unmarshal(optVal[:inetMulticastRequestSize], usermem.ByteOrder, &req.InetMulticastRequest) + return req, nil +} + +// parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. +// +// net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. +func parseIntOrChar(buf []byte) (int32, *syserr.Error) { + if len(buf) == 0 { + return 0, syserr.ErrInvalidArgument + } + + if len(buf) >= sizeOfInt32 { + return int32(usermem.ByteOrder.Uint32(buf)), nil + } + + return int32(buf[0]), nil +} + +// setSockOptIP implements SetSockOpt when level is SOL_IP. +func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.IP_MULTICAST_TTL: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + if v == -1 { + // Linux translates -1 to 1. + v = 1 + } + if v < 0 || v > 255 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) + + case linux.IP_ADD_MEMBERSHIP: + req, err := copyInMulticastRequest(optVal, false /* allowAddr */) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ + NIC: tcpip.NICID(req.InterfaceIndex), + // TODO(igudger): Change AddMembership to use the standard + // any address representation. + InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), + MulticastAddr: tcpip.Address(req.MulticastAddr[:]), + })) + + case linux.IP_DROP_MEMBERSHIP: + req, err := copyInMulticastRequest(optVal, false /* allowAddr */) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ + NIC: tcpip.NICID(req.InterfaceIndex), + // TODO(igudger): Change DropMembership to use the standard + // any address representation. + InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), + MulticastAddr: tcpip.Address(req.MulticastAddr[:]), + })) + + case linux.IP_MULTICAST_IF: + req, err := copyInMulticastRequest(optVal, true /* allowAddr */) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{ + NIC: tcpip.NICID(req.InterfaceIndex), + InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), + })) + + case linux.IP_MULTICAST_LOOP: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0)) + + case linux.MCAST_JOIN_GROUP: + // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. + t.Kernel().EmitUnimplementedEvent(t) + return syserr.ErrInvalidArgument + + case linux.IP_TTL: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + // -1 means default TTL. 
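+ // As a point of reference (Linux semantics, noted here as an aside): an
+ // application calling setsockopt(fd, IPPROTO_IP, IP_TTL, &v, ...) with
+ // v == -1 asks for the system default TTL, so a 0 is stored in the
+ // endpoint and the IP_TTL getter above reports DefaultTTL for a stored 0.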
+ if v == -1 { + v = 0 + } else if v < 1 || v > 255 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v))) + + case linux.IP_TOS: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) + + case linux.IP_RECVTOS: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0)) + + case linux.IP_PKTINFO: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0)) + + case linux.IP_HDRINCL: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0)) + + case linux.IP_ADD_SOURCE_MEMBERSHIP, + linux.IP_BIND_ADDRESS_NO_PORT, + linux.IP_BLOCK_SOURCE, + linux.IP_CHECKSUM, + linux.IP_DROP_SOURCE_MEMBERSHIP, + linux.IP_FREEBIND, + linux.IP_IPSEC_POLICY, + linux.IP_MINTTL, + linux.IP_MSFILTER, + linux.IP_MTU_DISCOVER, + linux.IP_MULTICAST_ALL, + linux.IP_NODEFRAG, + linux.IP_OPTIONS, + linux.IP_PASSSEC, + linux.IP_RECVERR, + linux.IP_RECVFRAGSIZE, + linux.IP_RECVOPTS, + linux.IP_RECVORIGDSTADDR, + linux.IP_RECVTTL, + linux.IP_RETOPTS, + linux.IP_TRANSPARENT, + linux.IP_UNBLOCK_SOURCE, + linux.IP_UNICAST_IF, + linux.IP_XFRM_POLICY, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_MSFILTER, + linux.MCAST_UNBLOCK_SOURCE: + + t.Kernel().EmitUnimplementedEvent(t) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// emitUnimplementedEventTCP emits unimplemented event if name is valid. This +// function contains names that are common between Get and SetSockOpt when +// level is SOL_TCP. +func emitUnimplementedEventTCP(t *kernel.Task, name int) { + switch name { + case linux.TCP_CONGESTION, + linux.TCP_CORK, + linux.TCP_FASTOPEN, + linux.TCP_FASTOPEN_CONNECT, + linux.TCP_FASTOPEN_KEY, + linux.TCP_FASTOPEN_NO_COOKIE, + linux.TCP_QUEUE_SEQ, + linux.TCP_REPAIR, + linux.TCP_REPAIR_QUEUE, + linux.TCP_REPAIR_WINDOW, + linux.TCP_SAVED_SYN, + linux.TCP_SAVE_SYN, + linux.TCP_THIN_DUPACK, + linux.TCP_THIN_LINEAR_TIMEOUTS, + linux.TCP_TIMESTAMP, + linux.TCP_ULP: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + +// emitUnimplementedEventIPv6 emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSockOpt when level is +// SOL_IPV6. 
+func emitUnimplementedEventIPv6(t *kernel.Task, name int) { + switch name { + case linux.IPV6_2292DSTOPTS, + linux.IPV6_2292HOPLIMIT, + linux.IPV6_2292HOPOPTS, + linux.IPV6_2292PKTINFO, + linux.IPV6_2292PKTOPTIONS, + linux.IPV6_2292RTHDR, + linux.IPV6_ADDR_PREFERENCES, + linux.IPV6_AUTOFLOWLABEL, + linux.IPV6_DONTFRAG, + linux.IPV6_DSTOPTS, + linux.IPV6_FLOWINFO, + linux.IPV6_FLOWINFO_SEND, + linux.IPV6_FLOWLABEL_MGR, + linux.IPV6_FREEBIND, + linux.IPV6_HOPOPTS, + linux.IPV6_MINHOPCOUNT, + linux.IPV6_MTU, + linux.IPV6_MTU_DISCOVER, + linux.IPV6_MULTICAST_ALL, + linux.IPV6_MULTICAST_HOPS, + linux.IPV6_MULTICAST_IF, + linux.IPV6_MULTICAST_LOOP, + linux.IPV6_RECVDSTOPTS, + linux.IPV6_RECVERR, + linux.IPV6_RECVFRAGSIZE, + linux.IPV6_RECVHOPLIMIT, + linux.IPV6_RECVHOPOPTS, + linux.IPV6_RECVORIGDSTADDR, + linux.IPV6_RECVPATHMTU, + linux.IPV6_RECVPKTINFO, + linux.IPV6_RECVRTHDR, + linux.IPV6_RTHDR, + linux.IPV6_RTHDRDSTOPTS, + linux.IPV6_TCLASS, + linux.IPV6_TRANSPARENT, + linux.IPV6_UNICAST_HOPS, + linux.IPV6_UNICAST_IF, + linux.MCAST_MSFILTER, + linux.IPV6_ADDRFORM: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + +// emitUnimplementedEventIP emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSockOpt when level is +// SOL_IP. +func emitUnimplementedEventIP(t *kernel.Task, name int) { + switch name { + case linux.IP_TOS, + linux.IP_TTL, + linux.IP_HDRINCL, + linux.IP_OPTIONS, + linux.IP_ROUTER_ALERT, + linux.IP_RECVOPTS, + linux.IP_RETOPTS, + linux.IP_PKTINFO, + linux.IP_PKTOPTIONS, + linux.IP_MTU_DISCOVER, + linux.IP_RECVERR, + linux.IP_RECVTTL, + linux.IP_RECVTOS, + linux.IP_MTU, + linux.IP_FREEBIND, + linux.IP_IPSEC_POLICY, + linux.IP_XFRM_POLICY, + linux.IP_PASSSEC, + linux.IP_TRANSPARENT, + linux.IP_ORIGDSTADDR, + linux.IP_MINTTL, + linux.IP_NODEFRAG, + linux.IP_CHECKSUM, + linux.IP_BIND_ADDRESS_NO_PORT, + linux.IP_RECVFRAGSIZE, + linux.IP_MULTICAST_IF, + linux.IP_MULTICAST_TTL, + linux.IP_MULTICAST_LOOP, + linux.IP_ADD_MEMBERSHIP, + linux.IP_DROP_MEMBERSHIP, + linux.IP_UNBLOCK_SOURCE, + linux.IP_BLOCK_SOURCE, + linux.IP_ADD_SOURCE_MEMBERSHIP, + linux.IP_DROP_SOURCE_MEMBERSHIP, + linux.IP_MSFILTER, + linux.MCAST_JOIN_GROUP, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_UNBLOCK_SOURCE, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_MSFILTER, + linux.IP_MULTICAST_ALL, + linux.IP_UNICAST_IF: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + +// isLinkLocal determines if the given IPv6 address is link-local. This is the +// case when it has the fe80::/10 prefix. This check is used to determine when +// the NICID is relevant for a given IPv6 address. +func isLinkLocal(addr tcpip.Address) bool { + return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80 +} + +// ConvertAddress converts the given address to a native format. +func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) { + switch family { + case linux.AF_UNIX: + var out linux.SockAddrUnix + out.Family = linux.AF_UNIX + l := len([]byte(addr.Addr)) + for i := 0; i < l; i++ { + out.Path[i] = int8(addr.Addr[i]) + } + + // Linux returns the used length of the address struct (including the + // null terminator) for filesystem paths. The Family field is 2 bytes. + // It is sometimes allowed to exclude the null terminator if the + // address length is the max. Abstract and empty paths always return + // the full exact length. 
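+ // For example (path chosen only for illustration): a filesystem path
+ // "/tmp/sock" has l == 9, giving 2 (Family) + 9 (path) + 1 (NUL) = 12,
+ // while an abstract name of the same length (leading NUL byte) gives
+ // 2 + 9 = 11 with no terminator added.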
+ if l == 0 || out.Path[0] == 0 || l == len(out.Path) { + return &out, uint32(2 + l) + } + return &out, uint32(3 + l) + + case linux.AF_INET: + var out linux.SockAddrInet + copy(out.Addr[:], addr.Addr) + out.Family = linux.AF_INET + out.Port = htons(addr.Port) + return &out, uint32(sockAddrInetSize) + + case linux.AF_INET6: + var out linux.SockAddrInet6 + if len(addr.Addr) == header.IPv4AddressSize { + // Copy address in v4-mapped format. + copy(out.Addr[12:], addr.Addr) + out.Addr[10] = 0xff + out.Addr[11] = 0xff + } else { + copy(out.Addr[:], addr.Addr) + } + out.Family = linux.AF_INET6 + out.Port = htons(addr.Port) + if isLinkLocal(addr.Addr) { + out.Scope_id = uint32(addr.NIC) + } + return &out, uint32(sockAddrInet6Size) + + case linux.AF_PACKET: + // TODO(b/129292371): Return protocol too. + var out linux.SockAddrLink + out.Family = linux.AF_PACKET + out.InterfaceIndex = int32(addr.NIC) + out.HardwareAddrLen = header.EthernetAddressSize + copy(out.HardwareAddr[:], addr.Addr) + return &out, uint32(sockAddrLinkSize) + + default: + return nil, 0 + } +} + +// GetSockName implements the linux syscall getsockname(2) for sockets backed by +// tcpip.Endpoint. +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { + addr, err := s.Endpoint.GetLocalAddress() + if err != nil { + return nil, 0, syserr.TranslateNetstackError(err) + } + + a, l := ConvertAddress(s.family, addr) + return a, l, nil +} + +// GetPeerName implements the linux syscall getpeername(2) for sockets backed by +// tcpip.Endpoint. +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { + addr, err := s.Endpoint.GetRemoteAddress() + if err != nil { + return nil, 0, syserr.TranslateNetstackError(err) + } + + a, l := ConvertAddress(s.family, addr) + return a, l, nil +} + +// coalescingRead is the fast path for non-blocking, non-peek, stream-based +// case. It coalesces as many packets as possible before returning to the +// caller. +// +// Precondition: s.readMu must be locked. +func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { + var err *syserr.Error + var copied int + + // Copy as many views as possible into the user-provided buffer. + for { + // Always do at least one fetchReadView, even if the number of bytes to + // read is 0. + err = s.fetchReadView() + if err != nil { + break + } + if dst.NumBytes() == 0 { + break + } + + var n int + var e error + if discard { + n = len(s.readView) + if int64(n) > dst.NumBytes() { + n = int(dst.NumBytes()) + } + } else { + n, e = dst.CopyOut(ctx, s.readView) + // Set the control message, even if 0 bytes were read. + if e == nil { + s.updateTimestamp() + } + } + copied += n + s.readView.TrimFront(n) + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) + } + + dst = dst.DropFirst(n) + if e != nil { + err = syserr.FromError(e) + break + } + } + + // If we managed to copy something, we must deliver it. + if copied > 0 { + s.Endpoint.ModerateRecvBuf(copied) + return copied, nil + } + + return 0, err +} + +func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { + if !s.sockOptInq { + return + } + rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if err != nil { + return + } + cmsg.IP.HasInq = true + cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed) +} + +// nonBlockingRead issues a non-blocking read. +// +// TODO(b/78348848): Support timestamps for stream sockets. 
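+//
+// As an illustration of the truncation handling below (datagram case): a
+// 100-byte UDP message read into a 10-byte buffer returns n=10 with MSG_TRUNC
+// set in the returned flags, or n=100 (flag still set) if the caller passed
+// MSG_TRUNC.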
+func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { + isPacket := s.isPacketBased() + + // Fast path for regular reads from stream (e.g., TCP) endpoints. Note + // that senderRequested is ignored for stream sockets. + if !peek && !isPacket { + // TCP sockets discard the data if MSG_TRUNC is set. + // + // This behavior is documented in man 7 tcp: + // Since version 2.4, Linux supports the use of MSG_TRUNC in the flags + // argument of recv(2) (and recvmsg(2)). This flag causes the received + // bytes of data to be discarded, rather than passed back in a + // caller-supplied buffer. + s.readMu.Lock() + n, err := s.coalescingRead(ctx, dst, trunc) + cmsg := s.controlMessages() + s.fillCmsgInq(&cmsg) + s.readMu.Unlock() + return n, 0, nil, 0, cmsg, err + } + + s.readMu.Lock() + defer s.readMu.Unlock() + + if err := s.fetchReadView(); err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, err + } + + if !isPacket && peek && trunc { + // MSG_TRUNC with MSG_PEEK on a TCP socket returns the + // amount that could be read. + rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if err != nil { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) + } + available := len(s.readView) + int(rql) + bufLen := int(dst.NumBytes()) + if available < bufLen { + return available, 0, nil, 0, socket.ControlMessages{}, nil + } + return bufLen, 0, nil, 0, socket.ControlMessages{}, nil + } + + n, err := dst.CopyOut(ctx, s.readView) + // Set the control message, even if 0 bytes were read. + if err == nil { + s.updateTimestamp() + } + var addr linux.SockAddr + var addrLen uint32 + if isPacket && senderRequested { + addr, addrLen = ConvertAddress(s.family, s.sender) + } + + if peek { + if l := len(s.readView); trunc && l > n { + // isPacket must be true. + return l, linux.MSG_TRUNC, addr, addrLen, s.controlMessages(), syserr.FromError(err) + } + + if isPacket || err != nil { + return n, 0, addr, addrLen, s.controlMessages(), syserr.FromError(err) + } + + // We need to peek beyond the first message. + dst = dst.DropFirst(n) + num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) { + n, _, err := s.Endpoint.Peek(dsts) + // TODO(b/78348848): Handle peek timestamp. + if err != nil { + return int64(n), syserr.TranslateNetstackError(err).ToError() + } + return int64(n), nil + }}) + n += int(num) + if err == syserror.ErrWouldBlock && n > 0 { + // We got some data, so no need to return an error. 
+ err = nil + } + return n, 0, nil, 0, s.controlMessages(), syserr.FromError(err) + } + + var msgLen int + if isPacket { + msgLen = len(s.readView) + s.readView = nil + } else { + msgLen = int(n) + s.readView.TrimFront(int(n)) + } + + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) + } + + var flags int + if msgLen > int(n) { + flags |= linux.MSG_TRUNC + } + + if trunc { + n = msgLen + } + + cmsg := s.controlMessages() + s.fillCmsgInq(&cmsg) + return n, flags, addr, addrLen, cmsg, syserr.FromError(err) +} + +func (s *socketOpsCommon) controlMessages() socket.ControlMessages { + return socket.ControlMessages{ + IP: tcpip.ControlMessages{ + HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, + Timestamp: s.readCM.Timestamp, + HasTOS: s.readCM.HasTOS, + TOS: s.readCM.TOS, + HasTClass: s.readCM.HasTClass, + TClass: s.readCM.TClass, + HasIPPacketInfo: s.readCM.HasIPPacketInfo, + PacketInfo: s.readCM.PacketInfo, + }, + } +} + +// updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after +// successfully writing packet data out to userspace. +// +// Precondition: s.readMu must be locked. +func (s *socketOpsCommon) updateTimestamp() { + // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. + if !s.sockOptTimestamp { + s.timestampValid = true + s.timestampNS = s.readCM.Timestamp + } +} + +// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by +// tcpip.Endpoint. +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { + trunc := flags&linux.MSG_TRUNC != 0 + peek := flags&linux.MSG_PEEK != 0 + dontWait := flags&linux.MSG_DONTWAIT != 0 + waitAll := flags&linux.MSG_WAITALL != 0 + if senderRequested && !s.isPacketBased() { + // Stream sockets ignore the sender address. + senderRequested = false + } + n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + + if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { + // In this situation we should return EAGAIN. + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + } + + if err != nil && (err != syserr.ErrWouldBlock || dontWait) { + // Read failed and we should not retry. + return 0, 0, nil, 0, socket.ControlMessages{}, err + } + + if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { + // We got all the data we need. + return + } + + // Don't overwrite any data we received. + dst = dst.DropFirst(n) + + // We'll have to block. Register for notifications and keep trying to + // send all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + var rn int + rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n += rn + if err != nil && err != syserr.ErrWouldBlock { + // Always stop on errors other than would block as we generally + // won't be able to get any more data. Eat the error if we got + // any data. + if n > 0 { + err = nil + } + return + } + if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) { + // We got all the data we need. 
+ return + } + dst = dst.DropFirst(rn) + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if n > 0 { + return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil + } + if err == syserror.ETIMEDOUT { + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + } + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + } +} + +// SendMsg implements the linux syscall sendmsg(2) for sockets backed by +// tcpip.Endpoint. +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { + // Reject Unix control messages. + if !controlMessages.Unix.Empty() { + return 0, syserr.ErrInvalidArgument + } + + var addr *tcpip.FullAddress + if len(to) > 0 { + addrBuf, family, err := AddressAndFamily(to) + if err != nil { + return 0, err + } + if err := s.checkFamily(family, false /* exact */); err != nil { + return 0, err + } + addrBuf = s.mapFamily(addrBuf, family) + + addr = &addrBuf + } + + opts := tcpip.WriteOptions{ + To: addr, + More: flags&linux.MSG_MORE != 0, + EndOfRecord: flags&linux.MSG_EOR != 0, + } + + v := &ioSequencePayload{t, src} + n, resCh, err := s.Endpoint.Write(v, opts) + if resCh != nil { + if err := t.Block(resCh); err != nil { + return 0, syserr.FromError(err) + } + n, _, err = s.Endpoint.Write(v, opts) + } + dontWait := flags&linux.MSG_DONTWAIT != 0 + if err == nil && (n >= v.src.NumBytes() || dontWait) { + // Complete write. + return int(n), nil + } + if err != nil && (err != tcpip.ErrWouldBlock || dontWait) { + return int(n), syserr.TranslateNetstackError(err) + } + + // We'll have to block. Register for notification and keep trying to + // send all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + v.DropFirst(int(n)) + total := n + for { + n, _, err = s.Endpoint.Write(v, opts) + v.DropFirst(int(n)) + total += n + + if err != nil && err != tcpip.ErrWouldBlock && total == 0 { + return 0, syserr.TranslateNetstackError(err) + } + + if err == nil && v.src.NumBytes() == 0 || err != nil && err != tcpip.ErrWouldBlock { + return int(total), nil + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return int(total), syserr.ErrTryAgain + } + // handleIOError will consume errors from t.Block if needed. + return int(total), syserr.FromError(err) + } + } +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, io, args) +} + +func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint + // sockets. + // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. 
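+ // For reference, the userspace call served here is roughly:
+ //
+ //	struct timeval tv;
+ //	ioctl(fd, SIOCGSTAMP, &tv); // receive time of the last packet read
+ //
+ // which is why the handler below copies a linux.Timeval back out.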
+ switch args[1].Int() { + case syscall.SIOCGSTAMP: + s.readMu.Lock() + defer s.readMu.Unlock() + if !s.timestampValid { + return 0, syserror.ENOENT + } + + tv := linux.NsecToTimeval(s.timestampNS) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCINQ: + v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if terr != nil { + return 0, syserr.TranslateNetstackError(terr).ToError() + } + + // Add bytes removed from the endpoint but not yet sent to the caller. + s.readMu.Lock() + v += len(s.readView) + s.readMu.Unlock() + + if v > math.MaxInt32 { + v = math.MaxInt32 + } + + // Copy result to userspace. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + } + + return Ioctl(ctx, s.Endpoint, io, args) +} + +// Ioctl performs a socket ioctl. +func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch arg := int(args[1].Int()); arg { + case syscall.SIOCGIFFLAGS, + syscall.SIOCGIFADDR, + syscall.SIOCGIFBRDADDR, + syscall.SIOCGIFDSTADDR, + syscall.SIOCGIFHWADDR, + syscall.SIOCGIFINDEX, + syscall.SIOCGIFMAP, + syscall.SIOCGIFMETRIC, + syscall.SIOCGIFMTU, + syscall.SIOCGIFNAME, + syscall.SIOCGIFNETMASK, + syscall.SIOCGIFTXQLEN: + + var ifr linux.IFReq + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { + return 0, err.ToError() + } + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case syscall.SIOCGIFCONF: + // Return a list of interface addresses or the buffer size + // necessary to hold the list. + var ifc linux.IFConf + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + if err := ifconfIoctl(ctx, io, &ifc); err != nil { + return 0, err + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ + AddressSpaceActive: true, + }) + + return 0, err + + case linux.TIOCINQ: + v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) + if terr != nil { + return 0, syserr.TranslateNetstackError(terr).ToError() + } + + if v > math.MaxInt32 { + v = math.MaxInt32 + } + // Copy result to userspace. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCOUTQ: + v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) + if terr != nil { + return 0, syserr.TranslateNetstackError(terr).ToError() + } + + if v > math.MaxInt32 { + v = math.MaxInt32 + } + + // Copy result to userspace. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: + unimpl.EmitUnimplementedEvent(ctx) + } + + return 0, syserror.ENOTTY +} + +// interfaceIoctl implements interface requests. +func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { + var ( + iface inet.Interface + index int32 + found bool + ) + + // Find the relevant device. 
+ stack := inet.StackFromContext(ctx) + if stack == nil { + return syserr.ErrNoDevice + } + + // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to + // identify a device. + if arg == syscall.SIOCGIFNAME { + // Gets the name of the interface given the interface index + // stored in ifr_ifindex. + index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4])) + if iface, ok := stack.Interfaces()[index]; ok { + ifr.SetName(iface.Name) + return nil + } + return syserr.ErrNoDevice + } + + // Find the relevant device. + for index, iface = range stack.Interfaces() { + if iface.Name == ifr.Name() { + found = true + break + } + } + if !found { + return syserr.ErrNoDevice + } + + switch arg { + case syscall.SIOCGIFINDEX: + // Copy out the index to the data. + usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) + + case syscall.SIOCGIFHWADDR: + // Copy the hardware address out. + ifr.Data[0] = 6 // IEEE802.2 arp type. + ifr.Data[1] = 0 + n := copy(ifr.Data[2:], iface.Addr) + for i := 2 + n; i < len(ifr.Data); i++ { + ifr.Data[i] = 0 // Clear padding. + } + usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n)) + + case syscall.SIOCGIFFLAGS: + f, err := interfaceStatusFlags(stack, iface.Name) + if err != nil { + return err + } + // Drop the flags that don't fit in the size that we need to return. This + // matches Linux behavior. + usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) + + case syscall.SIOCGIFADDR: + // Copy the IPv4 address out. + for _, addr := range stack.InterfaceAddrs()[index] { + // This ioctl is only compatible with AF_INET addresses. + if addr.Family != linux.AF_INET { + continue + } + copy(ifr.Data[4:8], addr.Addr) + break + } + + case syscall.SIOCGIFMETRIC: + // Gets the metric of the device. As per netdevice(7), this + // always just sets ifr_metric to 0. + usermem.ByteOrder.PutUint32(ifr.Data[:4], 0) + + case syscall.SIOCGIFMTU: + // Gets the MTU of the device. + usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) + + case syscall.SIOCGIFMAP: + // Gets the hardware parameters of the device. + // TODO(gvisor.dev/issue/505): Implement. + + case syscall.SIOCGIFTXQLEN: + // Gets the transmit queue length of the device. + // TODO(gvisor.dev/issue/505): Implement. + + case syscall.SIOCGIFDSTADDR: + // Gets the destination address of a point-to-point device. + // TODO(gvisor.dev/issue/505): Implement. + + case syscall.SIOCGIFBRDADDR: + // Gets the broadcast address of a device. + // TODO(gvisor.dev/issue/505): Implement. + + case syscall.SIOCGIFNETMASK: + // Gets the network mask of a device. + for _, addr := range stack.InterfaceAddrs()[index] { + // This ioctl is only compatible with AF_INET addresses. + if addr.Family != linux.AF_INET { + continue + } + // Populate ifr.ifr_netmask (type sockaddr). + usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(linux.AF_INET)) + usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0) + var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) + // Netmask is expected to be returned as a big endian + // value. + binary.BigEndian.PutUint32(ifr.Data[4:8], mask) + break + } + + default: + // Not a valid call. + return syserr.ErrInvalidArgument + } + + return nil +} + +// ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. +func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { + // If Ptr is NULL, return the necessary buffer size via Len. + // Otherwise, write up to Len bytes starting at Ptr containing ifreq + // structs. 
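+ // Callers typically issue this ioctl twice (as described in netdevice(7)):
+ // first with a NULL buffer pointer to learn the required length, then with
+ // a buffer of that size to receive the ifreq entries; both passes are
+ // handled below.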
+ stack := inet.StackFromContext(ctx) + if stack == nil { + return syserr.ErrNoDevice.ToError() + } + + if ifc.Ptr == 0 { + ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq) + return nil + } + + max := ifc.Len + ifc.Len = 0 + for key, ifaceAddrs := range stack.InterfaceAddrs() { + iface := stack.Interfaces()[key] + for _, ifaceAddr := range ifaceAddrs { + // Don't write past the end of the buffer. + if ifc.Len+int32(linux.SizeOfIFReq) > max { + break + } + if ifaceAddr.Family != linux.AF_INET { + continue + } + + // Populate ifr.ifr_addr. + ifr := linux.IFReq{} + ifr.SetName(iface.Name) + usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) + usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0) + copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) + + // Copy the ifr to userspace. + dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) + ifc.Len += int32(linux.SizeOfIFReq) + if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return err + } + } + } + return nil +} + +// interfaceStatusFlags returns status flags for an interface in the stack. +// Flag values and meanings are described in greater detail in netdevice(7) in +// the SIOCGIFFLAGS section. +func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { + // We should only ever be passed a netstack.Stack. + epstack, ok := stack.(*Stack) + if !ok { + return 0, errStackType + } + + // Find the NIC corresponding to this interface. + for _, info := range epstack.Stack.NICInfo() { + if info.Name == name { + return nicStateFlagsToLinux(info.Flags), nil + } + } + return 0, syserr.ErrNoDevice +} + +func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { + var rv uint32 + if f.Up { + rv |= linux.IFF_UP | linux.IFF_LOWER_UP + } + if f.Running { + rv |= linux.IFF_RUNNING + } + if f.Promiscuous { + rv |= linux.IFF_PROMISC + } + if f.Loopback { + rv |= linux.IFF_LOOPBACK + } + return rv +} + +// State implements socket.Socket.State. State translates the internal state +// returned by netstack to values defined by Linux. +func (s *socketOpsCommon) State() uint32 { + if s.family != linux.AF_INET && s.family != linux.AF_INET6 { + // States not implemented for this socket's family. + return 0 + } + + switch { + case s.skType == linux.SOCK_STREAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_TCP: + // TCP socket. + switch tcp.EndpointState(s.Endpoint.State()) { + case tcp.StateEstablished: + return linux.TCP_ESTABLISHED + case tcp.StateSynSent: + return linux.TCP_SYN_SENT + case tcp.StateSynRecv: + return linux.TCP_SYN_RECV + case tcp.StateFinWait1: + return linux.TCP_FIN_WAIT1 + case tcp.StateFinWait2: + return linux.TCP_FIN_WAIT2 + case tcp.StateTimeWait: + return linux.TCP_TIME_WAIT + case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: + return linux.TCP_CLOSE + case tcp.StateCloseWait: + return linux.TCP_CLOSE_WAIT + case tcp.StateLastAck: + return linux.TCP_LAST_ACK + case tcp.StateListen: + return linux.TCP_LISTEN + case tcp.StateClosing: + return linux.TCP_CLOSING + default: + // Internal or unknown state. + return 0 + } + case s.skType == linux.SOCK_DGRAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_UDP: + // UDP socket. 
+ switch udp.EndpointState(s.Endpoint.State()) { + case udp.StateInitial, udp.StateBound, udp.StateClosed: + return linux.TCP_CLOSE + case udp.StateConnected: + return linux.TCP_ESTABLISHED + default: + return 0 + } + case s.skType == linux.SOCK_DGRAM && s.protocol == syscall.IPPROTO_ICMP || s.protocol == syscall.IPPROTO_ICMPV6: + // TODO(b/112063468): Export states for ICMP sockets. + case s.skType == linux.SOCK_RAW: + // TODO(b/112063468): Export states for raw sockets. + default: + // Unknown transport protocol, how did we make this socket? + log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem()) + return 0 + } + + return 0 +} + +// Type implements socket.Socket.Type. +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { + return s.family, s.skType, s.protocol +} + +// LINT.ThenChange(./netstack_vfs2.go) diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go new file mode 100644 index 000000000..d65a89316 --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -0,0 +1,330 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 encapsulates all the state needed to represent a network stack +// endpoint in the kernel context. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&SocketVFS2{}) + +// NewVFS2 creates a new endpoint socket. 
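+// For stream sockets it additionally enables tcpip.DelayOption on the
+// endpoint (Nagle-style batching of small segments), matching the Linux
+// default of TCP_NODELAY being unset.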
+func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { + if skType == linux.SOCK_STREAM { + if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + } + + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + + s := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, + } + s.LockFD.Init(&vfs.FileLocks{}) + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + if err == syserr.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + if err != nil { + return 0, err.ToError() + } + return int64(n), nil +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + f := &ioSequencePayload{ctx: ctx, src: src} + n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + if err == tcpip.ErrWouldBlock { + return 0, syserror.ErrWouldBlock + } + + if resCh != nil { + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err + } + n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) + } + + if err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if int64(n) < src.NumBytes() { + return int64(n), syserror.ErrWouldBlock + } + + return int64(n), nil +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. 
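+	// When no connection is pending, the endpoint returns
+	// tcpip.ErrWouldBlock; for a blocking accept we then retry via
+	// blockingAccept below.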
+ ep, wq, terr := s.Endpoint.Accept() + if terr != nil { + if terr != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(terr) + } + + var err *syserr.Error + ep, wq, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := NewVFS2(t, s.family, s.skType, s.protocol, wq, ep) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { + return 0, nil, 0, syserr.FromError(err) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer and write it to peer slice. + var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + + t.Kernel().RecordSocketVFS2(ns) + + return fd, addr, addrLen, syserr.FromError(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, uio, args) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptTimestamp { + val = 1 + } + return val, nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptInq { + val = 1 + } + return val, nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) + if err != nil { + return nil, err + } + return info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return entries, nil + + } + } + + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. 
commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + } + } + + return SetSockOpt(t, s, s.Endpoint, level, name, optVal) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go new file mode 100644 index 000000000..ead3b2b79 --- /dev/null +++ b/pkg/sentry/socket/netstack/provider.go @@ -0,0 +1,199 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// provider is an inet socket provider. +type provider struct { + family int + netProto tcpip.NetworkProtocolNumber +} + +// getTransportProtocol figures out transport protocol. Currently only TCP, +// UDP, and ICMP are supported. 
The bool return value is true when this socket +// is associated with a transport protocol. This is only false for SOCK_RAW, +// IPPROTO_IP sockets. +func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, bool, *syserr.Error) { + switch stype { + case linux.SOCK_STREAM: + if protocol != 0 && protocol != syscall.IPPROTO_TCP { + return 0, true, syserr.ErrInvalidArgument + } + return tcp.ProtocolNumber, true, nil + + case linux.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + return udp.ProtocolNumber, true, nil + case syscall.IPPROTO_ICMP: + return header.ICMPv4ProtocolNumber, true, nil + case syscall.IPPROTO_ICMPV6: + return header.ICMPv6ProtocolNumber, true, nil + } + + case linux.SOCK_RAW: + // Raw sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(ctx) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return 0, true, syserr.ErrNotPermitted + } + + switch protocol { + case syscall.IPPROTO_ICMP: + return header.ICMPv4ProtocolNumber, true, nil + case syscall.IPPROTO_ICMPV6: + return header.ICMPv6ProtocolNumber, true, nil + case syscall.IPPROTO_UDP: + return header.UDPProtocolNumber, true, nil + case syscall.IPPROTO_TCP: + return header.TCPProtocolNumber, true, nil + // IPPROTO_RAW signifies that the raw socket isn't assigned to + // a transport protocol. Users will be able to write packets' + // IP headers and won't receive anything. + case syscall.IPPROTO_RAW: + return tcpip.TransportProtocolNumber(0), false, nil + } + } + return 0, true, syserr.ErrProtocolNotSupported +} + +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. +func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // Fail right away if we don't have a stack. + stack := t.NetworkContext() + if stack == nil { + // Don't propagate an error here. Instead, allow the socket + // code to continue searching for another provider. + return nil, nil + } + eps, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocket(t, eps, stype, protocol) + } + + // Figure out the transport protocol. + transProto, associated, err := getTransportProtocol(t, stype, protocol) + if err != nil { + return nil, err + } + + // Create the endpoint. + var ep tcpip.Endpoint + var e *tcpip.Error + wq := &waiter.Queue{} + if stype == linux.SOCK_RAW { + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) + } else { + ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } + } + if e != nil { + return nil, syserr.TranslateNetstackError(e) + } + + return New(t, p.family, stype, int(transProto), wq, ep) +} + +func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. 
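+	// For illustration, socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP))
+	// maps to a cooked endpoint whose payload starts at the network
+	// header, while socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)) maps to
+	// a raw endpoint that includes the link-layer header, as described in
+	// packet(7). The protocol argument arrives in network byte order and
+	// is converted with ntohs below.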
+ var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return New(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + +// LINT.ThenChange(./provider_vfs2.go) + +// Pair just returns nil sockets (not supported). +func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { + return nil, nil, nil +} + +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. +func init() { + // Providers backed by netstack. + p := []provider{ + { + family: linux.AF_INET, + netProto: ipv4.ProtocolNumber, + }, + { + family: linux.AF_INET6, + netProto: ipv6.ProtocolNumber, + }, + { + family: linux.AF_PACKET, + }, + } + + for i := range p { + socket.RegisterProvider(p[i].family, &p[i]) + } +} diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go new file mode 100644 index 000000000..2a01143f6 --- /dev/null +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -0,0 +1,141 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/waiter" +) + +// providerVFS2 is an inet socket provider. +type providerVFS2 struct { + family int + netProto tcpip.NetworkProtocolNumber +} + +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. +func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Fail right away if we don't have a stack. + stack := t.NetworkContext() + if stack == nil { + // Don't propagate an error here. Instead, allow the socket + // code to continue searching for another provider. + return nil, nil + } + eps, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocketVFS2(t, eps, stype, protocol) + } + + // Figure out the transport protocol. + transProto, associated, err := getTransportProtocol(t, stype, protocol) + if err != nil { + return nil, err + } + + // Create the endpoint. 
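+	// Note that for SOCK_RAW with IPPROTO_RAW, getTransportProtocol
+	// reports associated == false, so the raw endpoint is created without
+	// a transport protocol: callers write their own IP headers and will
+	// not receive packets on it.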
+ var ep tcpip.Endpoint + var e *tcpip.Error + wq := &waiter.Queue{} + if stype == linux.SOCK_RAW { + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) + } else { + ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } + } + if e != nil { + return nil, syserr.TranslateNetstackError(e) + } + + return NewVFS2(t, p.family, stype, int(transProto), wq, ep) +} + +func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. + var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return NewVFS2(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + +// Pair just returns nil sockets (not supported). +func (*providerVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + return nil, nil, nil +} + +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. +func init() { + // Providers backed by netstack. + p := []providerVFS2{ + { + family: linux.AF_INET, + netProto: ipv4.ProtocolNumber, + }, + { + family: linux.AF_INET6, + netProto: ipv6.ProtocolNumber, + }, + { + family: linux.AF_PACKET, + }, + } + + for i := range p { + socket.RegisterProviderVFS2(p[i].family, &p[i]) + } +} diff --git a/pkg/sentry/socket/netstack/save_restore.go b/pkg/sentry/socket/netstack/save_restore.go new file mode 100644 index 000000000..c7aaf722a --- /dev/null +++ b/pkg/sentry/socket/netstack/save_restore.go @@ -0,0 +1,27 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// afterLoad is invoked by stateify. +func (s *Stack) afterLoad() { + s.Stack = stack.StackFromEnv // FIXME(b/36201077) + if s.Stack == nil { + panic("can't restore without netstack/tcpip/stack.Stack") + } +} diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go new file mode 100644 index 000000000..548442b96 --- /dev/null +++ b/pkg/sentry/socket/netstack/stack.go @@ -0,0 +1,386 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" +) + +// Stack implements inet.Stack for netstack/tcpip/stack.Stack. +// +// +stateify savable +type Stack struct { + Stack *stack.Stack `state:"manual"` +} + +// SupportsIPv6 implements Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber) +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + is := make(map[int32]inet.Interface) + for id, ni := range s.Stack.NICInfo() { + var devType uint16 + if ni.Flags.Loopback { + devType = linux.ARPHRD_LOOPBACK + } + is[int32(id)] = inet.Interface{ + Name: ni.Name, + Addr: []byte(ni.LinkAddress), + Flags: uint32(nicStateFlagsToLinux(ni.Flags)), + DeviceType: devType, + MTU: ni.MTU, + } + } + return is +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. +func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + nicAddrs := make(map[int32][]inet.InterfaceAddr) + for id, ni := range s.Stack.NICInfo() { + var addrs []inet.InterfaceAddr + for _, a := range ni.ProtocolAddresses { + var family uint8 + switch a.Protocol { + case ipv4.ProtocolNumber: + family = linux.AF_INET + case ipv6.ProtocolNumber: + family = linux.AF_INET6 + default: + log.Warningf("Unknown network protocol in %+v", a) + continue + } + + addrs = append(addrs, inet.InterfaceAddr{ + Family: family, + PrefixLen: uint8(a.AddressWithPrefix.PrefixLen), + Addr: []byte(a.AddressWithPrefix.Address), + // TODO(b/68878065): Other fields. + }) + } + nicAddrs[int32(id)] = addrs + } + return nicAddrs +} + +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. 
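+//
+// For illustration (a sketch with made-up values), adding 192.168.1.2/24
+// to NIC 1 would be expressed as:
+//
+//	s.AddInterfaceAddr(1, inet.InterfaceAddr{
+//		Family:    linux.AF_INET,
+//		PrefixLen: 24,
+//		Addr:      []byte{192, 168, 1, 2},
+//	})
+//
+// which, on success, also installs a gateway-less route for
+// 192.168.1.0/24 on that NIC.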
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + var ( + protocol tcpip.NetworkProtocolNumber + address tcpip.Address + ) + switch addr.Family { + case linux.AF_INET: + if len(addr.Addr) < header.IPv4AddressSize { + return syserror.EINVAL + } + if addr.PrefixLen > header.IPv4AddressSize*8 { + return syserror.EINVAL + } + protocol = ipv4.ProtocolNumber + address = tcpip.Address(addr.Addr[:header.IPv4AddressSize]) + + case linux.AF_INET6: + if len(addr.Addr) < header.IPv6AddressSize { + return syserror.EINVAL + } + if addr.PrefixLen > header.IPv6AddressSize*8 { + return syserror.EINVAL + } + protocol = ipv6.ProtocolNumber + address = tcpip.Address(addr.Addr[:header.IPv6AddressSize]) + + default: + return syserror.ENOTSUP + } + + protocolAddress := tcpip.ProtocolAddress{ + Protocol: protocol, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address, + PrefixLen: int(addr.PrefixLen), + }, + } + + // Attach address to interface. + if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + return syserr.TranslateNetstackError(err).ToError() + } + + // Add route for local network. + s.Stack.AddRoute(tcpip.Route{ + Destination: protocolAddress.AddressWithPrefix.Subnet(), + Gateway: "", // No gateway for local network. + NIC: tcpip.NICID(idx), + }) + return nil +} + +// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. +func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { + var rs tcp.ReceiveBufferSizeOption + err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs) + return inet.TCPBufferSize{ + Min: rs.Min, + Default: rs.Default, + Max: rs.Max, + }, syserr.TranslateNetstackError(err).ToError() +} + +// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. +func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { + rs := tcp.ReceiveBufferSizeOption{ + Min: size.Min, + Default: size.Default, + Max: size.Max, + } + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError() +} + +// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. +func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { + var ss tcp.SendBufferSizeOption + err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss) + return inet.TCPBufferSize{ + Min: ss.Min, + Default: ss.Default, + Max: ss.Max, + }, syserr.TranslateNetstackError(err).ToError() +} + +// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. +func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { + ss := tcp.SendBufferSizeOption{ + Min: size.Min, + Default: size.Default, + Max: size.Max, + } + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError() +} + +// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. +func (s *Stack) TCPSACKEnabled() (bool, error) { + var sack tcp.SACKEnabled + err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack) + return bool(sack), syserr.TranslateNetstackError(err).ToError() +} + +// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. +func (s *Stack) SetTCPSACKEnabled(enabled bool) error { + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() +} + +// Statistics implements inet.Stack.Statistics. 
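+//
+// stat selects what is filled in: *inet.StatDev (per-device counters,
+// with arg naming the interface), *inet.StatSNMPIP, *inet.StatSNMPICMP,
+// *inet.StatSNMPTCP or *inet.StatSNMPUDP; any other type yields
+// syserr.ErrEndpointOperation. For illustration (arg is unused for the
+// SNMP variants):
+//
+//	var tcpStats inet.StatSNMPTCP
+//	err := s.Statistics(&tcpStats, "")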
+func (s *Stack) Statistics(stat interface{}, arg string) error { + switch stats := stat.(type) { + case *inet.StatDev: + for _, ni := range s.Stack.NICInfo() { + if ni.Name != arg { + continue + } + // TODO(gvisor.dev/issue/2103) Support stubbed stats. + *stats = inet.StatDev{ + // Receive section. + ni.Stats.Rx.Bytes.Value(), // bytes. + ni.Stats.Rx.Packets.Value(), // packets. + 0, // errs. + 0, // drop. + 0, // fifo. + 0, // frame. + 0, // compressed. + 0, // multicast. + // Transmit section. + ni.Stats.Tx.Bytes.Value(), // bytes. + ni.Stats.Tx.Packets.Value(), // packets. + 0, // errs. + 0, // drop. + 0, // fifo. + 0, // colls. + 0, // carrier. + 0, // compressed. + } + break + } + case *inet.StatSNMPIP: + ip := Metrics.IP + // TODO(gvisor.dev/issue/969) Support stubbed stats. + *stats = inet.StatSNMPIP{ + 0, // Ip/Forwarding. + 0, // Ip/DefaultTTL. + ip.PacketsReceived.Value(), // InReceives. + 0, // Ip/InHdrErrors. + ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors. + 0, // Ip/ForwDatagrams. + 0, // Ip/InUnknownProtos. + 0, // Ip/InDiscards. + ip.PacketsDelivered.Value(), // InDelivers. + ip.PacketsSent.Value(), // OutRequests. + ip.OutgoingPacketErrors.Value(), // OutDiscards. + 0, // Ip/OutNoRoutes. + 0, // Support Ip/ReasmTimeout. + 0, // Support Ip/ReasmReqds. + 0, // Support Ip/ReasmOKs. + 0, // Support Ip/ReasmFails. + 0, // Support Ip/FragOKs. + 0, // Support Ip/FragFails. + 0, // Support Ip/FragCreates. + } + case *inet.StatSNMPICMP: + in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats + out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats + // TODO(gvisor.dev/issue/969) Support stubbed stats. + *stats = inet.StatSNMPICMP{ + 0, // Icmp/InMsgs. + Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors. + 0, // Icmp/InCsumErrors. + in.DstUnreachable.Value(), // InDestUnreachs. + in.TimeExceeded.Value(), // InTimeExcds. + in.ParamProblem.Value(), // InParmProbs. + in.SrcQuench.Value(), // InSrcQuenchs. + in.Redirect.Value(), // InRedirects. + in.Echo.Value(), // InEchos. + in.EchoReply.Value(), // InEchoReps. + in.Timestamp.Value(), // InTimestamps. + in.TimestampReply.Value(), // InTimestampReps. + in.InfoRequest.Value(), // InAddrMasks. + in.InfoReply.Value(), // InAddrMaskReps. + 0, // Icmp/OutMsgs. + Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors. + out.DstUnreachable.Value(), // OutDestUnreachs. + out.TimeExceeded.Value(), // OutTimeExcds. + out.ParamProblem.Value(), // OutParmProbs. + out.SrcQuench.Value(), // OutSrcQuenchs. + out.Redirect.Value(), // OutRedirects. + out.Echo.Value(), // OutEchos. + out.EchoReply.Value(), // OutEchoReps. + out.Timestamp.Value(), // OutTimestamps. + out.TimestampReply.Value(), // OutTimestampReps. + out.InfoRequest.Value(), // OutAddrMasks. + out.InfoReply.Value(), // OutAddrMaskReps. + } + case *inet.StatSNMPTCP: + tcp := Metrics.TCP + // RFC 2012 (updates 1213): SNMPv2-MIB-TCP. + *stats = inet.StatSNMPTCP{ + 1, // RtoAlgorithm. + 200, // RtoMin. + 120000, // RtoMax. + (1<<64 - 1), // MaxConn. + tcp.ActiveConnectionOpenings.Value(), // ActiveOpens. + tcp.PassiveConnectionOpenings.Value(), // PassiveOpens. + tcp.FailedConnectionAttempts.Value(), // AttemptFails. + tcp.EstablishedResets.Value(), // EstabResets. + tcp.CurrentEstablished.Value(), // CurrEstab. + tcp.ValidSegmentsReceived.Value(), // InSegs. + tcp.SegmentsSent.Value(), // OutSegs. + tcp.Retransmits.Value(), // RetransSegs. + tcp.InvalidSegmentsReceived.Value(), // InErrs. + tcp.ResetsSent.Value(), // OutRsts. 
+ tcp.ChecksumErrors.Value(), // InCsumErrors. + } + case *inet.StatSNMPUDP: + udp := Metrics.UDP + // TODO(gvisor.dev/issue/969) Support stubbed stats. + *stats = inet.StatSNMPUDP{ + udp.PacketsReceived.Value(), // InDatagrams. + udp.UnknownPortErrors.Value(), // NoPorts. + 0, // Udp/InErrors. + udp.PacketsSent.Value(), // OutDatagrams. + udp.ReceiveBufferErrors.Value(), // RcvbufErrors. + 0, // Udp/SndbufErrors. + udp.ChecksumErrors.Value(), // Udp/InCsumErrors. + 0, // Udp/IgnoredMulti. + } + default: + return syserr.ErrEndpointOperation.ToError() + } + return nil +} + +// RouteTable implements inet.Stack.RouteTable. +func (s *Stack) RouteTable() []inet.Route { + var routeTable []inet.Route + + for _, rt := range s.Stack.GetRouteTable() { + var family uint8 + switch len(rt.Destination.ID()) { + case header.IPv4AddressSize: + family = linux.AF_INET + case header.IPv6AddressSize: + family = linux.AF_INET6 + default: + log.Warningf("Unknown network protocol in route %+v", rt) + continue + } + + routeTable = append(routeTable, inet.Route{ + Family: family, + DstLen: uint8(rt.Destination.Prefix()), // The CIDR prefix for the destination. + + // Always return unspecified protocol since we have no notion of + // protocol for routes. + Protocol: linux.RTPROT_UNSPEC, + // Set statically to LINK scope for now. + // + // TODO(gvisor.dev/issue/595): Set scope for routes. + Scope: linux.RT_SCOPE_LINK, + Type: linux.RTN_UNICAST, + + DstAddr: []byte(rt.Destination.ID()), + OutputInterface: int32(rt.NIC), + GatewayAddr: []byte(rt.Gateway), + }) + } + + return routeTable +} + +// IPTables returns the stack's iptables. +func (s *Stack) IPTables() (*stack.IPTables, error) { + return s.Stack.IPTables(), nil +} + +// Resume implements inet.Stack.Resume. +func (s *Stack) Resume() { + s.Stack.Resume() +} + +// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. +func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { + return s.Stack.RegisteredEndpoints() +} + +// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { + return s.Stack.CleanupEndpoints() +} + +// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { + s.Stack.RestoreCleanupEndpoints(es) +} diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go new file mode 100644 index 000000000..fcd7f9d7f --- /dev/null +++ b/pkg/sentry/socket/socket.go @@ -0,0 +1,461 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package socket provides the interfaces that need to be provided by socket +// implementations and providers, as well as per family demultiplexing of socket +// creation. 
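+//
+// For illustration, a family becomes available roughly like this (a
+// sketch; myProvider stands in for a concrete Provider implementation):
+//
+//	func init() {
+//		socket.RegisterProvider(linux.AF_INET, &myProvider{})
+//	}
+//
+// New(t, linux.AF_INET, stype, protocol) then walks the providers
+// registered for AF_INET and returns the first non-nil socket; a nil
+// socket with a nil error means "not supported, try the next provider".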
+package socket + +import ( + "fmt" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/device" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ControlMessages represents the union of unix control messages and tcpip +// control messages. +type ControlMessages struct { + Unix transport.ControlMessages + IP tcpip.ControlMessages +} + +// Release releases Unix domain socket credentials and rights. +func (c *ControlMessages) Release() { + c.Unix.Release() +} + +// Socket is an interface combining fs.FileOperations and SocketOps, +// representing a VFS1 socket file. +type Socket interface { + fs.FileOperations + SocketOps +} + +// SocketVFS2 is an interface combining vfs.FileDescription and SocketOps, +// representing a VFS2 socket file. +type SocketVFS2 interface { + vfs.FileDescriptionImpl + SocketOps +} + +// SocketOps is the interface containing socket syscalls used by the syscall +// layer to redirect them to the appropriate implementation. +// +// It is implemented by both Socket and SocketVFS2. +type SocketOps interface { + // Connect implements the connect(2) linux syscall. + Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error + + // Accept implements the accept4(2) linux syscall. + // Returns fd, real peer address length and error. Real peer address + // length is only set if len(peer) > 0. + Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) + + // Bind implements the bind(2) linux syscall. + Bind(t *kernel.Task, sockaddr []byte) *syserr.Error + + // Listen implements the listen(2) linux syscall. + Listen(t *kernel.Task, backlog int) *syserr.Error + + // Shutdown implements the shutdown(2) linux syscall. + Shutdown(t *kernel.Task, how int) *syserr.Error + + // GetSockOpt implements the getsockopt(2) linux syscall. + GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) + + // SetSockOpt implements the setsockopt(2) linux syscall. + SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error + + // GetSockName implements the getsockname(2) linux syscall. + // + // addrLen is the address length to be returned to the application, not + // necessarily the actual length of the address. + GetSockName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error) + + // GetPeerName implements the getpeername(2) linux syscall. + // + // addrLen is the address length to be returned to the application, not + // necessarily the actual length of the address. + GetPeerName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error) + + // RecvMsg implements the recvmsg(2) linux syscall. + // + // senderAddrLen is the address length to be returned to the application, + // not necessarily the actual length of the address. + // + // flags control how RecvMsg should be completed. msgFlags indicate how + // the RecvMsg call was completed. Note that control message truncation + // may still be required even if the MSG_CTRUNC bit is not set in + // msgFlags. 
In that case, the caller should set MSG_CTRUNC appropriately. + // + // If err != nil, the recv was not successful. + RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) + + // SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take + // ownership of the ControlMessage on error. + // + // If n > 0, err will either be nil or an error from t.Block. + SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages ControlMessages) (n int, err *syserr.Error) + + // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means + // no timeout, and negative means DONTWAIT. + SetRecvTimeout(nanoseconds int64) + + // RecvTimeout gets the current timeout (in ns) for recv operations. Zero + // means no timeout, and negative means DONTWAIT. + RecvTimeout() int64 + + // SetSendTimeout sets the timeout (in ns) for send operations. Zero means + // no timeout, and negative means DONTWAIT. + SetSendTimeout(nanoseconds int64) + + // SendTimeout gets the current timeout (in ns) for send operations. Zero + // means no timeout, and negative means DONTWAIT. + SendTimeout() int64 + + // State returns the current state of the socket, as represented by Linux in + // procfs. The returned state value is protocol-specific. + State() uint32 + + // Type returns the family, socket type and protocol of the socket. + Type() (family int, skType linux.SockType, protocol int) +} + +// Provider is the interface implemented by providers of sockets for specific +// address families (e.g., AF_INET). +type Provider interface { + // Socket creates a new socket. + // + // If a nil Socket _and_ a nil error is returned, it means that the + // protocol is not supported. A non-nil error should only be returned + // if the protocol is supported, but an error occurs during creation. + Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) + + // Pair creates a pair of connected sockets. + // + // See Socket for error information. + Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) +} + +// families holds a map of all known address families and their providers. +var families = make(map[int][]Provider) + +// RegisterProvider registers the provider of a given address family so that +// sockets of that type can be created via socket() and/or socketpair() +// syscalls. +// +// This should only be called during the initialization of the address family. +func RegisterProvider(family int, provider Provider) { + families[family] = append(families[family], provider) +} + +// New creates a new socket with the given family, type and protocol. +func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + for _, p := range families[family] { + s, err := p.Socket(t, stype, protocol) + if err != nil { + return nil, err + } + if s != nil { + t.Kernel().RecordSocket(s) + return s, nil + } + } + + return nil, syserr.ErrAddressFamilyNotSupported +} + +// Pair creates a new connected socket pair with the given family, type and +// protocol. 
+func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + providers, ok := families[family] + if !ok { + return nil, nil, syserr.ErrAddressFamilyNotSupported + } + + for _, p := range providers { + s1, s2, err := p.Pair(t, stype, protocol) + if err != nil { + return nil, nil, err + } + if s1 != nil && s2 != nil { + k := t.Kernel() + k.RecordSocket(s1) + k.RecordSocket(s2) + return s1, s2, nil + } + } + + return nil, nil, syserr.ErrSocketNotSupported +} + +// NewDirent returns a sockfs fs.Dirent that resides on device d. +func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { + ino := d.NextIno() + iops := &fsutil.SimpleFileInode{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, linux.SOCKFS_MAGIC), + } + inode := fs.NewInode(ctx, iops, fs.NewPseudoMountSource(ctx), fs.StableAttr{ + Type: fs.Socket, + DeviceID: d.DeviceID(), + InodeID: ino, + BlockSize: usermem.PageSize, + }) + + // Dirent name matches net/socket.c:sockfs_dname. + return fs.NewDirent(ctx, inode, fmt.Sprintf("socket:[%d]", ino)) +} + +// ProviderVFS2 is the vfs2 interface implemented by providers of sockets for +// specific address families (e.g., AF_INET). +type ProviderVFS2 interface { + // Socket creates a new socket. + // + // If a nil Socket _and_ a nil error is returned, it means that the + // protocol is not supported. A non-nil error should only be returned + // if the protocol is supported, but an error occurs during creation. + Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) + + // Pair creates a pair of connected sockets. + // + // See Socket for error information. + Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) +} + +// familiesVFS2 holds a map of all known address families and their providers. +var familiesVFS2 = make(map[int][]ProviderVFS2) + +// RegisterProviderVFS2 registers the provider of a given address family so that +// sockets of that type can be created via socket() and/or socketpair() +// syscalls. +// +// This should only be called during the initialization of the address family. +func RegisterProviderVFS2(family int, provider ProviderVFS2) { + familiesVFS2[family] = append(familiesVFS2[family], provider) +} + +// NewVFS2 creates a new socket with the given family, type and protocol. +func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + for _, p := range familiesVFS2[family] { + s, err := p.Socket(t, stype, protocol) + if err != nil { + return nil, err + } + if s != nil { + t.Kernel().RecordSocketVFS2(s) + return s, nil + } + } + + return nil, syserr.ErrAddressFamilyNotSupported +} + +// PairVFS2 creates a new connected socket pair with the given family, type and +// protocol. 
+func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + providers, ok := familiesVFS2[family] + if !ok { + return nil, nil, syserr.ErrAddressFamilyNotSupported + } + + for _, p := range providers { + s1, s2, err := p.Pair(t, stype, protocol) + if err != nil { + return nil, nil, err + } + if s1 != nil && s2 != nil { + k := t.Kernel() + k.RecordSocketVFS2(s1) + k.RecordSocketVFS2(s2) + return s1, s2, nil + } + } + + return nil, nil, syserr.ErrSocketNotSupported +} + +// SendReceiveTimeout stores timeouts for send and receive calls. +// +// It is meant to be embedded into Socket implementations to help satisfy the +// interface. +// +// Care must be taken when copying SendReceiveTimeout as it contains atomic +// variables. +// +// +stateify savable +type SendReceiveTimeout struct { + // send is length of the send timeout in nanoseconds. + // + // send must be accessed atomically. + send int64 + + // recv is length of the receive timeout in nanoseconds. + // + // recv must be accessed atomically. + recv int64 +} + +// SetRecvTimeout implements Socket.SetRecvTimeout. +func (to *SendReceiveTimeout) SetRecvTimeout(nanoseconds int64) { + atomic.StoreInt64(&to.recv, nanoseconds) +} + +// RecvTimeout implements Socket.RecvTimeout. +func (to *SendReceiveTimeout) RecvTimeout() int64 { + return atomic.LoadInt64(&to.recv) +} + +// SetSendTimeout implements Socket.SetSendTimeout. +func (to *SendReceiveTimeout) SetSendTimeout(nanoseconds int64) { + atomic.StoreInt64(&to.send, nanoseconds) +} + +// SendTimeout implements Socket.SendTimeout. +func (to *SendReceiveTimeout) SendTimeout() int64 { + return atomic.LoadInt64(&to.send) +} + +// GetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. +// It contains names that are valid for GetSockOpt when level is SOL_SOCKET. +func GetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_ACCEPTCONN, + linux.SO_BPF_EXTENSIONS, + linux.SO_COOKIE, + linux.SO_DOMAIN, + linux.SO_ERROR, + linux.SO_GET_FILTER, + linux.SO_INCOMING_NAPI_ID, + linux.SO_MEMINFO, + linux.SO_PEERCRED, + linux.SO_PEERGROUPS, + linux.SO_PEERNAME, + linux.SO_PEERSEC, + linux.SO_PROTOCOL, + linux.SO_SNDLOWAT, + linux.SO_TYPE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEvent(t, name) + } +} + +// SetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. +// It contains names that are valid for SetSockOpt when level is SOL_SOCKET. +func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_ATTACH_BPF, + linux.SO_ATTACH_FILTER, + linux.SO_ATTACH_REUSEPORT_CBPF, + linux.SO_ATTACH_REUSEPORT_EBPF, + linux.SO_CNX_ADVICE, + linux.SO_DETACH_FILTER, + linux.SO_RCVBUFFORCE, + linux.SO_SNDBUFFORCE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEvent(t, name) + } +} + +// emitUnimplementedEvent emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSocketOpt when level is +// SOL_SOCKET. 
+func emitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_BINDTODEVICE, + linux.SO_BROADCAST, + linux.SO_BSDCOMPAT, + linux.SO_BUSY_POLL, + linux.SO_DEBUG, + linux.SO_DONTROUTE, + linux.SO_INCOMING_CPU, + linux.SO_KEEPALIVE, + linux.SO_LINGER, + linux.SO_LOCK_FILTER, + linux.SO_MARK, + linux.SO_MAX_PACING_RATE, + linux.SO_NOFCS, + linux.SO_OOBINLINE, + linux.SO_PASSCRED, + linux.SO_PASSSEC, + linux.SO_PEEK_OFF, + linux.SO_PRIORITY, + linux.SO_RCVBUF, + linux.SO_RCVLOWAT, + linux.SO_RCVTIMEO, + linux.SO_REUSEADDR, + linux.SO_REUSEPORT, + linux.SO_RXQ_OVFL, + linux.SO_SELECT_ERR_QUEUE, + linux.SO_SNDBUF, + linux.SO_SNDTIMEO, + linux.SO_TIMESTAMP, + linux.SO_TIMESTAMPING, + linux.SO_TIMESTAMPNS, + linux.SO_TXTIME, + linux.SO_WIFI_STATUS, + linux.SO_ZEROCOPY: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + +// UnmarshalSockAddr unmarshals memory representing a struct sockaddr to one of +// the ABI socket address types. +// +// Precondition: data must be long enough to represent a socket address of the +// given family. +func UnmarshalSockAddr(family int, data []byte) linux.SockAddr { + switch family { + case syscall.AF_INET: + var addr linux.SockAddrInet + binary.Unmarshal(data[:syscall.SizeofSockaddrInet4], usermem.ByteOrder, &addr) + return &addr + case syscall.AF_INET6: + var addr linux.SockAddrInet6 + binary.Unmarshal(data[:syscall.SizeofSockaddrInet6], usermem.ByteOrder, &addr) + return &addr + case syscall.AF_UNIX: + var addr linux.SockAddrUnix + binary.Unmarshal(data[:syscall.SizeofSockaddrUnix], usermem.ByteOrder, &addr) + return &addr + case syscall.AF_NETLINK: + var addr linux.SockAddrNetlink + binary.Unmarshal(data[:syscall.SizeofSockaddrNetlink], usermem.ByteOrder, &addr) + return &addr + default: + panic(fmt.Sprintf("Unsupported socket family %v", family)) + } +} diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD new file mode 100644 index 000000000..cca5e70f1 --- /dev/null +++ b/pkg/sentry/socket/unix/BUILD @@ -0,0 +1,39 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "unix", + srcs = [ + "device.go", + "io.go", + "unix.go", + "unix_vfs2.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/refs", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/netstack", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/unix/device.go b/pkg/sentry/socket/unix/device.go new file mode 100644 index 000000000..db01ac4c9 --- /dev/null +++ b/pkg/sentry/socket/unix/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import "gvisor.dev/gvisor/pkg/sentry/device" + +// unixSocketDevice is the unix socket virtual device. +var unixSocketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go new file mode 100644 index 000000000..129949990 --- /dev/null +++ b/pkg/sentry/socket/unix/io.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// EndpointWriter implements safemem.Writer that writes to a transport.Endpoint. +// +// EndpointWriter is not thread-safe. +type EndpointWriter struct { + Ctx context.Context + + // Endpoint is the transport.Endpoint to write to. + Endpoint transport.Endpoint + + // Control is the control messages to send. + Control transport.ControlMessages + + // To is the endpoint to send to. May be nil. + To transport.BoundEndpoint +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (w *EndpointWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + return safemem.FromVecWriterFunc{func(bufs [][]byte) (int64, error) { + n, err := w.Endpoint.SendMsg(w.Ctx, bufs, w.Control, w.To) + if err != nil { + return int64(n), err.ToError() + } + return int64(n), nil + }}.WriteFromBlocks(srcs) +} + +// EndpointReader implements safemem.Reader that reads from a +// transport.Endpoint. +// +// EndpointReader is not thread-safe. +type EndpointReader struct { + Ctx context.Context + + // Endpoint is the transport.Endpoint to read from. + Endpoint transport.Endpoint + + // Creds indicates if credential control messages are requested. + Creds bool + + // NumRights is the number of SCM_RIGHTS FDs requested. + NumRights int + + // Peek indicates that the data should not be consumed from the + // endpoint. + Peek bool + + // MsgSize is the size of the message that was read from. For stream + // sockets, it is the amount read. + MsgSize int64 + + // From, if not nil, will be set with the address read from. + From *tcpip.FullAddress + + // Control contains the received control messages. + Control transport.ControlMessages + + // ControlTrunc indicates that SCM_RIGHTS FDs were discarded based on + // the value of NumRights. + ControlTrunc bool +} + +// Truncate calls RecvMsg on the endpoint without writing to a destination. +func (r *EndpointReader) Truncate() error { + // Ignore bytes read since it will always be zero. + _, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, r.Creds, r.NumRights, r.Peek, r.From) + r.Control = c + r.ControlTrunc = ct + r.MsgSize = ms + if err != nil { + return err.ToError() + } + return nil +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. 
+func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) { + n, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, bufs, r.Creds, r.NumRights, r.Peek, r.From) + r.Control = c + r.ControlTrunc = ct + r.MsgSize = ms + if err != nil { + return int64(n), err.ToError() + } + return int64(n), nil + }}.ReadToBlocks(dsts) +} diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD new file mode 100644 index 000000000..c708b6030 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -0,0 +1,41 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "transport_message_list", + out = "transport_message_list.go", + package = "transport", + prefix = "message", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*message", + "Linker": "*message", + }, +) + +go_library( + name = "transport", + srcs = [ + "connectioned.go", + "connectioned_state.go", + "connectionless.go", + "queue.go", + "transport_message_list.go", + "unix.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/ilist", + "//pkg/log", + "//pkg/refs", + "//pkg/sync", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go new file mode 100644 index 000000000..a1e49cc57 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -0,0 +1,486 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/waiter" +) + +// UniqueIDProvider generates a sequence of unique identifiers useful for, +// among other things, lock ordering. +type UniqueIDProvider interface { + // UniqueID returns a new unique identifier. + UniqueID() uint64 +} + +// A ConnectingEndpoint is a connectioned unix endpoint that is attempting to +// establish a bidirectional connection with a BoundEndpoint. +type ConnectingEndpoint interface { + // ID returns the endpoint's globally unique identifier. This identifier + // must be used to determine locking order if more than one endpoint is + // to be locked in the same codepath. The endpoint with the smaller + // identifier must be locked before endpoints with larger identifiers. + ID() uint64 + + // Passcred implements socket.Credentialer.Passcred. + Passcred() bool + + // Type returns the socket type, typically either SockStream or + // SockSeqpacket. The connection attempt must be aborted if this + // value doesn't match the ConnectableEndpoint's type. 
+ Type() linux.SockType + + // GetLocalAddress returns the bound path. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // Locker protects the following methods. While locked, only the holder of + // the lock can change the return value of the protected methods. + sync.Locker + + // Connected returns true iff the ConnectingEndpoint is in the connected + // state. ConnectingEndpoints can only be connected to a single endpoint, + // so the connection attempt must be aborted if this returns true. + Connected() bool + + // Listening returns true iff the ConnectingEndpoint is in the listening + // state. ConnectingEndpoints cannot make connections while listening, so + // the connection attempt must be aborted if this returns true. + Listening() bool + + // WaiterQueue returns a pointer to the endpoint's waiter queue. + WaiterQueue() *waiter.Queue +} + +// connectionedEndpoint is a Unix-domain connected or connectable endpoint and implements +// ConnectingEndpoint, ConnectableEndpoint and tcpip.Endpoint. +// +// connectionedEndpoints must be in connected state in order to transfer data. +// +// This implementation includes STREAM and SEQPACKET Unix sockets created with +// socket(2), accept(2) or socketpair(2) and dgram unix sockets created with +// socketpair(2). See unix_connectionless.go for the implementation of DGRAM +// Unix sockets created with socket(2). +// +// The state is much simpler than a TCP endpoint, so it is not encoded +// explicitly. Instead we enforce the following invariants: +// +// receiver != nil, connected != nil => connected. +// path != "" && acceptedChan == nil => bound, not listening. +// path != "" && acceptedChan != nil => bound and listening. +// +// Only one of these will be true at any moment. +// +// +stateify savable +type connectionedEndpoint struct { + baseEndpoint + + // id is the unique endpoint identifier. This is used exclusively for + // lock ordering within connect. + id uint64 + + // idGenerator is used to generate new unique endpoint identifiers. + idGenerator UniqueIDProvider + + // stype is used by connecting sockets to ensure that they are the + // same type. The value is typically either tcpip.SockSeqpacket or + // tcpip.SockStream. + stype linux.SockType + + // acceptedChan is per the TCP endpoint implementation. Note that the + // sockets in this channel are _already in the connected state_, and + // have another associated connectionedEndpoint. + // + // If nil, then no listen call has been made. + acceptedChan chan *connectionedEndpoint `state:".([]*connectionedEndpoint)"` +} + +var ( + _ = BoundEndpoint((*connectionedEndpoint)(nil)) + _ = Endpoint((*connectionedEndpoint)(nil)) +) + +// NewConnectioned creates a new unbound connectionedEndpoint. +func NewConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) Endpoint { + return &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } +} + +// NewPair allocates a new pair of connected unix-domain connectionedEndpoints. 
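A usage sketch for NewPair (not part of the diff; written as if inside package transport, with uid standing for any UniqueIDProvider implementation and the function name hypothetical): it passes one datagram across a freshly created SOCK_SEQPACKET pair using only the SendMsg/RecvMsg signatures defined in this package.

// sketchPair exchanges a single message over a connected pair.
func sketchPair(ctx context.Context, uid UniqueIDProvider) ([]byte, *syserr.Error) {
    a, b := NewPair(ctx, linux.SOCK_SEQPACKET, uid)
    defer a.Close()
    defer b.Close()

    // A nil BoundEndpoint means "send to the connected peer".
    if _, err := a.SendMsg(ctx, [][]byte{[]byte("ping")}, ControlMessages{}, nil); err != nil {
        return nil, err
    }

    buf := make([]byte, 8)
    n, _, _, _, err := b.RecvMsg(ctx, [][]byte{buf}, false /* creds */, 0 /* numRights */, false /* peek */, nil)
    if err != nil {
        return nil, err
    }
    return buf[:n], nil
}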
+func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { + a := &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } + b := &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } + + q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} + q1.EnableLeakCheck("transport.queue") + q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} + q2.EnableLeakCheck("transport.queue") + + if stype == linux.SOCK_STREAM { + a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} + b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}} + } else { + a.receiver = &queueReceiver{q1} + b.receiver = &queueReceiver{q2} + } + + q2.IncRef() + a.connected = &connectedEndpoint{ + endpoint: b, + writeQueue: q2, + } + q1.IncRef() + b.connected = &connectedEndpoint{ + endpoint: a, + writeQueue: q1, + } + + return a, b +} + +// NewExternal creates a new externally backed Endpoint. It behaves like a +// socketpair. +func NewExternal(ctx context.Context, stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { + return &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } +} + +// ID implements ConnectingEndpoint.ID. +func (e *connectionedEndpoint) ID() uint64 { + return e.id +} + +// Type implements ConnectingEndpoint.Type and Endpoint.Type. +func (e *connectionedEndpoint) Type() linux.SockType { + return e.stype +} + +// WaiterQueue implements ConnectingEndpoint.WaiterQueue. +func (e *connectionedEndpoint) WaiterQueue() *waiter.Queue { + return e.Queue +} + +// isBound returns true iff the connectionedEndpoint is bound (but not +// listening). +func (e *connectionedEndpoint) isBound() bool { + return e.path != "" && e.acceptedChan == nil +} + +// Listening implements ConnectingEndpoint.Listening. +func (e *connectionedEndpoint) Listening() bool { + return e.acceptedChan != nil +} + +// Close puts the connectionedEndpoint in a closed state and frees all +// resources associated with it. +// +// The socket will be a fresh state after a call to close and may be reused. +// That is, close may be used to "unbind" or "disconnect" the socket in error +// paths. +func (e *connectionedEndpoint) Close() { + e.Lock() + var c ConnectedEndpoint + var r Receiver + switch { + case e.Connected(): + e.connected.CloseSend() + e.receiver.CloseRecv() + // Still have unread data? If yes, we set this into the write + // end so that the peer can get ECONNRESET) when it does read. + if e.receiver.RecvQueuedSize() > 0 { + e.connected.CloseUnread() + } + c = e.connected + r = e.receiver + e.connected = nil + e.receiver = nil + case e.isBound(): + e.path = "" + case e.Listening(): + close(e.acceptedChan) + for n := range e.acceptedChan { + n.Close() + } + e.acceptedChan = nil + e.path = "" + } + e.Unlock() + if c != nil { + c.CloseNotify() + c.Release() + } + if r != nil { + r.CloseNotify() + r.Release() + } +} + +// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. 
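Before the implementation that follows, a sketch of the bind/listen/connect/accept sequence that ends up here (not part of the diff; written as if inside package transport, with a hypothetical function name and a purely illustrative address). Connect on the client calls BidirectionalConnect on the listening server, which places a freshly created, already connected endpoint on the server's accept channel.

// sketchConnect wires a client to a listening server endpoint. The caller
// owns (and must Close) all three returned endpoints.
func sketchConnect(ctx context.Context, uid UniqueIDProvider) (server, client, accepted Endpoint, err *syserr.Error) {
    server = NewConnectioned(ctx, linux.SOCK_STREAM, uid)
    client = NewConnectioned(ctx, linux.SOCK_STREAM, uid)

    // Bind only records the address; any filesystem work is the caller's
    // job (no commit callback here).
    if err = server.Bind(tcpip.FullAddress{Addr: "/tmp/sock"}, nil); err != nil {
        return
    }
    if err = server.Listen(10); err != nil {
        return
    }

    // connectionedEndpoint also implements BoundEndpoint, so the assertion
    // below succeeds for endpoints created in this package.
    if err = client.Connect(ctx, server.(BoundEndpoint)); err != nil {
        return
    }

    // The endpoint queued by BidirectionalConnect is already connected.
    accepted, err = server.Accept()
    return
}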
+func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { + if ce.Type() != e.stype { + return syserr.ErrWrongProtocolForSocket + } + + // Check if ce is e to avoid a deadlock. + if ce, ok := ce.(*connectionedEndpoint); ok && ce == e { + return syserr.ErrInvalidEndpointState + } + + // Do a dance to safely acquire locks on both endpoints. + if e.id < ce.ID() { + e.Lock() + ce.Lock() + } else { + ce.Lock() + e.Lock() + } + + // Check connecting state. + if ce.Connected() { + e.Unlock() + ce.Unlock() + return syserr.ErrAlreadyConnected + } + if ce.Listening() { + e.Unlock() + ce.Unlock() + return syserr.ErrInvalidEndpointState + } + + // Check bound state. + if !e.Listening() { + e.Unlock() + ce.Unlock() + return syserr.ErrConnectionRefused + } + + // Create a newly bound connectionedEndpoint. + ne := &connectionedEndpoint{ + baseEndpoint: baseEndpoint{ + path: e.path, + Queue: &waiter.Queue{}, + }, + id: e.idGenerator.UniqueID(), + idGenerator: e.idGenerator, + stype: e.stype, + } + + readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} + readQueue.EnableLeakCheck("transport.queue") + ne.connected = &connectedEndpoint{ + endpoint: ce, + writeQueue: readQueue, + } + + writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} + writeQueue.EnableLeakCheck("transport.queue") + if e.stype == linux.SOCK_STREAM { + ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} + } else { + ne.receiver = &queueReceiver{readQueue: writeQueue} + } + + select { + case e.acceptedChan <- ne: + // Commit state. + writeQueue.IncRef() + connected := &connectedEndpoint{ + endpoint: ne, + writeQueue: writeQueue, + } + readQueue.IncRef() + if e.stype == linux.SOCK_STREAM { + returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) + } else { + returnConnect(&queueReceiver{readQueue: readQueue}, connected) + } + + // Notify can deadlock if we are holding these locks. + e.Unlock() + ce.Unlock() + + // Notify on both ends. + e.Notify(waiter.EventIn) + ce.WaiterQueue().Notify(waiter.EventOut) + + return nil + default: + // Busy; return ECONNREFUSED per spec. + ne.Close() + e.Unlock() + ce.Unlock() + return syserr.ErrConnectionRefused + } +} + +// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. +func (e *connectionedEndpoint) UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) { + return nil, syserr.ErrConnectionRefused +} + +// Connect attempts to directly connect to another Endpoint. +// Implements Endpoint.Connect. +func (e *connectionedEndpoint) Connect(ctx context.Context, server BoundEndpoint) *syserr.Error { + returnConnect := func(r Receiver, ce ConnectedEndpoint) { + e.receiver = r + e.connected = ce + } + + return server.BidirectionalConnect(ctx, e, returnConnect) +} + +// Listen starts listening on the connection. +func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error { + e.Lock() + defer e.Unlock() + if e.Listening() { + // Adjust the size of the channel iff we can fix existing + // pending connections into the new one. 
+ if len(e.acceptedChan) > backlog { + return syserr.ErrInvalidEndpointState + } + origChan := e.acceptedChan + e.acceptedChan = make(chan *connectionedEndpoint, backlog) + close(origChan) + for ep := range origChan { + e.acceptedChan <- ep + } + return nil + } + if !e.isBound() { + return syserr.ErrInvalidEndpointState + } + + // Normal case. + e.acceptedChan = make(chan *connectionedEndpoint, backlog) + return nil +} + +// Accept accepts a new connection. +func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) { + e.Lock() + defer e.Unlock() + + if !e.Listening() { + return nil, syserr.ErrInvalidEndpointState + } + + select { + case ne := <-e.acceptedChan: + return ne, nil + + default: + // Nothing left. + return nil, syserr.ErrWouldBlock + } +} + +// Bind binds the connection. +// +// For Unix connectionedEndpoints, this _only sets the address associated with +// the socket_. Work associated with sockets in the filesystem or finding those +// sockets must be done by a higher level. +// +// Bind will fail only if the socket is connected, bound or the passed address +// is invalid (the empty string). +func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error { + e.Lock() + defer e.Unlock() + if e.isBound() || e.Listening() { + return syserr.ErrAlreadyBound + } + if addr.Addr == "" { + // The empty string is not permitted. + return syserr.ErrBadLocalAddress + } + if commit != nil { + if err := commit(); err != nil { + return err + } + } + + // Save the bound address. + e.path = string(addr.Addr) + return nil +} + +// SendMsg writes data and a control message to the endpoint's peer. +// This method does not block if the data cannot be written. +func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) { + // Stream sockets do not support specifying the endpoint. Seqpacket + // sockets ignore the passed endpoint. + if e.stype == linux.SOCK_STREAM && to != nil { + return 0, syserr.ErrNotSupported + } + return e.baseEndpoint.SendMsg(ctx, data, c, to) +} + +// Readiness returns the current readiness of the connectionedEndpoint. For +// example, if waiter.EventIn is set, the connectionedEndpoint is immediately +// readable. +func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + e.Lock() + defer e.Unlock() + + ready := waiter.EventMask(0) + switch { + case e.Connected(): + if mask&waiter.EventIn != 0 && e.receiver.Readable() { + ready |= waiter.EventIn + } + if mask&waiter.EventOut != 0 && e.connected.Writable() { + ready |= waiter.EventOut + } + case e.Listening(): + if mask&waiter.EventIn != 0 && len(e.acceptedChan) > 0 { + ready |= waiter.EventIn + } + } + + return ready +} + +// State implements socket.Socket.State. +func (e *connectionedEndpoint) State() uint32 { + e.Lock() + defer e.Unlock() + + if e.Connected() { + return linux.SS_CONNECTED + } + return linux.SS_UNCONNECTED +} diff --git a/pkg/sentry/socket/unix/transport/connectioned_state.go b/pkg/sentry/socket/unix/transport/connectioned_state.go new file mode 100644 index 000000000..7e02a5db8 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/connectioned_state.go @@ -0,0 +1,53 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +// saveAcceptedChan is invoked by stateify. +func (e *connectionedEndpoint) saveAcceptedChan() []*connectionedEndpoint { + // If acceptedChan is nil (i.e. we are not listening) then we will save nil. + // Otherwise we create a (possibly empty) slice of the values in acceptedChan and + // save that. + var acceptedSlice []*connectionedEndpoint + if e.acceptedChan != nil { + // Swap out acceptedChan with a new empty channel of the same capacity. + saveChan := e.acceptedChan + e.acceptedChan = make(chan *connectionedEndpoint, cap(saveChan)) + + // Create a new slice with the same len and capacity as the channel. + acceptedSlice = make([]*connectionedEndpoint, len(saveChan), cap(saveChan)) + // Drain acceptedChan into saveSlice, and fill up the new acceptChan at the + // same time. + for i := range acceptedSlice { + ep := <-saveChan + acceptedSlice[i] = ep + e.acceptedChan <- ep + } + close(saveChan) + } + return acceptedSlice +} + +// loadAcceptedChan is invoked by stateify. +func (e *connectionedEndpoint) loadAcceptedChan(acceptedSlice []*connectionedEndpoint) { + // If acceptedSlice is nil, then acceptedChan should also be nil. + if acceptedSlice != nil { + // Otherwise, create a new channel with the same capacity as acceptedSlice. + e.acceptedChan = make(chan *connectionedEndpoint, cap(acceptedSlice)) + // Seed the channel with values from acceptedSlice. + for _, ep := range acceptedSlice { + e.acceptedChan <- ep + } + } +} diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go new file mode 100644 index 000000000..4b06d63ac --- /dev/null +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -0,0 +1,218 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/waiter" +) + +// connectionlessEndpoint is a unix endpoint for unix sockets that support operating in +// a connectionless fashon. +// +// Specifically, this means datagram unix sockets not created with +// socketpair(2). +// +// +stateify savable +type connectionlessEndpoint struct { + baseEndpoint +} + +var ( + _ = BoundEndpoint((*connectionlessEndpoint)(nil)) + _ = Endpoint((*connectionlessEndpoint)(nil)) +) + +// NewConnectionless creates a new unbound dgram endpoint. 
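Before the constructor below, a usage sketch for connectionless endpoints (not part of the diff; written as if inside package transport, with a hypothetical function name and an illustrative address): an unbound sender can address a datagram directly to a bound endpoint by passing it as the to argument of SendMsg.

// sketchDgram sends one datagram from an unbound endpoint to a bound one.
func sketchDgram(ctx context.Context) *syserr.Error {
    sender := NewConnectionless(ctx)
    bound := NewConnectionless(ctx)
    defer sender.Close()
    defer bound.Close()

    if err := bound.Bind(tcpip.FullAddress{Addr: "/tmp/dgram"}, nil); err != nil {
        return err
    }

    // Passing an explicit destination makes SendMsg perform a transient
    // UnidirectionalConnect to the bound endpoint.
    if _, err := sender.SendMsg(ctx, [][]byte{[]byte("ping")}, ControlMessages{}, bound.(BoundEndpoint)); err != nil {
        return err
    }

    // The datagram is now readable on the bound endpoint; from carries the
    // sender's address (empty here, since the sender is unbound).
    buf := make([]byte, 8)
    var from tcpip.FullAddress
    _, _, _, _, err := bound.RecvMsg(ctx, [][]byte{buf}, false, 0, false, &from)
    return err
}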
+func NewConnectionless(ctx context.Context) Endpoint { + ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} + q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} + q.EnableLeakCheck("transport.queue") + ep.receiver = &queueReceiver{readQueue: &q} + return ep +} + +// isBound returns true iff the endpoint is bound. +func (e *connectionlessEndpoint) isBound() bool { + return e.path != "" +} + +// Close puts the endpoint in a closed state and frees all resources associated +// with it. +func (e *connectionlessEndpoint) Close() { + e.Lock() + if e.connected != nil { + e.connected.Release() + e.connected = nil + } + + if e.isBound() { + e.path = "" + } + + e.receiver.CloseRecv() + r := e.receiver + e.receiver = nil + e.Unlock() + + r.CloseNotify() + r.Release() +} + +// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. +func (e *connectionlessEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { + return syserr.ErrConnectionRefused +} + +// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. +func (e *connectionlessEndpoint) UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) { + e.Lock() + r := e.receiver + e.Unlock() + if r == nil { + return nil, syserr.ErrConnectionRefused + } + q := r.(*queueReceiver).readQueue + if !q.TryIncRef() { + return nil, syserr.ErrConnectionRefused + } + return &connectedEndpoint{ + endpoint: e, + writeQueue: q, + }, nil +} + +// SendMsg writes data and a control message to the specified endpoint. +// This method does not block if the data cannot be written. +func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) { + if to == nil { + return e.baseEndpoint.SendMsg(ctx, data, c, nil) + } + + connected, err := to.UnidirectionalConnect(ctx) + if err != nil { + return 0, syserr.ErrInvalidEndpointState + } + defer connected.Release() + + e.Lock() + n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + e.Unlock() + + if notify { + connected.SendNotify() + } + + return n, err +} + +// Type implements Endpoint.Type. +func (e *connectionlessEndpoint) Type() linux.SockType { + return linux.SOCK_DGRAM +} + +// Connect attempts to connect directly to server. +func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoint) *syserr.Error { + connected, err := server.UnidirectionalConnect(ctx) + if err != nil { + return err + } + + e.Lock() + if e.connected != nil { + e.connected.Release() + } + e.connected = connected + e.Unlock() + + return nil +} + +// Listen starts listening on the connection. +func (e *connectionlessEndpoint) Listen(int) *syserr.Error { + return syserr.ErrNotSupported +} + +// Accept accepts a new connection. +func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) { + return nil, syserr.ErrNotSupported +} + +// Bind binds the connection. +// +// For Unix endpoints, this _only sets the address associated with the socket_. +// Work associated with sockets in the filesystem or finding those sockets must +// be done by a higher level. +// +// Bind will fail only if the socket is connected, bound or the passed address +// is invalid (the empty string). 
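The commit callback described above lets a caller couple its own bookkeeping to the bind: it runs while the endpoint's lock is held, and if it fails the endpoint stays unbound. A minimal sketch (not part of the diff; written as if inside package transport, with register standing for a hypothetical caller-supplied step such as creating a socket file):

// sketchBind binds ep and runs a caller-supplied registration step
// atomically with respect to the bind.
func sketchBind(ep Endpoint, path string, register func(string) *syserr.Error) *syserr.Error {
    return ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(path)}, func() *syserr.Error {
        // If this fails, Bind returns the error and the address is never
        // recorded on the endpoint.
        return register(path)
    })
}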
+func (e *connectionlessEndpoint) Bind(addr tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error { + e.Lock() + defer e.Unlock() + if e.isBound() { + return syserr.ErrAlreadyBound + } + if addr.Addr == "" { + // The empty string is not permitted. + return syserr.ErrBadLocalAddress + } + if commit != nil { + if err := commit(); err != nil { + return err + } + } + + // Save the bound address. + e.path = string(addr.Addr) + return nil +} + +// Readiness returns the current readiness of the endpoint. For example, if +// waiter.EventIn is set, the endpoint is immediately readable. +func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + e.Lock() + defer e.Unlock() + + ready := waiter.EventMask(0) + if mask&waiter.EventIn != 0 && e.receiver.Readable() { + ready |= waiter.EventIn + } + + if e.Connected() { + if mask&waiter.EventOut != 0 && e.connected.Writable() { + ready |= waiter.EventOut + } + } + + return ready +} + +// State implements socket.Socket.State. +func (e *connectionlessEndpoint) State() uint32 { + e.Lock() + defer e.Unlock() + + switch { + case e.isBound(): + return linux.SS_UNCONNECTED + case e.Connected(): + return linux.SS_CONNECTING + default: + return linux.SS_DISCONNECTING + } +} diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go new file mode 100644 index 000000000..d8f3ad63d --- /dev/null +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -0,0 +1,247 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +import ( + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/waiter" +) + +// queue is a buffer queue. +// +// +stateify savable +type queue struct { + refs.AtomicRefCount + + ReaderQueue *waiter.Queue + WriterQueue *waiter.Queue + + mu sync.Mutex `state:"nosave"` + closed bool + unread bool + used int64 + limit int64 + dataList messageList +} + +// Close closes q for reading and writing. It is immediately not writable and +// will become unreadable when no more data is pending. +// +// Both the read and write queues must be notified after closing: +// q.ReaderQueue.Notify(waiter.EventIn) +// q.WriterQueue.Notify(waiter.EventOut) +func (q *queue) Close() { + q.mu.Lock() + q.closed = true + q.mu.Unlock() +} + +// Reset empties the queue and Releases all of the Entries. +// +// Both the read and write queues must be notified after resetting: +// q.ReaderQueue.Notify(waiter.EventIn) +// q.WriterQueue.Notify(waiter.EventOut) +func (q *queue) Reset() { + q.mu.Lock() + for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { + cur.Release() + } + q.dataList.Reset() + q.used = 0 + q.mu.Unlock() +} + +// DecRef implements RefCounter.DecRef with destructor q.Reset. 
+func (q *queue) DecRef() { + q.DecRefWithDestructor(q.Reset) + // We don't need to notify after resetting because no one cares about + // this queue after all references have been dropped. +} + +// IsReadable determines if q is currently readable. +func (q *queue) IsReadable() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.closed || q.dataList.Front() != nil +} + +// bufWritable returns true if there is space for writing. +// +// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is +// free. +// +// See net/unix/af_unix.c:unix_writeable. +func (q *queue) bufWritable() bool { + return 4*q.used < q.limit +} + +// IsWritable determines if q is currently writable. +func (q *queue) IsWritable() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.closed || q.bufWritable() +} + +// Enqueue adds an entry to the data queue if room is available. +// +// If discardEmpty is true and there are zero bytes of data, the packet is +// dropped. +// +// If truncate is true, Enqueue may truncate the message before enqueuing it. +// Otherwise, the entire message must fit. If l is less than the size of data, +// err indicates why. +// +// If notify is true, ReaderQueue.Notify must be called: +// q.ReaderQueue.Notify(waiter.EventIn) +func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { + q.mu.Lock() + + if q.closed { + q.mu.Unlock() + return 0, false, syserr.ErrClosedForSend + } + + for _, d := range data { + l += int64(len(d)) + } + if discardEmpty && l == 0 { + q.mu.Unlock() + c.Release() + return 0, false, nil + } + + free := q.limit - q.used + + if l > free && truncate { + if free == 0 { + // Message can't fit right now. + q.mu.Unlock() + return 0, false, syserr.ErrWouldBlock + } + + l = free + err = syserr.ErrWouldBlock + } + + if l > q.limit { + // Message is too big to ever fit. + q.mu.Unlock() + return 0, false, syserr.ErrMessageTooLong + } + + if l > free { + // Message can't fit right now, and could not be truncated. + q.mu.Unlock() + return 0, false, syserr.ErrWouldBlock + } + + // Aggregate l bytes of data. This will truncate the data if l is less than + // the total bytes held in data. + v := make([]byte, l) + for i, b := 0, v; i < len(data) && len(b) > 0; i++ { + n := copy(b, data[i]) + b = b[n:] + } + + notify = q.dataList.Front() == nil + q.used += l + q.dataList.PushBack(&message{ + Data: buffer.View(v), + Control: c, + Address: from, + }) + + q.mu.Unlock() + + return l, notify, err +} + +// Dequeue removes the first entry in the data queue, if one exists. +// +// If notify is true, WriterQueue.Notify must be called: +// q.WriterQueue.Notify(waiter.EventOut) +func (q *queue) Dequeue() (e *message, notify bool, err *syserr.Error) { + q.mu.Lock() + + if q.dataList.Front() == nil { + err := syserr.ErrWouldBlock + if q.closed { + err = syserr.ErrClosedForReceive + if q.unread { + err = syserr.ErrConnectionReset + } + } + q.mu.Unlock() + + return nil, false, err + } + + notify = !q.bufWritable() + + e = q.dataList.Front() + q.dataList.Remove(e) + q.used -= e.Length() + + notify = notify && q.bufWritable() + + q.mu.Unlock() + + return e, notify, nil +} + +// Peek returns the first entry in the data queue, if one exists. 
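Before Peek below, a sketch of the notification contract spelled out in the Enqueue and Dequeue comments above (not part of the diff; written as if inside package transport, function name hypothetical): the queue only reports whether a wakeup is needed and leaves the Notify calls to the caller.

// sketchQueueNotify shows who notifies whom after a successful
// enqueue/dequeue round trip.
func sketchQueueNotify(q *queue, from tcpip.FullAddress) *syserr.Error {
    // Producer side.
    if _, notify, err := q.Enqueue([][]byte{[]byte("data")}, ControlMessages{}, from, false /* discardEmpty */, true /* truncate */); err != nil {
        return err
    } else if notify {
        // The queue went from empty to non-empty; wake readers.
        q.ReaderQueue.Notify(waiter.EventIn)
    }

    // Consumer side.
    m, notify, err := q.Dequeue()
    if err != nil {
        return err
    }
    defer m.Release()
    if notify {
        // The queue crossed back into the writable range; wake writers.
        q.WriterQueue.Notify(waiter.EventOut)
    }
    return nil
}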
+func (q *queue) Peek() (*message, *syserr.Error) { + q.mu.Lock() + defer q.mu.Unlock() + + if q.dataList.Front() == nil { + err := syserr.ErrWouldBlock + if q.closed { + if err = syserr.ErrClosedForReceive; q.unread { + err = syserr.ErrConnectionReset + } + } + return nil, err + } + + return q.dataList.Front().Peek(), nil +} + +// QueuedSize returns the number of bytes currently in the queue, that is, the +// number of readable bytes. +func (q *queue) QueuedSize() int64 { + q.mu.Lock() + defer q.mu.Unlock() + return q.used +} + +// MaxQueueSize returns the maximum number of bytes storable in the queue. +func (q *queue) MaxQueueSize() int64 { + return q.limit +} + +// CloseUnread sets flag to indicate that the peer is closed (not shutdown) +// with unread data. So if read on this queue shall return ECONNRESET error. +func (q *queue) CloseUnread() { + q.mu.Lock() + defer q.mu.Unlock() + q.unread = true +} diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go new file mode 100644 index 000000000..2f1b127df --- /dev/null +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -0,0 +1,1006 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package transport contains the implementation of Unix endpoints. +package transport + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/waiter" +) + +// initialLimit is the starting limit for the socket buffers. +const initialLimit = 16 * 1024 + +// A RightsControlMessage is a control message containing FDs. +type RightsControlMessage interface { + // Clone returns a copy of the RightsControlMessage. + Clone() RightsControlMessage + + // Release releases any resources owned by the RightsControlMessage. + Release() +} + +// A CredentialsControlMessage is a control message containing Unix credentials. +type CredentialsControlMessage interface { + // Equals returns true iff the two messages are equal. + Equals(CredentialsControlMessage) bool +} + +// A ControlMessages represents a collection of socket control messages. +// +// +stateify savable +type ControlMessages struct { + // Rights is a control message containing FDs. + Rights RightsControlMessage + + // Credentials is a control message containing Unix credentials. + Credentials CredentialsControlMessage +} + +// Empty returns true iff the ControlMessages does not contain either +// credentials or rights. +func (c *ControlMessages) Empty() bool { + return c.Rights == nil && c.Credentials == nil +} + +// Clone clones both the credentials and the rights. 
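Clone, defined next, hands out a copy whose rights are independently owned while credentials are shared; Release must then be called on each owned copy. A small sketch of that discipline (not part of the diff; written as if inside package transport, function name hypothetical):

// sketchControlLifetime shows the intended pairing of Clone and Release.
func sketchControlLifetime(original ControlMessages) {
    dup := original.Clone() // rights cloned, credentials shared
    // ... hand dup to a reader that must not consume the original ...
    dup.Release()

    // The owner of the original releases it once the message is consumed.
    original.Release()
}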
+func (c *ControlMessages) Clone() ControlMessages { + cm := ControlMessages{} + if c.Rights != nil { + cm.Rights = c.Rights.Clone() + } + cm.Credentials = c.Credentials + return cm +} + +// Release releases both the credentials and the rights. +func (c *ControlMessages) Release() { + if c.Rights != nil { + c.Rights.Release() + } + *c = ControlMessages{} +} + +// Endpoint is the interface implemented by Unix transport protocol +// implementations that expose functionality like sendmsg, recvmsg, connect, +// etc. to Unix socket implementations. +type Endpoint interface { + Credentialer + waiter.Waitable + + // Close puts the endpoint in a closed state and frees all resources + // associated with it. + Close() + + // RecvMsg reads data and a control message from the endpoint. This method + // does not block if there is no data pending. + // + // creds indicates if credential control messages are requested by the + // caller. This is useful for determining if control messages can be + // coalesced. creds is a hint and can be safely ignored by the + // implementation if no coalescing is possible. It is fine to return + // credential control messages when none were requested or to not return + // credential control messages when they were requested. + // + // numRights is the number of SCM_RIGHTS FDs requested by the caller. This + // is useful if one must allocate a buffer to receive a SCM_RIGHTS message + // or determine if control messages can be coalesced. numRights is a hint + // and can be safely ignored by the implementation if the number of + // available SCM_RIGHTS FDs is known and no coalescing is possible. It is + // fine for the returned number of SCM_RIGHTS FDs to be either higher or + // lower than the requested number. + // + // If peek is true, no data should be consumed from the Endpoint. Any and + // all data returned from a peek should be available in the next call to + // RecvMsg. + // + // recvLen is the number of bytes copied into data. + // + // msgLen is the length of the read message consumed for datagram Endpoints. + // msgLen is always the same as recvLen for stream Endpoints. + // + // CMTruncated indicates that the numRights hint was used to receive fewer + // than the total available SCM_RIGHTS FDs. Additional truncation may be + // required by the caller. + RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, err *syserr.Error) + + // SendMsg writes data and a control message to the endpoint's peer. + // This method does not block if the data cannot be written. + // + // SendMsg does not take ownership of any of its arguments on error. + SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, *syserr.Error) + + // Connect connects this endpoint directly to another. + // + // This should be called on the client endpoint, and the (bound) + // endpoint passed in as a parameter. + // + // The error codes are the same as Connect. + Connect(ctx context.Context, server BoundEndpoint) *syserr.Error + + // Shutdown closes the read and/or write end of the endpoint connection + // to its peer. + Shutdown(flags tcpip.ShutdownFlags) *syserr.Error + + // Listen puts the endpoint in "listen" mode, which allows it to accept + // new connections. + Listen(backlog int) *syserr.Error + + // Accept returns a new endpoint if a peer has established a connection + // to an endpoint previously set to listen mode. 
This method does not + // block if no new connections are available. + // + // The returned Queue is the wait queue for the newly created endpoint. + Accept() (Endpoint, *syserr.Error) + + // Bind binds the endpoint to a specific local address and port. + // Specifying a NIC is optional. + // + // An optional commit function will be executed atomically with respect + // to binding the endpoint. If this returns an error, the bind will not + // occur and the error will be propagated back to the caller. + Bind(address tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error + + // Type return the socket type, typically either SockStream, SockDgram + // or SockSeqpacket. + Type() linux.SockType + + // GetLocalAddress returns the address to which the endpoint is bound. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // GetRemoteAddress returns the address to which the endpoint is + // connected. + GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + + // SetSockOpt sets a socket option. opt should be one of the tcpip.*Option + // types. + SetSockOpt(opt interface{}) *tcpip.Error + + // SetSockOptBool sets a socket option for simple cases when a value has + // the int type. + SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error + + // SetSockOptInt sets a socket option for simple cases when a value has + // the int type. + SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error + + // GetSockOpt gets a socket option. opt should be a pointer to one of the + // tcpip.*Option types. + GetSockOpt(opt interface{}) *tcpip.Error + + // GetSockOptBool gets a socket option for simple cases when a return + // value has the int type. + GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) + + // GetSockOptInt gets a socket option for simple cases when a return + // value has the int type. + GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + + // State returns the current state of the socket, as represented by Linux in + // procfs. + State() uint32 +} + +// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket +// option. +type Credentialer interface { + // Passcred returns whether or not the SO_PASSCRED socket option is + // enabled on this end. + Passcred() bool + + // ConnectedPasscred returns whether or not the SO_PASSCRED socket option + // is enabled on the connected end. + ConnectedPasscred() bool +} + +// A BoundEndpoint is a unix endpoint that can be connected to. +type BoundEndpoint interface { + // BidirectionalConnect establishes a bi-directional connection between two + // unix endpoints in an all-or-nothing manner. If an error occurs during + // connecting, the state of neither endpoint should be modified. + // + // In order for an endpoint to establish such a bidirectional connection + // with a BoundEndpoint, the endpoint calls the BidirectionalConnect method + // on the BoundEndpoint and sends a representation of itself (the + // ConnectingEndpoint) and a callback (returnConnect) to receive the + // connection information (Receiver and ConnectedEndpoint) upon a + // successful connect. The callback should only be called on a successful + // connect. + // + // For a connection attempt to be successful, the ConnectingEndpoint must + // be unconnected and not listening and the BoundEndpoint whose + // BidirectionalConnect method is being called must be listening. + // + // This method will return syserr.ErrConnectionRefused on endpoints with a + // type that isn't SockStream or SockSeqpacket. 
+ BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error + + // UnidirectionalConnect establishes a write-only connection to a unix + // endpoint. + // + // An endpoint which calls UnidirectionalConnect and supports it itself must + // not hold its own lock when calling UnidirectionalConnect. + // + // This method will return syserr.ErrConnectionRefused on a non-SockDgram + // endpoint. + UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) + + // Passcred returns whether or not the SO_PASSCRED socket option is + // enabled on this end. + Passcred() bool + + // Release releases any resources held by the BoundEndpoint. It must be + // called before dropping all references to a BoundEndpoint returned by a + // function. + Release() +} + +// message represents a message passed over a Unix domain socket. +// +// +stateify savable +type message struct { + messageEntry + + // Data is the Message payload. + Data buffer.View + + // Control is auxiliary control message data that goes along with the + // data. + Control ControlMessages + + // Address is the bound address of the endpoint that sent the message. + // + // If the endpoint that sent the message is not bound, the Address is + // the empty string. + Address tcpip.FullAddress +} + +// Length returns number of bytes stored in the message. +func (m *message) Length() int64 { + return int64(len(m.Data)) +} + +// Release releases any resources held by the message. +func (m *message) Release() { + m.Control.Release() +} + +// Peek returns a copy of the message. +func (m *message) Peek() *message { + return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} +} + +// Truncate reduces the length of the message payload to n bytes. +// +// Preconditions: n <= m.Length(). +func (m *message) Truncate(n int64) { + m.Data.CapLength(int(n)) +} + +// A Receiver can be used to receive Messages. +type Receiver interface { + // Recv receives a single message. This method does not block. + // + // See Endpoint.RecvMsg for documentation on shared arguments. + // + // notify indicates if RecvNotify should be called. + Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) + + // RecvNotify notifies the Receiver of a successful Recv. This must not be + // called while holding any endpoint locks. + RecvNotify() + + // CloseRecv prevents the receiving of additional Messages. + // + // After CloseRecv is called, CloseNotify must also be called. + CloseRecv() + + // CloseNotify notifies the Receiver of recv being closed. This must not be + // called while holding any endpoint locks. + CloseNotify() + + // Readable returns if messages should be attempted to be received. This + // includes when read has been shutdown. + Readable() bool + + // RecvQueuedSize returns the total amount of data currently receivable. + // RecvQueuedSize should return -1 if the operation isn't supported. + RecvQueuedSize() int64 + + // RecvMaxQueueSize returns maximum value for RecvQueuedSize. + // RecvMaxQueueSize should return -1 if the operation isn't supported. + RecvMaxQueueSize() int64 + + // Release releases any resources owned by the Receiver. It should be + // called before droping all references to a Receiver. + Release() +} + +// queueReceiver implements Receiver for datagram sockets. 
+// +// +stateify savable +type queueReceiver struct { + readQueue *queue +} + +// Recv implements Receiver.Recv. +func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { + var m *message + var notify bool + var err *syserr.Error + if peek { + m, err = q.readQueue.Peek() + } else { + m, notify, err = q.readQueue.Dequeue() + } + if err != nil { + return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err + } + src := []byte(m.Data) + var copied int64 + for i := 0; i < len(data) && len(src) > 0; i++ { + n := copy(data[i], src) + copied += int64(n) + src = src[n:] + } + return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil +} + +// RecvNotify implements Receiver.RecvNotify. +func (q *queueReceiver) RecvNotify() { + q.readQueue.WriterQueue.Notify(waiter.EventOut) +} + +// CloseNotify implements Receiver.CloseNotify. +func (q *queueReceiver) CloseNotify() { + q.readQueue.ReaderQueue.Notify(waiter.EventIn) + q.readQueue.WriterQueue.Notify(waiter.EventOut) +} + +// CloseRecv implements Receiver.CloseRecv. +func (q *queueReceiver) CloseRecv() { + q.readQueue.Close() +} + +// Readable implements Receiver.Readable. +func (q *queueReceiver) Readable() bool { + return q.readQueue.IsReadable() +} + +// RecvQueuedSize implements Receiver.RecvQueuedSize. +func (q *queueReceiver) RecvQueuedSize() int64 { + return q.readQueue.QueuedSize() +} + +// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. +func (q *queueReceiver) RecvMaxQueueSize() int64 { + return q.readQueue.MaxQueueSize() +} + +// Release implements Receiver.Release. +func (q *queueReceiver) Release() { + q.readQueue.DecRef() +} + +// streamQueueReceiver implements Receiver for stream sockets. +// +// +stateify savable +type streamQueueReceiver struct { + queueReceiver + + mu sync.Mutex `state:"nosave"` + buffer []byte + control ControlMessages + addr tcpip.FullAddress +} + +func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) { + var copied int64 + for len(data) > 0 && len(buf) > 0 { + n := copy(data[0], buf) + copied += int64(n) + buf = buf[n:] + data[0] = data[0][n:] + if len(data[0]) == 0 { + data = data[1:] + } + } + return copied, data, buf +} + +// Readable implements Receiver.Readable. +func (q *streamQueueReceiver) Readable() bool { + q.mu.Lock() + bl := len(q.buffer) + r := q.readQueue.IsReadable() + q.mu.Unlock() + // We're readable if we have data in our buffer or if the queue receiver is + // readable. + return bl > 0 || r +} + +// RecvQueuedSize implements Receiver.RecvQueuedSize. +func (q *streamQueueReceiver) RecvQueuedSize() int64 { + q.mu.Lock() + bl := len(q.buffer) + qs := q.readQueue.QueuedSize() + q.mu.Unlock() + return int64(bl) + qs +} + +// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. +func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { + // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest + // message we can buffer which is also the largest message we can receive. + return 2 * q.readQueue.MaxQueueSize() +} + +// Recv implements Receiver.Recv. +func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { + q.mu.Lock() + defer q.mu.Unlock() + + var notify bool + + // If we have no data in the endpoint, we need to get some. + if len(q.buffer) == 0 { + // Load the next message into a buffer, even if we are peeking. 
Peeking + // won't consume the message, so it will be still available to be read + // the next time Recv() is called. + m, n, err := q.readQueue.Dequeue() + if err != nil { + return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err + } + notify = n + q.buffer = []byte(m.Data) + q.control = m.Control + q.addr = m.Address + } + + var copied int64 + if peek { + // Don't consume control message if we are peeking. + c := q.control.Clone() + + // Don't consume data since we are peeking. + copied, data, _ = vecCopy(data, q.buffer) + + return copied, copied, c, false, q.addr, notify, nil + } + + // Consume data and control message since we are not peeking. + copied, data, q.buffer = vecCopy(data, q.buffer) + + // Save the original state of q.control. + c := q.control + + // Remove rights from q.control and leave behind just the creds. + q.control.Rights = nil + if !wantCreds { + c.Credentials = nil + } + + var cmTruncated bool + if c.Rights != nil && numRights == 0 { + c.Rights.Release() + c.Rights = nil + cmTruncated = true + } + + haveRights := c.Rights != nil + + // If we have more capacity for data and haven't received any usable + // rights. + // + // Linux never coalesces rights control messages. + for !haveRights && len(data) > 0 { + // Get a message from the readQueue. + m, n, err := q.readQueue.Dequeue() + if err != nil { + // We already got some data, so ignore this error. This will + // manifest as a short read to the user, which is what Linux + // does. + break + } + notify = notify || n + q.buffer = []byte(m.Data) + q.control = m.Control + q.addr = m.Address + + if wantCreds { + if (q.control.Credentials == nil) != (c.Credentials == nil) { + // One message has credentials, the other does not. + break + } + + if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) { + // Both messages have credentials, but they don't match. + break + } + } + + if numRights != 0 && c.Rights != nil && q.control.Rights != nil { + // Both messages have rights. + break + } + + var cpd int64 + cpd, data, q.buffer = vecCopy(data, q.buffer) + copied += cpd + + if cpd == 0 { + // data was actually full. + break + } + + if q.control.Rights != nil { + // Consume rights. + if numRights == 0 { + cmTruncated = true + q.control.Rights.Release() + } else { + c.Rights = q.control.Rights + haveRights = true + } + q.control.Rights = nil + } + } + return copied, copied, c, cmTruncated, q.addr, notify, nil +} + +// A ConnectedEndpoint is an Endpoint that can be used to send Messages. +type ConnectedEndpoint interface { + // Passcred implements Endpoint.Passcred. + Passcred() bool + + // GetLocalAddress implements Endpoint.GetLocalAddress. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // Send sends a single message. This method does not block. + // + // notify indicates if SendNotify should be called. + // + // syserr.ErrWouldBlock can be returned along with a partial write if + // the caller should block to send the rest of the data. + Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) + + // SendNotify notifies the ConnectedEndpoint of a successful Send. This + // must not be called while holding any endpoint locks. + SendNotify() + + // CloseSend prevents the sending of additional Messages. + // + // After CloseSend is call, CloseNotify must also be called. + CloseSend() + + // CloseNotify notifies the ConnectedEndpoint of send being closed. 
This + // must not be called while holding any endpoint locks. + CloseNotify() + + // Writable returns if messages should be attempted to be sent. This + // includes when write has been shutdown. + Writable() bool + + // EventUpdate lets the ConnectedEndpoint know that event registrations + // have changed. + EventUpdate() + + // SendQueuedSize returns the total amount of data currently queued for + // sending. SendQueuedSize should return -1 if the operation isn't + // supported. + SendQueuedSize() int64 + + // SendMaxQueueSize returns maximum value for SendQueuedSize. + // SendMaxQueueSize should return -1 if the operation isn't supported. + SendMaxQueueSize() int64 + + // Release releases any resources owned by the ConnectedEndpoint. It should + // be called before droping all references to a ConnectedEndpoint. + Release() + + // CloseUnread sets the fact that this end is closed with unread data to + // the peer socket. + CloseUnread() +} + +// +stateify savable +type connectedEndpoint struct { + // endpoint represents the subset of the Endpoint functionality needed by + // the connectedEndpoint. It is implemented by both connectionedEndpoint + // and connectionlessEndpoint and allows the use of types which don't + // fully implement Endpoint. + endpoint interface { + // Passcred implements Endpoint.Passcred. + Passcred() bool + + // GetLocalAddress implements Endpoint.GetLocalAddress. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // Type implements Endpoint.Type. + Type() linux.SockType + } + + writeQueue *queue +} + +// Passcred implements ConnectedEndpoint.Passcred. +func (e *connectedEndpoint) Passcred() bool { + return e.endpoint.Passcred() +} + +// GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. +func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return e.endpoint.GetLocalAddress() +} + +// Send implements ConnectedEndpoint.Send. +func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { + discardEmpty := false + truncate := false + if e.endpoint.Type() == linux.SOCK_STREAM { + // Discard empty stream packets. Since stream sockets don't + // preserve message boundaries, sending zero bytes is a no-op. + // In Linux, the receiver actually uses a zero-length receive + // as an indication that the stream was closed. + discardEmpty = true + + // Since stream sockets don't preserve message boundaries, we + // can write only as much of the message as fits in the queue. + truncate = true + } + + return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate) +} + +// SendNotify implements ConnectedEndpoint.SendNotify. +func (e *connectedEndpoint) SendNotify() { + e.writeQueue.ReaderQueue.Notify(waiter.EventIn) +} + +// CloseNotify implements ConnectedEndpoint.CloseNotify. +func (e *connectedEndpoint) CloseNotify() { + e.writeQueue.ReaderQueue.Notify(waiter.EventIn) + e.writeQueue.WriterQueue.Notify(waiter.EventOut) +} + +// CloseSend implements ConnectedEndpoint.CloseSend. +func (e *connectedEndpoint) CloseSend() { + e.writeQueue.Close() +} + +// Writable implements ConnectedEndpoint.Writable. +func (e *connectedEndpoint) Writable() bool { + return e.writeQueue.IsWritable() +} + +// EventUpdate implements ConnectedEndpoint.EventUpdate. +func (*connectedEndpoint) EventUpdate() {} + +// SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. 
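Before the remaining accessors, a sketch of how a caller is expected to handle the short-write case that Send above can report for stream peers (not part of the diff; written as if inside package transport, function name hypothetical): ErrWouldBlock alongside a non-zero count means only a prefix of the payload was queued.

// sketchShortWrite sends payload and reports how much was actually queued.
func sketchShortWrite(ce ConnectedEndpoint, payload []byte, from tcpip.FullAddress) (int64, *syserr.Error) {
    n, notify, err := ce.Send([][]byte{payload}, ControlMessages{}, from)
    if notify {
        ce.SendNotify()
    }
    if err == syserr.ErrWouldBlock && n > 0 {
        // Short write: only the first n bytes were queued. A real caller
        // would wait for writability and then send the remainder.
        return n, nil
    }
    return n, err
}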
+func (e *connectedEndpoint) SendQueuedSize() int64 { + return e.writeQueue.QueuedSize() +} + +// SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. +func (e *connectedEndpoint) SendMaxQueueSize() int64 { + return e.writeQueue.MaxQueueSize() +} + +// Release implements ConnectedEndpoint.Release. +func (e *connectedEndpoint) Release() { + e.writeQueue.DecRef() +} + +// CloseUnread implements ConnectedEndpoint.CloseUnread. +func (e *connectedEndpoint) CloseUnread() { + e.writeQueue.CloseUnread() +} + +// baseEndpoint is an embeddable unix endpoint base used in both the connected and connectionless +// unix domain socket Endpoint implementations. +// +// Not to be used on its own. +// +// +stateify savable +type baseEndpoint struct { + *waiter.Queue + + // passcred specifies whether SCM_CREDENTIALS socket control messages are + // enabled on this endpoint. Must be accessed atomically. + passcred int32 + + // Mutex protects the below fields. + sync.Mutex `state:"nosave"` + + // receiver allows Messages to be received. + receiver Receiver + + // connected allows messages to be sent and state information about the + // connected endpoint to be read. + connected ConnectedEndpoint + + // path is not empty if the endpoint has been bound, + // or may be used if the endpoint is connected. + path string +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { + e.Queue.EventRegister(we, mask) + e.Lock() + if e.connected != nil { + e.connected.EventUpdate() + } + e.Unlock() +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { + e.Queue.EventUnregister(we) + e.Lock() + if e.connected != nil { + e.connected.EventUpdate() + } + e.Unlock() +} + +// Passcred implements Credentialer.Passcred. +func (e *baseEndpoint) Passcred() bool { + return atomic.LoadInt32(&e.passcred) != 0 +} + +// ConnectedPasscred implements Credentialer.ConnectedPasscred. +func (e *baseEndpoint) ConnectedPasscred() bool { + e.Lock() + defer e.Unlock() + return e.connected != nil && e.connected.Passcred() +} + +func (e *baseEndpoint) setPasscred(pc bool) { + if pc { + atomic.StoreInt32(&e.passcred, 1) + } else { + atomic.StoreInt32(&e.passcred, 0) + } +} + +// Connected implements ConnectingEndpoint.Connected. +func (e *baseEndpoint) Connected() bool { + return e.receiver != nil && e.connected != nil +} + +// RecvMsg reads data and a control message from the endpoint. +func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (int64, int64, ControlMessages, bool, *syserr.Error) { + e.Lock() + + if e.receiver == nil { + e.Unlock() + return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected + } + + recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + e.Unlock() + if err != nil { + return 0, 0, ControlMessages{}, false, err + } + + if notify { + e.receiver.RecvNotify() + } + + if addr != nil { + *addr = a + } + return recvLen, msgLen, cms, cmt, nil +} + +// SendMsg writes data and a control message to the endpoint's peer. +// This method does not block if the data cannot be written. 
+func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) { + e.Lock() + if !e.Connected() { + e.Unlock() + return 0, syserr.ErrNotConnected + } + if to != nil { + e.Unlock() + return 0, syserr.ErrAlreadyConnected + } + + n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + e.Unlock() + + if notify { + e.connected.SendNotify() + } + + return n, err +} + +// SetSockOpt sets a socket option. Currently not supported. +func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { + switch opt { + case tcpip.BroadcastOption: + case tcpip.PasscredOption: + e.setPasscred(v) + case tcpip.ReuseAddressOption: + default: + log.Warningf("Unsupported socket option: %d", opt) + } + return nil +} + +func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { + switch opt { + case tcpip.SendBufferSizeOption: + case tcpip.ReceiveBufferSizeOption: + default: + log.Warningf("Unsupported socket option: %d", opt) + } + return nil +} + +func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { + switch opt { + case tcpip.KeepaliveEnabledOption: + return false, nil + + case tcpip.PasscredOption: + return e.Passcred(), nil + + default: + log.Warningf("Unsupported socket option: %d", opt) + return false, tcpip.ErrUnknownProtocolOption + } +} + +func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { + switch opt { + case tcpip.ReceiveQueueSizeOption: + v := 0 + e.Lock() + if !e.Connected() { + e.Unlock() + return -1, tcpip.ErrNotConnected + } + v = int(e.receiver.RecvQueuedSize()) + e.Unlock() + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported + } + return v, nil + + case tcpip.SendQueueSizeOption: + e.Lock() + if !e.Connected() { + e.Unlock() + return -1, tcpip.ErrNotConnected + } + v := e.connected.SendQueuedSize() + e.Unlock() + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported + } + return int(v), nil + + case tcpip.SendBufferSizeOption: + e.Lock() + if !e.Connected() { + e.Unlock() + return -1, tcpip.ErrNotConnected + } + v := e.connected.SendMaxQueueSize() + e.Unlock() + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported + } + return int(v), nil + + case tcpip.ReceiveBufferSizeOption: + e.Lock() + if e.receiver == nil { + e.Unlock() + return -1, tcpip.ErrNotConnected + } + v := e.receiver.RecvMaxQueueSize() + e.Unlock() + if v < 0 { + return -1, tcpip.ErrQueueSizeNotSupported + } + return int(v), nil + + default: + log.Warningf("Unsupported socket option: %d", opt) + return -1, tcpip.ErrUnknownProtocolOption + } +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch opt.(type) { + case tcpip.ErrorOption: + return nil + + default: + log.Warningf("Unsupported socket option: %T", opt) + return tcpip.ErrUnknownProtocolOption + } +} + +// Shutdown closes the read and/or write end of the endpoint connection to its +// peer. 
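+// Reader and writer notifications are delivered only after the endpoint lock
+// has been released.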
+func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { + e.Lock() + if !e.Connected() { + e.Unlock() + return syserr.ErrNotConnected + } + + if flags&tcpip.ShutdownRead != 0 { + e.receiver.CloseRecv() + } + + if flags&tcpip.ShutdownWrite != 0 { + e.connected.CloseSend() + } + + e.Unlock() + + if flags&tcpip.ShutdownRead != 0 { + e.receiver.CloseNotify() + } + + if flags&tcpip.ShutdownWrite != 0 { + e.connected.CloseNotify() + } + + return nil +} + +// GetLocalAddress returns the bound path. +func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + e.Lock() + defer e.Unlock() + return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil +} + +// GetRemoteAddress returns the local address of the connected endpoint (if +// available). +func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + e.Lock() + c := e.connected + e.Unlock() + if c != nil { + return c.GetLocalAddress() + } + return tcpip.FullAddress{}, tcpip.ErrNotConnected +} + +// Release implements BoundEndpoint.Release. +func (*baseEndpoint) Release() { + // Binding a baseEndpoint doesn't take a reference. +} diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go new file mode 100644 index 000000000..4bb2b6ff4 --- /dev/null +++ b/pkg/sentry/socket/unix/unix.go @@ -0,0 +1,772 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package unix provides an implementation of the socket.Socket interface for +// the AF_UNIX protocol family. +package unix + +import ( + "fmt" + "strings" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketOperations is a Unix socket. It is similar to a netstack socket, +// except it is backed by a transport.Endpoint instead of a tcpip.Endpoint. +// +// +stateify savable +type SocketOperations struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// New creates a new unix socket. 
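+// The returned file is readable, writable and non-seekable, and the caller
+// owns the returned reference.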
+func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
+	dirent := socket.NewDirent(ctx, unixSocketDevice)
+	defer dirent.DecRef()
+	return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true})
+}
+
+// NewWithDirent creates a new unix socket using an existing dirent.
+func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File {
+	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
+	// SOCK_DGRAM and don't require CAP_NET_RAW.
+	if stype == linux.SOCK_RAW {
+		stype = linux.SOCK_DGRAM
+	}
+
+	s := SocketOperations{
+		socketOpsCommon: socketOpsCommon{
+			ep:    ep,
+			stype: stype,
+		},
+	}
+	s.EnableLeakCheck("unix.SocketOperations")
+
+	return fs.NewFile(ctx, d, flags, &s)
+}
+
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
+	refs.AtomicRefCount
+	socket.SendReceiveTimeout
+
+	ep    transport.Endpoint
+	stype linux.SockType
+}
+
+// DecRef implements RefCounter.DecRef.
+func (s *socketOpsCommon) DecRef() {
+	s.DecRefWithDestructor(func() {
+		s.ep.Close()
+	})
+}
+
+// Release implements fs.FileOperations.Release.
+func (s *socketOpsCommon) Release() {
+	// Release only decrements a reference on s because s may be referenced in
+	// the abstract socket namespace.
+	s.DecRef()
+}
+
+func (s *socketOpsCommon) isPacket() bool {
+	switch s.stype {
+	case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
+		return true
+	case linux.SOCK_STREAM:
+		return false
+	default:
+		// We shouldn't have allowed any other socket types during creation.
+		panic(fmt.Sprintf("Invalid socket type %d", s.stype))
+	}
+}
+
+// Endpoint extracts the transport.Endpoint.
+func (s *socketOpsCommon) Endpoint() transport.Endpoint {
+	return s.ep
+}
+
+// extractPath extracts and validates the address.
+func extractPath(sockaddr []byte) (string, *syserr.Error) {
+	addr, family, err := netstack.AddressAndFamily(sockaddr)
+	if err != nil {
+		if err == syserr.ErrAddressFamilyNotSupported {
+			err = syserr.ErrInvalidArgument
+		}
+		return "", err
+	}
+	if family != linux.AF_UNIX {
+		return "", syserr.ErrInvalidArgument
+	}
+
+	// The address is trimmed by GetAddress.
+	p := string(addr.Addr)
+	if p == "" {
+		// Not allowed.
+		return "", syserr.ErrInvalidArgument
+	}
+	if p[len(p)-1] == '/' {
+		// Weird, they tried to bind '/a/b/c/'?
+		return "", syserr.ErrIsDir
+	}
+
+	return p, nil
+}
+
+// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
+// a transport.Endpoint.
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+	addr, err := s.ep.GetRemoteAddress()
+	if err != nil {
+		return nil, 0, syserr.TranslateNetstackError(err)
+	}
+
+	a, l := netstack.ConvertAddress(linux.AF_UNIX, addr)
+	return a, l, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2) for sockets backed by
+// a transport.Endpoint.
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+	addr, err := s.ep.GetLocalAddress()
+	if err != nil {
+		return nil, 0, syserr.TranslateNetstackError(err)
+	}
+
+	a, l := netstack.ConvertAddress(linux.AF_UNIX, addr)
+	return a, l, nil
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
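+// Unix sockets reuse the shared netstack ioctl handling.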
+func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return netstack.Ioctl(ctx, s.ep, io, args) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +} + +// Listen implements the linux syscall listen(2) for sockets backed by +// a transport.Endpoint. +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { + return s.ep.Listen(backlog) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accept, it will block until one becomes ready. +func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + // Try to accept the connection; if it fails, then wait until we get a + // notification. + for { + if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + return ep, err + } + + if err := t.Block(ch); err != nil { + return nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, err := s.ep.Accept() + if err != nil { + if err != syserr.ErrWouldBlock || !blocking { + return 0, nil, 0, err + } + + var err *syserr.Error + ep, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns := New(t, ep, s.stype) + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + flags := ns.Flags() + flags.NonBlocking = true + ns.SetFlags(flags.Settable()) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer. + var err *syserr.Error + addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + if e != nil { + return 0, nil, 0, syserr.FromError(e) + } + + t.Kernel().RecordSocket(ns) + + return fd, addr, addrLen, nil +} + +// Bind implements the linux syscall bind(2) for unix sockets. +func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + p, e := extractPath(sockaddr) + if e != nil { + return e + } + + bep, ok := s.ep.(transport.BoundEndpoint) + if !ok { + // This socket can't be bound. + return syserr.ErrInvalidArgument + } + + return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { + // Is it abstract? + if p[0] == 0 { + if t.IsNetworkNamespaced() { + return syserr.ErrInvalidEndpointState + } + if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + // syserr.ErrPortInUse corresponds to EADDRINUSE. + return syserr.ErrPortInUse + } + } else { + // The parent and name. + var d *fs.Dirent + var name string + + cwd := t.FSContext().WorkingDirectory() + defer cwd.DecRef() + + // Is there no slash at all? 
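+			// If not, create the socket directly in the current working directory.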
+ if !strings.Contains(p, "/") { + d = cwd + name = p + } else { + root := t.FSContext().RootDirectory() + defer root.DecRef() + // Find the last path component, we know that something follows + // that final slash, otherwise extractPath() would have failed. + lastSlash := strings.LastIndex(p, "/") + subPath := p[:lastSlash] + if subPath == "" { + // Fix up subpath in case file is in root. + subPath = "/" + } + var err error + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, &remainingTraversals) + if err != nil { + // No path available. + return syserr.ErrNoSuchFile + } + defer d.DecRef() + name = p[lastSlash+1:] + } + + // Create the socket. + // + // Note that the file permissions here are not set correctly (see + // gvisor.dev/issue/2324). There is no convenient way to get permissions + // on the socket referred to by s, so we will leave this discrepancy + // unresolved until VFS2 replaces this code. + childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) + if err != nil { + return syserr.ErrPortInUse + } + childDir.DecRef() + } + + return nil + }) +} + +// extractEndpoint retrieves the transport.BoundEndpoint associated with a Unix +// socket path. The Release must be called on the transport.BoundEndpoint when +// the caller is done with it. +func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, *syserr.Error) { + path, err := extractPath(sockaddr) + if err != nil { + return nil, err + } + + // Is it abstract? + if path[0] == 0 { + if t.IsNetworkNamespaced() { + return nil, syserr.ErrInvalidArgument + } + + ep := t.AbstractSockets().BoundEndpoint(path[1:]) + if ep == nil { + // No socket found. + return nil, syserr.ErrConnectionRefused + } + + return ep, nil + } + + if kernel.VFS2Enabled { + p := fspath.Parse(path) + root := t.FSContext().RootDirectoryVFS2() + start := root + relPath := !p.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: p, + FollowFinalSymlink: true, + } + ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) + root.DecRef() + if relPath { + start.DecRef() + } + if e != nil { + return nil, syserr.FromError(e) + } + return ep, nil + } + + // Find the node in the filesystem. + root := t.FSContext().RootDirectory() + cwd := t.FSContext().WorkingDirectory() + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) + cwd.DecRef() + root.DecRef() + if e != nil { + return nil, syserr.FromError(e) + } + + // Extract the endpoint if one is there. + ep := d.Inode.BoundEndpoint(path) + d.DecRef() + if ep == nil { + // No socket! + return nil, syserr.ErrConnectionRefused + } + return ep, nil +} + +// Connect implements the linux syscall connect(2) for unix sockets. +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + ep, err := extractEndpoint(t, sockaddr) + if err != nil { + return err + } + defer ep.Release() + + // Connect the server endpoint. + err = s.ep.Connect(t, ep) + + if err == syserr.ErrWrongProtocolForSocket { + // Linux for abstract sockets returns ErrConnectionRefused + // instead of ErrWrongProtocolForSocket. 
+ path, _ := extractPath(sockaddr) + if len(path) > 0 && path[0] == 0 { + err = syserr.ErrConnectionRefused + } + } + + return err +} + +// Write implements fs.FileOperations.Write. +func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + t := kernel.TaskFromContext(ctx) + ctrl := control.New(t, s.ep, nil) + + if src.NumBytes() == 0 { + nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) + return int64(nInt), err.ToError() + } + + return src.CopyInTo(ctx, &EndpointWriter{ + Ctx: ctx, + Endpoint: s.ep, + Control: ctrl, + To: nil, + }) +} + +// SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by +// a transport.Endpoint. +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { + w := EndpointWriter{ + Ctx: t, + Endpoint: s.ep, + Control: controlMessages.Unix, + To: nil, + } + if len(to) > 0 { + switch s.stype { + case linux.SOCK_SEQPACKET: + to = nil + case linux.SOCK_STREAM: + if s.State() == linux.SS_CONNECTED { + return 0, syserr.ErrAlreadyConnected + } + return 0, syserr.ErrNotSupported + default: + ep, err := extractEndpoint(t, to) + if err != nil { + return 0, err + } + defer ep.Release() + w.To = ep + + if ep.Passcred() && w.Control.Credentials == nil { + w.Control.Credentials = control.MakeCreds(t) + } + } + } + + n, err := src.CopyInTo(t, &w) + if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + return int(n), syserr.FromError(err) + } + + // We'll have to block. Register for notification and keep trying to + // send all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + n, err = src.CopyInTo(t, &w) + total += n + if err != syserror.ErrWouldBlock { + break + } + + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } + break + } + } + + return int(total), syserr.FromError(err) +} + +// Passcred implements transport.Credentialer.Passcred. +func (s *socketOpsCommon) Passcred() bool { + return s.ep.Passcred() +} + +// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. +func (s *socketOpsCommon) ConnectedPasscred() bool { + return s.ep.ConnectedPasscred() +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.ep.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.ep.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { + s.ep.EventUnregister(e) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) +} + +// Shutdown implements the linux syscall shutdown(2) for sockets backed by +// a transport.Endpoint. 
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
+	f, err := netstack.ConvertShutdown(how)
+	if err != nil {
+		return err
+	}
+
+	// Issue shutdown request.
+	return s.ep.Shutdown(f)
+}
+
+// Read implements fs.FileOperations.Read.
+func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	return dst.CopyOutFrom(ctx, &EndpointReader{
+		Ctx:       ctx,
+		Endpoint:  s.ep,
+		NumRights: 0,
+		Peek:      false,
+		From:      nil,
+	})
+}
+
+// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
+// a transport.Endpoint.
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+	trunc := flags&linux.MSG_TRUNC != 0
+	peek := flags&linux.MSG_PEEK != 0
+	dontWait := flags&linux.MSG_DONTWAIT != 0
+	waitAll := flags&linux.MSG_WAITALL != 0
+	isPacket := s.isPacket()
+
+	// Calculate the number of FDs for which we have space and whether we
+	// are requesting credentials.
+	var wantCreds bool
+	rightsLen := int(controlDataLen) - syscall.SizeofCmsghdr
+	if s.Passcred() {
+		// Credentials take priority if they are enabled and there is space.
+		wantCreds = rightsLen > 0
+		if !wantCreds {
+			msgFlags |= linux.MSG_CTRUNC
+		}
+		credLen := syscall.CmsgSpace(syscall.SizeofUcred)
+		rightsLen -= credLen
+	}
+	// FDs are 32 bit (4 byte) ints.
+	numRights := rightsLen / 4
+	if numRights < 0 {
+		numRights = 0
+	}
+
+	r := EndpointReader{
+		Ctx:       t,
+		Endpoint:  s.ep,
+		Creds:     wantCreds,
+		NumRights: numRights,
+		Peek:      peek,
+	}
+	if senderRequested {
+		r.From = &tcpip.FullAddress{}
+	}
+
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
+	// If MSG_TRUNC is set with a zero byte destination then we still need
+	// to read the message and discard it, or in the case where MSG_PEEK is
+	// set, leave it be. In both cases the full message length must be
+	// returned.
+	if trunc && dst.Addrs.NumBytes() == 0 {
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
+
+	}
+
+	var total int64
+	if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
+		var from linux.SockAddr
+		var fromLen uint32
+		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
+			from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From)
+		}
+
+		if r.ControlTrunc {
+			msgFlags |= linux.MSG_CTRUNC
+		}
+
+		if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() {
+			if isPacket && n < int64(r.MsgSize) {
+				msgFlags |= linux.MSG_TRUNC
+			}
+
+			if trunc {
+				n = int64(r.MsgSize)
+			}
+
+			return int(n), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
+		}
+
+		// Don't overwrite any data we received.
+		dst = dst.DropFirst64(n)
+		total += n
+	}
+
+	// We'll have to block. Register for notification and keep trying to
+	// receive all the data.
+ e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + if n, err := doRead(); err != syserror.ErrWouldBlock { + var from linux.SockAddr + var fromLen uint32 + if r.From != nil { + from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From) + } + + if r.ControlTrunc { + msgFlags |= linux.MSG_CTRUNC + } + + if trunc { + // n and r.MsgSize are the same for streams. + total += int64(r.MsgSize) + } else { + total += n + } + + streamPeerClosed := s.stype == linux.SOCK_STREAM && n == 0 && err == nil + if err != nil || !waitAll || isPacket || n >= dst.NumBytes() || streamPeerClosed { + if total > 0 { + err = nil + } + if isPacket && n < int64(r.MsgSize) { + msgFlags |= linux.MSG_TRUNC + } + return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + } + + // Don't overwrite any data we received. + dst = dst.DropFirst64(n) + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if total > 0 { + err = nil + } + if err == syserror.ETIMEDOUT { + return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + } + return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + } + } +} + +// State implements socket.Socket.State. +func (s *socketOpsCommon) State() uint32 { + return s.ep.State() +} + +// Type implements socket.Socket.Type. +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { + // Unix domain sockets always have a protocol of 0. + return linux.AF_UNIX, s.stype, 0 +} + +// provider is a unix domain socket provider. +type provider struct{} + +// Socket returns a new unix domain socket. +func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, syserr.ErrProtocolNotSupported + } + + // Create the endpoint and socket. + var ep transport.Endpoint + switch stype { + case linux.SOCK_DGRAM, linux.SOCK_RAW: + ep = transport.NewConnectionless(t) + case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: + ep = transport.NewConnectioned(t, stype, t.Kernel()) + default: + return nil, syserr.ErrInvalidArgument + } + + return New(t, ep, stype), nil +} + +// Pair creates a new pair of AF_UNIX connected sockets. +func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, nil, syserr.ErrProtocolNotSupported + } + + switch stype { + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: + // Ok + default: + return nil, nil, syserr.ErrInvalidArgument + } + + // Create the endpoints and sockets. + ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) + s1 := New(t, ep1, stype) + s2 := New(t, ep2, stype) + + return s1, s2, nil +} + +func init() { + socket.RegisterProvider(linux.AF_UNIX, &provider{}) + socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{}) +} diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go new file mode 100644 index 000000000..ff2149250 --- /dev/null +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -0,0 +1,371 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 implements socket.SocketVFS2 (and by extension, +// vfs.FileDescriptionImpl) for Unix sockets. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&SocketVFS2{}) + +// NewSockfsFile creates a new socket file in the global sockfs mount and +// returns a corresponding file description. +func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{}) + if err != nil { + return nil, syserr.FromError(err) + } + return fd, nil +} + +// NewFileDescription creates and returns a socket file description +// corresponding to the given mount and dentry. +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) { + // You can create AF_UNIX, SOCK_RAW sockets. They're the same as + // SOCK_DGRAM and don't require CAP_NET_RAW. + if stype == linux.SOCK_RAW { + stype = linux.SOCK_DGRAM + } + + sock := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, + } + sock.LockFD.Init(locks) + vfsfd := &sock.vfsfd + if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return vfsfd, nil +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accept, it will block until one becomes ready. +func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { + // Register for notifications. 
+ e, ch := waiter.NewChannelEntry(nil) + s.socketOpsCommon.EventRegister(&e, waiter.EventIn) + defer s.socketOpsCommon.EventUnregister(&e) + + // Try to accept the connection; if it fails, then wait until we get a + // notification. + for { + if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + return ep, err + } + + if err := t.Block(ch); err != nil { + return nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, err := s.ep.Accept() + if err != nil { + if err != syserr.ErrWouldBlock || !blocking { + return 0, nil, 0, err + } + + var err *syserr.Error + ep, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := NewSockfsFile(t, ep, s.stype) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer. + var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + if e != nil { + return 0, nil, 0, syserr.FromError(e) + } + + t.Kernel().RecordSocketVFS2(ns) + return fd, addr, addrLen, nil +} + +// Bind implements the linux syscall bind(2) for unix sockets. +func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + p, e := extractPath(sockaddr) + if e != nil { + return e + } + + bep, ok := s.ep.(transport.BoundEndpoint) + if !ok { + // This socket can't be bound. + return syserr.ErrInvalidArgument + } + + return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { + // Is it abstract? + if p[0] == 0 { + if t.IsNetworkNamespaced() { + return syserr.ErrInvalidEndpointState + } + if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + // syserr.ErrPortInUse corresponds to EADDRINUSE. + return syserr.ErrPortInUse + } + } else { + path := fspath.Parse(p) + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + relPath := !path.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + } + stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE}) + if err != nil { + return syserr.FromError(err) + } + err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ + // File permissions correspond to net/unix/af_unix.c:unix_bind. + Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), + Endpoint: bep, + }) + if err == syserror.EEXIST { + return syserr.ErrAddressInUse + } + return syserr.FromError(err) + } + + return nil + }) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return netstack.Ioctl(ctx, s.ep, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. 
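+// Sockets are not seekable, so positional reads and writes fail with ESPIPE.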
+func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &EndpointReader{ + Ctx: ctx, + Endpoint: s.ep, + NumRights: 0, + Peek: false, + From: nil, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + t := kernel.TaskFromContext(ctx) + ctrl := control.New(t, s.ep, nil) + + if src.NumBytes() == 0 { + nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) + return int64(nInt), err.ToError() + } + + return src.CopyInTo(ctx, &EndpointWriter{ + Ctx: ctx, + Endpoint: s.ep, + Control: ctrl, + To: nil, + }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + +// providerVFS2 is a unix domain socket provider for VFS2. +type providerVFS2 struct{} + +func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, syserr.ErrProtocolNotSupported + } + + // Create the endpoint and socket. 
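+	// As in the VFS1 provider above, SOCK_RAW is accepted and treated as SOCK_DGRAM.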
+ var ep transport.Endpoint + switch stype { + case linux.SOCK_DGRAM, linux.SOCK_RAW: + ep = transport.NewConnectionless(t) + case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: + ep = transport.NewConnectioned(t, stype, t.Kernel()) + default: + return nil, syserr.ErrInvalidArgument + } + + f, err := NewSockfsFile(t, ep, stype) + if err != nil { + ep.Close() + return nil, err + } + return f, nil +} + +// Pair creates a new pair of AF_UNIX connected sockets. +func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, nil, syserr.ErrProtocolNotSupported + } + + switch stype { + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: + // Ok + default: + return nil, nil, syserr.ErrInvalidArgument + } + + // Create the endpoints and sockets. + ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) + s1, err := NewSockfsFile(t, ep1, stype) + if err != nil { + ep1.Close() + ep2.Close() + return nil, nil, err + } + s2, err := NewSockfsFile(t, ep2, stype) + if err != nil { + s1.DecRef() + ep2.Close() + return nil, nil, err + } + + return s1, s2, nil +} |
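
Note on address handling: extractPath, extractEndpoint, and both Bind implementations above all follow the same AF_UNIX convention. An empty path and a path ending in '/' are rejected, a leading NUL byte selects the abstract namespace (the name is the remainder of the path), and any other path is resolved in the filesystem. The standalone sketch below restates that classification in plain Go; it is not part of the gVisor tree, and classifyUnixAddr and addrKind are illustrative names chosen for this example only.

package main

import (
	"errors"
	"fmt"
	"strings"
)

// addrKind distinguishes the two address families handled by Bind and
// extractEndpoint: abstract-namespace names and filesystem paths.
type addrKind int

const (
	abstractAddr addrKind = iota
	filesystemAddr
)

// classifyUnixAddr mirrors the validation in extractPath and the branch in
// Bind: empty paths and paths naming a directory are rejected, a leading NUL
// byte selects the abstract namespace (returning the remainder as the name),
// and every other path is resolved against the filesystem.
func classifyUnixAddr(p string) (addrKind, string, error) {
	if p == "" {
		return 0, "", errors.New("empty AF_UNIX address")
	}
	if strings.HasSuffix(p, "/") {
		return 0, "", errors.New("AF_UNIX address may not name a directory")
	}
	if p[0] == 0 {
		return abstractAddr, p[1:], nil
	}
	return filesystemAddr, p, nil
}

func main() {
	for _, p := range []string{"/tmp/app.sock", "\x00hidden", "relative.sock", "bad/"} {
		kind, name, err := classifyUnixAddr(p)
		fmt.Printf("%q -> kind=%d name=%q err=%v\n", p, kind, name, err)
	}
}

On the filesystem branch the two implementations then diverge: the VFS1 Bind resolves the parent dirent and calls Dirent.Bind on it, while the VFS2 Bind creates the node with MknodAt and maps EEXIST to EADDRINUSE.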