summaryrefslogtreecommitdiffhomepage
path: root/pkg
diff options
context:
space:
mode:
Diffstat (limited to 'pkg')
-rw-r--r--pkg/eventchannel/BUILD8
-rw-r--r--pkg/eventchannel/event.go40
-rw-r--r--pkg/eventchannel/event.proto2
-rw-r--r--pkg/eventchannel/event_any.go25
-rw-r--r--pkg/eventchannel/event_test.go2
-rw-r--r--pkg/eventchannel/rate.go2
-rw-r--r--pkg/metric/BUILD2
-rw-r--r--pkg/metric/metric_test.go2
-rw-r--r--pkg/sentry/fsimpl/overlay/BUILD5
-rw-r--r--pkg/sentry/fsimpl/overlay/copy_up.go70
-rw-r--r--pkg/sentry/fsimpl/overlay/filesystem.go52
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go4
-rw-r--r--pkg/sentry/fsimpl/overlay/regular_file.go (renamed from pkg/sentry/fsimpl/overlay/non_directory.go)115
-rw-r--r--pkg/sentry/fsimpl/testutil/kernel.go7
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go10
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go20
-rw-r--r--pkg/sentry/kernel/BUILD13
-rw-r--r--pkg/sentry/kernel/context.go3
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go14
-rw-r--r--pkg/sentry/kernel/kernel.go23
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go3
-rw-r--r--pkg/sentry/kernel/shm/BUILD1
-rw-r--r--pkg/sentry/kernel/shm/shm.go46
-rw-r--r--pkg/sentry/kernel/task.go4
-rw-r--r--pkg/sentry/kernel/task_clone.go22
-rw-r--r--pkg/sentry/kernel/task_exit.go3
-rw-r--r--pkg/sentry/kernel/task_start.go13
-rw-r--r--pkg/sentry/kernel/thread_group.go7
-rw-r--r--pkg/sentry/pgalloc/BUILD1
-rw-r--r--pkg/sentry/pgalloc/pgalloc.go122
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.go13
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.s12
-rw-r--r--pkg/sentry/platform/ring0/lib_arm64.go6
-rw-r--r--pkg/sentry/platform/ring0/lib_arm64.s10
-rw-r--r--pkg/sentry/socket/netlink/socket.go7
-rw-r--r--pkg/sentry/socket/netstack/netstack.go5
-rw-r--r--pkg/sentry/socket/unix/unix.go8
-rw-r--r--pkg/sentry/socket/unix/unix_vfs2.go8
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_sysinfo.go2
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/fd.go20
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/socket.go4
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/splice.go6
-rw-r--r--pkg/sentry/usage/memory.go2
-rw-r--r--pkg/shim/v2/runtimeoptions/BUILD12
-rw-r--r--pkg/shim/v2/runtimeoptions/runtimeoptions.go15
-rw-r--r--pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go383
-rw-r--r--pkg/shim/v2/runtimeoptions/runtimeoptions_test.go9
-rw-r--r--pkg/tcpip/checker/checker.go18
-rw-r--r--pkg/tcpip/link/ethernet/BUILD15
-rw-r--r--pkg/tcpip/link/ethernet/ethernet.go99
-rw-r--r--pkg/tcpip/link/pipe/pipe.go39
-rw-r--r--pkg/tcpip/network/BUILD1
-rw-r--r--pkg/tcpip/network/ip_test.go414
-rw-r--r--pkg/tcpip/network/ipv4/ipv4.go147
-rw-r--r--pkg/tcpip/network/ipv4/ipv4_test.go302
-rw-r--r--pkg/tcpip/network/ipv6/icmp.go119
-rw-r--r--pkg/tcpip/network/ipv6/icmp_test.go254
-rw-r--r--pkg/tcpip/network/ipv6/ipv6.go69
-rw-r--r--pkg/tcpip/network/ipv6/ipv6_test.go61
-rw-r--r--pkg/tcpip/network/ipv6/ndp_test.go98
-rw-r--r--pkg/tcpip/stack/BUILD1
-rw-r--r--pkg/tcpip/stack/neighbor_entry.go6
-rw-r--r--pkg/tcpip/stack/neighbor_entry_test.go277
-rw-r--r--pkg/tcpip/stack/nic.go5
-rw-r--r--pkg/tcpip/stack/packet_buffer.go20
-rw-r--r--pkg/tcpip/stack/route.go6
-rw-r--r--pkg/tcpip/stack/stack_test.go47
-rw-r--r--pkg/tcpip/tcpip.go2
-rw-r--r--pkg/tcpip/tests/integration/BUILD1
-rw-r--r--pkg/tcpip/tests/integration/forward_test.go13
-rw-r--r--pkg/tcpip/tests/integration/link_resolution_test.go7
-rw-r--r--pkg/tcpip/tests/integration/multicast_broadcast_test.go2
73 files changed, 2649 insertions, 541 deletions
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index bee28b68d..a493e3407 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -6,6 +6,7 @@ go_library(
name = "eventchannel",
srcs = [
"event.go",
+ "event_any.go",
"rate.go",
],
visibility = ["//:sandbox"],
@@ -14,8 +15,9 @@ go_library(
"//pkg/log",
"//pkg/sync",
"//pkg/unet",
- "@com_github_golang_protobuf//proto:go_default_library",
- "@com_github_golang_protobuf//ptypes:go_default_library_gen",
+ "@org_golang_google_protobuf//encoding/prototext:go_default_library",
+ "@org_golang_google_protobuf//proto:go_default_library",
+ "@org_golang_google_protobuf//types/known/anypb:go_default_library",
"@org_golang_x_time//rate:go_default_library",
],
)
@@ -32,6 +34,6 @@ go_test(
library = ":eventchannel",
deps = [
"//pkg/sync",
- "@com_github_golang_protobuf//proto:go_default_library",
+ "@org_golang_google_protobuf//proto:go_default_library",
],
)
diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go
index 9a29c58bd..7172ce75d 100644
--- a/pkg/eventchannel/event.go
+++ b/pkg/eventchannel/event.go
@@ -24,8 +24,8 @@ import (
"fmt"
"syscall"
- "github.com/golang/protobuf/proto"
- "github.com/golang/protobuf/ptypes"
+ "google.golang.org/protobuf/encoding/prototext"
+ "google.golang.org/protobuf/proto"
pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sync"
@@ -118,22 +118,6 @@ func (me *multiEmitter) Close() error {
return err
}
-func marshal(msg proto.Message) ([]byte, error) {
- anypb, err := ptypes.MarshalAny(msg)
- if err != nil {
- return nil, err
- }
-
- // Wire format is uvarint message length followed by binary proto.
- bufMsg, err := proto.Marshal(anypb)
- if err != nil {
- return nil, err
- }
- p := make([]byte, binary.MaxVarintLen64)
- n := binary.PutUvarint(p, uint64(len(bufMsg)))
- return append(p[:n], bufMsg...), nil
-}
-
// socketEmitter emits proto messages on a socket.
type socketEmitter struct {
socket *unet.Socket
@@ -155,10 +139,19 @@ func SocketEmitter(fd int) (Emitter, error) {
// Emit implements Emitter.Emit.
func (s *socketEmitter) Emit(msg proto.Message) (bool, error) {
- p, err := marshal(msg)
+ any, err := newAny(msg)
if err != nil {
return false, err
}
+ bufMsg, err := proto.Marshal(any)
+ if err != nil {
+ return false, err
+ }
+
+ // Wire format is uvarint message length followed by binary proto.
+ p := make([]byte, binary.MaxVarintLen64)
+ n := binary.PutUvarint(p, uint64(len(bufMsg)))
+ p = append(p[:n], bufMsg...)
for done := 0; done < len(p); {
n, err := s.socket.Write(p[done:])
if err != nil {
@@ -166,6 +159,7 @@ func (s *socketEmitter) Emit(msg proto.Message) (bool, error) {
}
done += n
}
+
return false, nil
}
@@ -189,9 +183,13 @@ func DebugEmitterFrom(inner Emitter) Emitter {
}
func (d *debugEmitter) Emit(msg proto.Message) (bool, error) {
+ text, err := prototext.Marshal(msg)
+ if err != nil {
+ return false, err
+ }
ev := &pb.DebugEvent{
- Name: proto.MessageName(msg),
- Text: proto.MarshalTextString(msg),
+ Name: string(msg.ProtoReflect().Descriptor().FullName()),
+ Text: string(text),
}
return d.inner.Emit(ev)
}
diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto
index 34468f072..4b24ac47c 100644
--- a/pkg/eventchannel/event.proto
+++ b/pkg/eventchannel/event.proto
@@ -16,7 +16,7 @@ syntax = "proto3";
package gvisor;
-// A debug event encapsulates any other event protobuf in text format. This is
+// DebugEvent encapsulates any other event protobuf in text format. This is
// useful because clients reading events emitted this way do not need to link
// the event protobufs to display them in a human-readable format.
message DebugEvent {
diff --git a/pkg/eventchannel/event_any.go b/pkg/eventchannel/event_any.go
new file mode 100644
index 000000000..a5549f6cd
--- /dev/null
+++ b/pkg/eventchannel/event_any.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventchannel
+
+import (
+ "google.golang.org/protobuf/types/known/anypb"
+
+ "google.golang.org/protobuf/proto"
+)
+
+func newAny(m proto.Message) (*anypb.Any, error) {
+ return anypb.New(m)
+}
diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
index 43750360b..0dd408f76 100644
--- a/pkg/eventchannel/event_test.go
+++ b/pkg/eventchannel/event_test.go
@@ -19,7 +19,7 @@ import (
"testing"
"time"
- "github.com/golang/protobuf/proto"
+ "google.golang.org/protobuf/proto"
"gvisor.dev/gvisor/pkg/sync"
)
diff --git a/pkg/eventchannel/rate.go b/pkg/eventchannel/rate.go
index 179226c92..74960e16a 100644
--- a/pkg/eventchannel/rate.go
+++ b/pkg/eventchannel/rate.go
@@ -15,8 +15,8 @@
package eventchannel
import (
- "github.com/golang/protobuf/proto"
"golang.org/x/time/rate"
+ "google.golang.org/protobuf/proto"
)
// rateLimitedEmitter wraps an emitter and limits events to the given limits.
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index 58305009d..0a6a5d215 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -27,6 +27,6 @@ go_test(
deps = [
":metric_go_proto",
"//pkg/eventchannel",
- "@com_github_golang_protobuf//proto:go_default_library",
+ "@org_golang_google_protobuf//proto:go_default_library",
],
)
diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go
index c425ea532..aefd0ea5c 100644
--- a/pkg/metric/metric_test.go
+++ b/pkg/metric/metric_test.go
@@ -17,7 +17,7 @@ package metric
import (
"testing"
- "github.com/golang/protobuf/proto"
+ "google.golang.org/protobuf/proto"
"gvisor.dev/gvisor/pkg/eventchannel"
pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto"
)
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
index 8cf5b35d3..1e11b0428 100644
--- a/pkg/sentry/fsimpl/overlay/BUILD
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -21,14 +21,16 @@ go_library(
"directory.go",
"filesystem.go",
"fstree.go",
- "non_directory.go",
"overlay.go",
+ "regular_file.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/log",
+ "//pkg/sentry/arch",
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
@@ -37,5 +39,6 @@ go_library(
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
+ "//pkg/waiter",
],
)
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 73b126669..4506642ca 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -75,8 +75,21 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
return syserror.ENOENT
}
- // Perform copy-up.
+ // Obtain settable timestamps from the lower layer.
vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ oldpop := vfs.PathOperation{
+ Root: d.lowerVDs[0],
+ Start: d.lowerVDs[0],
+ }
+ const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME
+ oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{
+ Mask: timestampsMask,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Perform copy-up.
newpop := vfs.PathOperation{
Root: d.parent.upperVD,
Start: d.parent.upperVD,
@@ -101,10 +114,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
}
switch ftype {
case linux.S_IFREG:
- oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
- Root: d.lowerVDs[0],
- Start: d.lowerVDs[0],
- }, &vfs.OpenOptions{
+ oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{
Flags: linux.O_RDONLY,
})
if err != nil {
@@ -160,9 +170,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
}
if err := newFD.SetStat(ctx, vfs.SetStatOptions{
Stat: linux.Statx{
- Mask: linux.STATX_UID | linux.STATX_GID,
- UID: d.uid,
- GID: d.gid,
+ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
},
}); err != nil {
cleanupUndoCopyUp()
@@ -179,9 +191,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
}
if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
Stat: linux.Statx{
- Mask: linux.STATX_UID | linux.STATX_GID,
- UID: d.uid,
- GID: d.gid,
+ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
},
}); err != nil {
cleanupUndoCopyUp()
@@ -195,10 +209,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
d.upperVD = upperVD
case linux.S_IFLNK:
- target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
- Root: d.lowerVDs[0],
- Start: d.lowerVDs[0],
- })
+ target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop)
if err != nil {
return err
}
@@ -207,10 +218,12 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
}
if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
Stat: linux.Statx{
- Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
- Mode: uint16(d.mode),
- UID: d.uid,
- GID: d.gid,
+ Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ Mode: uint16(d.mode),
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
},
}); err != nil {
cleanupUndoCopyUp()
@@ -224,25 +237,20 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
d.upperVD = upperVD
case linux.S_IFBLK, linux.S_IFCHR:
- lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
- Root: d.lowerVDs[0],
- Start: d.lowerVDs[0],
- }, &vfs.StatOptions{})
- if err != nil {
- return err
- }
if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
Mode: linux.FileMode(d.mode),
- DevMajor: lowerStat.RdevMajor,
- DevMinor: lowerStat.RdevMinor,
+ DevMajor: oldStat.RdevMajor,
+ DevMinor: oldStat.RdevMinor,
}); err != nil {
return err
}
if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
Stat: linux.Statx{
- Mask: linux.STATX_UID | linux.STATX_GID,
- UID: d.uid,
- GID: d.gid,
+ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
},
}); err != nil {
cleanupUndoCopyUp()
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index bd11372d5..78a01bbb7 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -765,7 +765,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if mustCreate {
return nil, syserror.EEXIST
}
- if mayWrite {
+ if start.isRegularFile() && mayWrite {
if err := start.copyUpLocked(ctx); err != nil {
return nil, err
}
@@ -819,7 +819,7 @@ afterTrailingSymlink:
if rp.MustBeDir() && !child.isDir() {
return nil, syserror.ENOTDIR
}
- if mayWrite {
+ if child.isRegularFile() && mayWrite {
if err := child.copyUpLocked(ctx); err != nil {
return nil, err
}
@@ -872,8 +872,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
if err != nil {
return nil, err
}
+ if ftype != linux.S_IFREG {
+ return layerFD, nil
+ }
layerFlags := layerFD.StatusFlags()
- fd := &nonDirectoryFD{
+ fd := &regularFileFD{
copiedUp: isUpper,
cachedFD: layerFD,
cachedFlags: layerFlags,
@@ -969,7 +972,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
}
// Finally construct the overlay FD.
upperFlags := upperFD.StatusFlags()
- fd := &nonDirectoryFD{
+ fd := &regularFileFD{
copiedUp: true,
cachedFD: upperFD,
cachedFlags: upperFlags,
@@ -1293,6 +1296,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if !child.isDir() {
return syserror.ENOTDIR
}
+ if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+ return err
+ }
child.dirMu.Lock()
defer child.dirMu.Unlock()
whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
@@ -1528,12 +1534,38 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
+ parentMode := atomic.LoadUint32(&parent.mode)
child := parent.children[name]
var childLayer lookupLayer
+ if child == nil {
+ if parentMode&linux.S_ISVTX != 0 {
+ // If the parent's sticky bit is set, we need a child dentry to get
+ // its owner.
+ child, err = fs.getChildLocked(ctx, parent, name, &ds)
+ if err != nil {
+ return err
+ }
+ } else {
+ // Determine if the file being unlinked actually exists. Holding
+ // parent.dirMu prevents a dentry from being instantiated for the file,
+ // which in turn prevents it from being copied-up, so this result is
+ // stable.
+ childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+ if err != nil {
+ return err
+ }
+ if !childLayer.existsInOverlay() {
+ return syserror.ENOENT
+ }
+ }
+ }
if child != nil {
if child.isDir() {
return syserror.EISDIR
}
+ if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+ return err
+ }
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
@@ -1546,18 +1578,6 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
} else {
childLayer = lookupLayerLower
}
- } else {
- // Determine if the file being unlinked actually exists. Holding
- // parent.dirMu prevents a dentry from being instantiated for the file,
- // which in turn prevents it from being copied-up, so this result is
- // stable.
- childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
- if err != nil {
- return err
- }
- if !childLayer.existsInOverlay() {
- return syserror.ENOENT
- }
}
pop := vfs.PathOperation{
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index e5f506d2e..4c5de8d32 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -18,7 +18,7 @@
//
// Lock order:
//
-// directoryFD.mu / nonDirectoryFD.mu
+// directoryFD.mu / regularFileFD.mu
// filesystem.renameMu
// dentry.dirMu
// dentry.copyMu
@@ -453,7 +453,7 @@ type dentry struct {
// - If this dentry is copied-up, then wrappedMappable is the Mappable
// obtained from a call to the current top layer's
// FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
- // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil.
+ // (from a call to regularFileFD.ensureMappable()), it cannot become nil.
// wrappedMappable is protected by mapsMu and dataMu.
//
// - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/regular_file.go
index 853aee951..2b89a7a6d 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/regular_file.go
@@ -19,14 +19,21 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
)
+func (d *dentry) isRegularFile() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG
+}
+
func (d *dentry) isSymlink() bool {
return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
}
@@ -40,7 +47,7 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
}
// +stateify savable
-type nonDirectoryFD struct {
+type regularFileFD struct {
fileDescription
// If copiedUp is false, cachedFD represents
@@ -52,9 +59,13 @@ type nonDirectoryFD struct {
copiedUp bool
cachedFD *vfs.FileDescription
cachedFlags uint32
+
+ // If copiedUp is false, lowerWaiters contains all waiter.Entries
+ // registered with cachedFD. lowerWaiters is protected by mu.
+ lowerWaiters map[*waiter.Entry]waiter.EventMask
}
-func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
fd.mu.Lock()
defer fd.mu.Unlock()
wrappedFD, err := fd.currentFDLocked(ctx)
@@ -65,7 +76,7 @@ func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescriptio
return wrappedFD, nil
}
-func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
d := fd.dentry()
statusFlags := fd.vfsfd.StatusFlags()
if !fd.copiedUp && d.isCopiedUp() {
@@ -87,10 +98,21 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip
return nil, err
}
}
+ if len(fd.lowerWaiters) != 0 {
+ ready := upperFD.Readiness(^waiter.EventMask(0))
+ for e, mask := range fd.lowerWaiters {
+ fd.cachedFD.EventUnregister(e)
+ upperFD.EventRegister(e, mask)
+ if ready&mask != 0 {
+ e.Callback.Callback(e)
+ }
+ }
+ }
fd.cachedFD.DecRef(ctx)
fd.copiedUp = true
fd.cachedFD = upperFD
fd.cachedFlags = statusFlags
+ fd.lowerWaiters = nil
} else if fd.cachedFlags != statusFlags {
if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
return nil, err
@@ -101,13 +123,13 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *nonDirectoryFD) Release(ctx context.Context) {
+func (fd *regularFileFD) Release(ctx context.Context) {
fd.cachedFD.DecRef(ctx)
fd.cachedFD = nil
}
// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
// Linux doesn't define ovl_file_operations.flush at all (i.e. its
// equivalent to OnClose is a no-op). We pass through to
// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
@@ -128,7 +150,7 @@ func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
}
// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
var stat linux.Statx
if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
wrappedFD, err := fd.getCurrentFD(ctx)
@@ -149,7 +171,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux
}
// Allocate implements vfs.FileDescriptionImpl.Allocate.
-func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
wrappedFD, err := fd.getCurrentFD(ctx)
if err != nil {
return err
@@ -159,7 +181,7 @@ func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uin
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
d := fd.dentry()
mode := linux.FileMode(atomic.LoadUint32(&d.mode))
if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
@@ -191,12 +213,61 @@ func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions)
}
// StatFS implements vfs.FileDescriptionImpl.StatFS.
-func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) {
return fd.filesystem().statFS(ctx)
}
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ctx := context.Background()
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ // TODO(b/171089913): Just use fd.cachedFD since Readiness can't return
+ // an error. This is obviously wrong, but at least consistent with
+ // VFS1.
+ log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err)
+ fd.mu.Lock()
+ wrappedFD = fd.cachedFD
+ wrappedFD.IncRef()
+ fd.mu.Unlock()
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(context.Background())
+ if err != nil {
+ // TODO(b/171089913): Just use fd.cachedFD since EventRegister can't
+ // return an error. This is obviously wrong, but at least consistent
+ // with VFS1.
+ log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err)
+ wrappedFD = fd.cachedFD
+ }
+ wrappedFD.EventRegister(e, mask)
+ if !fd.copiedUp {
+ if fd.lowerWaiters == nil {
+ fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask)
+ }
+ fd.lowerWaiters[e] = mask
+ }
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *regularFileFD) EventUnregister(e *waiter.Entry) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ fd.cachedFD.EventUnregister(e)
+ if !fd.copiedUp {
+ delete(fd.lowerWaiters, e)
+ }
+}
+
// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
wrappedFD, err := fd.getCurrentFD(ctx)
if err != nil {
return 0, err
@@ -206,7 +277,7 @@ func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, off
}
// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
// Hold fd.mu during the read to serialize the file offset.
fd.mu.Lock()
defer fd.mu.Unlock()
@@ -218,7 +289,7 @@ func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
wrappedFD, err := fd.getCurrentFD(ctx)
if err != nil {
return 0, err
@@ -228,7 +299,7 @@ func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, of
}
// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
// Hold fd.mu during the write to serialize the file offset.
fd.mu.Lock()
defer fd.mu.Unlock()
@@ -240,7 +311,7 @@ func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opt
}
// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
// Hold fd.mu during the seek to serialize the file offset.
fd.mu.Lock()
defer fd.mu.Unlock()
@@ -252,7 +323,7 @@ func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32)
}
// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
+func (fd *regularFileFD) Sync(ctx context.Context) error {
fd.mu.Lock()
if !fd.dentry().isCopiedUp() {
fd.mu.Unlock()
@@ -269,8 +340,18 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
return wrappedFD.Sync(ctx)
}
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return 0, err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Ioctl(ctx, uio, args)
+}
+
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
-func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
if err := fd.ensureMappable(ctx, opts); err != nil {
return err
}
@@ -278,7 +359,7 @@ func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOp
}
// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
-func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
d := fd.dentry()
// Fast path if we already have a Mappable for the current top layer.
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 1813269e0..738c0c9cc 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
FSContext: kernel.NewFSContextVFS2(root, cwd, 0022),
FDTable: k.NewFDTable(),
}
- return k.TaskSet().NewTask(config)
+ t, err := k.TaskSet().NewTask(ctx, config)
+ if err != nil {
+ config.ThreadGroup.Release(ctx)
+ return nil, err
+ }
+ return t, nil
}
func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 3b3c8725f..03da505e1 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -377,12 +377,12 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
// enabled, we should verify the child hash here because it may
// be cached before enabled.
if fs.allowRuntimeEnable {
- if isEnabled(parent) {
+ if parent.verityEnabled() {
if _, err := fs.verifyChild(ctx, parent, child); err != nil {
return nil, err
}
}
- if isEnabled(child) {
+ if child.verityEnabled() {
vfsObj := fs.vfsfs.VirtualFilesystem()
mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
@@ -553,13 +553,13 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
// Verify child hash. This should always be performed unless in
// allowRuntimeEnable mode and the parent directory hasn't been enabled
// yet.
- if isEnabled(parent) {
+ if parent.verityEnabled() {
if _, err := fs.verifyChild(ctx, parent, child); err != nil {
child.destroyLocked(ctx)
return nil, err
}
}
- if isEnabled(child) {
+ if child.verityEnabled() {
if err := fs.verifyStat(ctx, child, stat); err != nil {
child.destroyLocked(ctx)
return nil, err
@@ -915,7 +915,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err != nil {
return linux.Statx{}, err
}
- if isEnabled(d) {
+ if d.verityEnabled() {
if err := fs.verifyStat(ctx, d, stat); err != nil {
return linux.Statx{}, err
}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 70034280b..8dc9e26bc 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -148,14 +148,6 @@ func (FilesystemType) Name() string {
return Name
}
-// isEnabled checks whether the target is enabled with verity features. It
-// should always be true if runtime enable is not allowed. In runtime enable
-// mode, it returns true if the target has been enabled with
-// ioctl(FS_IOC_ENABLE_VERITY).
-func isEnabled(d *dentry) bool {
- return !d.fs.allowRuntimeEnable || len(d.hash) != 0
-}
-
// Release implements vfs.FilesystemType.Release.
func (FilesystemType) Release(ctx context.Context) {}
@@ -448,6 +440,14 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
+func (d *dentry) verityEnabled() bool {
+ return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
func (d *dentry) readlink(ctx context.Context) (string, error) {
return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
Root: d.lowerVD,
@@ -510,7 +510,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
if err != nil {
return linux.Statx{}, err
}
- if isEnabled(fd.d) {
+ if fd.d.verityEnabled() {
if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil {
return linux.Statx{}, err
}
@@ -726,7 +726,7 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
// No need to verify if the file is not enabled yet in
// allowRuntimeEnable mode.
- if !isEnabled(fd.d) {
+ if !fd.d.verityEnabled() {
return fd.lowerFD.PRead(ctx, dst, offset, opts)
}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 5de70aecb..c0de72eef 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -97,6 +97,17 @@ go_template_instance(
)
go_template_instance(
+ name = "ipc_namespace_refs",
+ out = "ipc_namespace_refs.go",
+ package = "kernel",
+ prefix = "IPCNamespace",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "IPCNamespace",
+ },
+)
+
+go_template_instance(
name = "process_group_refs",
out = "process_group_refs.go",
package = "kernel",
@@ -137,6 +148,7 @@ go_library(
"fs_context.go",
"fs_context_refs.go",
"ipc_namespace.go",
+ "ipc_namespace_refs.go",
"kcov.go",
"kcov_unsafe.go",
"kernel.go",
@@ -206,6 +218,7 @@ go_library(
"//pkg/amutex",
"//pkg/bits",
"//pkg/bpf",
+ "//pkg/cleanup",
"//pkg/context",
"//pkg/coverage",
"//pkg/cpuid",
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index dd5f0f5fa..bb94769c4 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
}
// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
-// or nil if there is no such IPC namespace.
+// or nil if there is no such IPC namespace. A reference is taken on the
+// returned namespace; the caller is responsible for dropping it.
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
if v := ctx.Value(CtxIPCNamespace); v != nil {
return v.(*IPCNamespace)
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 80a070d7e..3f34ee0db 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
@@ -24,6 +25,8 @@ import (
//
// +stateify savable
type IPCNamespace struct {
+ IPCNamespaceRefs
+
// User namespace which owns this IPC namespace. Immutable.
userNS *auth.UserNamespace
@@ -33,11 +36,13 @@ type IPCNamespace struct {
// NewIPCNamespace creates a new IPC namespace.
func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
- return &IPCNamespace{
+ ns := &IPCNamespace{
userNS: userNS,
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
+ ns.EnableLeakCheck()
+ return ns
}
// SemaphoreRegistry returns the semaphore set registry for this namespace.
@@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
return i.shms
}
+// DecRef implements refs_vfs2.RefCounter.DecRef.
+func (i *IPCNamespace) DecRef(ctx context.Context) {
+ i.IPCNamespaceRefs.DecRef(func() {
+ i.shms.Release(ctx)
+ })
+}
+
// IPCNamespace returns the task's IPC namespace.
func (t *Task) IPCNamespace() *IPCNamespace {
t.mu.Lock()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 675506269..0eb2bf7bd 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -39,6 +39,7 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/eventchannel"
@@ -340,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
return fmt.Errorf("Timekeeper is nil")
}
if args.Timekeeper.clocks == nil {
- return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()")
+ return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
}
if args.RootUserNamespace == nil {
return fmt.Errorf("RootUserNamespace is nil")
@@ -365,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.useHostCores = true
maxCPU, err := hostcpu.MaxPossibleCPU()
if err != nil {
- return fmt.Errorf("Failed to get maximum CPU number: %v", err)
+ return fmt.Errorf("failed to get maximum CPU number: %v", err)
}
minAppCores := uint(maxCPU) + 1
if k.applicationCores < minAppCores {
@@ -828,7 +829,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.args.UTSNamespace
case CtxIPCNamespace:
- return ctx.args.IPCNamespace
+ ipcns := ctx.args.IPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
@@ -964,6 +967,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
}
tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+ cu := cleanup.Make(func() {
+ tg.Release(ctx)
+ })
+ defer cu.Clean()
// Check which file to start from.
switch {
@@ -1023,13 +1030,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
MountNamespaceVFS2: mntnsVFS2,
ContainerID: args.ContainerID,
}
- t, err := k.tasks.NewTask(config)
+ t, err := k.tasks.NewTask(ctx, config)
if err != nil {
return nil, 0, err
}
t.traceExecEvent(tc) // Simulate exec for tracing.
// Success.
+ cu.Release()
tgid := k.tasks.Root.IDOfThreadGroup(tg)
if k.globalInit == nil {
k.globalInit = tg
@@ -1374,8 +1382,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace {
return k.rootUTSNamespace
}
-// RootIPCNamespace returns the root IPCNamespace.
+// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ k.rootIPCNamespace.IncRef()
return k.rootIPCNamespace
}
@@ -1636,7 +1645,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.k.rootUTSNamespace
case CtxIPCNamespace:
- return ctx.k.rootIPCNamespace
+ ipcns := ctx.k.rootIPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index f61039f5b..1a152142b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -237,8 +237,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
// PipeSize implements fcntl(F_GETPIPE_SZ).
func (fd *VFSPipeFD) PipeSize() int64 {
- // Inline Pipe.FifoSize() rather than calling it with nil Context and
- // fs.File and ignoring the returned error (which is always nil).
+ // Inline Pipe.FifoSize() since we don't have a fs.File.
fd.pipe.mu.Lock()
defer fd.pipe.mu.Unlock()
return fd.pipe.max
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index b7e4b480d..f8a382fd8 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -27,6 +27,7 @@ go_library(
"//pkg/context",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refs_vfs2",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 00c03585e..ebbebf46b 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -321,9 +321,32 @@ func (r *Registry) remove(s *Shm) {
r.totalPages -= s.effectiveSize / usermem.PageSize
}
+// Release drops the self-reference of each active shm segment in the registry.
+// It is called when the kernel.IPCNamespace containing r is being destroyed.
+func (r *Registry) Release(ctx context.Context) {
+ // Because Shm.DecRef() may acquire the same locks, collect the segments to
+ // release first. Note that this should not race with any updates to r, since
+ // the IPC namespace containing it has no more references.
+ toRelease := make([]*Shm, 0)
+ r.mu.Lock()
+ for _, s := range r.keysToShms {
+ s.mu.Lock()
+ if !s.pendingDestruction {
+ toRelease = append(toRelease, s)
+ }
+ s.mu.Unlock()
+ }
+ r.mu.Unlock()
+
+ for _, s := range toRelease {
+ r.dissociateKey(s)
+ s.DecRef(ctx)
+ }
+}
+
// Shm represents a single shared memory segment.
//
-// Shm segment are backed directly by an allocation from platform memory.
+// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented; which requires special care when unmapping a
@@ -652,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) {
s.registry.dissociateKey(s)
s.mu.Lock()
- defer s.mu.Unlock()
- if !s.pendingDestruction {
- s.pendingDestruction = true
- // Drop the self-reference so destruction occurs when all
- // external references are gone.
- //
- // N.B. This cannot be the final DecRef, as the caller also
- // holds a reference.
- s.DecRef(ctx)
+ if s.pendingDestruction {
+ s.mu.Unlock()
return
}
+ s.pendingDestruction = true
+ s.mu.Unlock()
+
+ // Drop the self-reference so destruction occurs when all
+ // external references are gone.
+ //
+ // N.B. This cannot be the final DecRef, as the caller also
+ // holds a reference.
+ s.DecRef(ctx)
+ return
}
// checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e90a19cfb..037971393 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -656,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return t.utsns
case CtxIPCNamespace:
- return t.ipcns
+ ipcns := t.IPCNamespace()
+ ipcns.IncRef()
+ return ipcns
case CtxTask:
return t
case auth.CtxCredentials:
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index fce1064a7..682080c14 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -203,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
ipcns = NewIPCNamespace(userns)
+ } else {
+ ipcns.IncRef()
}
+ cu := cleanup.Make(func() {
+ ipcns.DecRef(t)
+ })
+ defer cu.Clean()
netns := t.NetworkNamespace()
if opts.NewNetworkNamespace {
@@ -214,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
mntnsVFS2 := t.mountNamespaceVFS2
if mntnsVFS2 != nil {
mntnsVFS2.IncRef()
+ cu.Add(func() {
+ mntnsVFS2.DecRef(t)
+ })
}
tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
if err != nil {
return 0, nil, err
}
+ cu.Add(func() {
+ tc.release()
+ })
// clone() returns 0 in the child.
tc.Arch.SetReturn(0)
if opts.Stack != 0 {
@@ -295,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
} else {
cfg.InheritParent = t
}
- nt, err := t.tg.pidns.owner.NewTask(cfg)
+ nt, err := t.tg.pidns.owner.NewTask(t, cfg)
+ // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
+ // the cleanup for us.
+ cu.Release()
if err != nil {
- if opts.NewThreadGroup {
- tg.release(t)
- }
return 0, nil, err
}
@@ -509,6 +522,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
+ t.ipcns.DecRef(t)
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
var oldFDTable *FDTable
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b400a8b41..ce7b9641d 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -280,12 +280,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.mountNamespaceVFS2.DecRef(t)
t.mountNamespaceVFS2 = nil
}
+ t.ipcns.DecRef(t)
t.mu.Unlock()
// If this is the last task to exit from the thread group, release the
// thread group's resources.
if lastExiter {
- t.tg.release(t)
+ t.tg.Release(t)
}
// Detach tracees.
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 64c1e120a..8e28230cc 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -98,14 +99,18 @@ type TaskConfig struct {
// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
-func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
+//
+// If successful, NewTask transfers references held by cfg to the new task.
+// Otherwise, NewTask releases them.
+func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
t, err := ts.newTask(cfg)
if err != nil {
cfg.TaskContext.release()
- cfg.FSContext.DecRef(t)
- cfg.FDTable.DecRef(t)
+ cfg.FSContext.DecRef(ctx)
+ cfg.FDTable.DecRef(ctx)
+ cfg.IPCNamespace.DecRef(ctx)
if cfg.MountNamespaceVFS2 != nil {
- cfg.MountNamespaceVFS2.DecRef(t)
+ cfg.MountNamespaceVFS2.DecRef(ctx)
}
return nil, err
}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 0b34c0099..a183b28c1 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
return tg.limits
}
-// release releases the thread group's resources.
-func (tg *ThreadGroup) release(t *Task) {
+// Release releases the thread group's resources.
+func (tg *ThreadGroup) Release(ctx context.Context) {
// Timers must be destroyed without holding the TaskSet or signal mutexes
// since timers send signals with Timer.mu locked.
tg.itimerRealTimer.Destroy()
@@ -325,7 +326,7 @@ func (tg *ThreadGroup) release(t *Task) {
it.DestroyTimer()
}
if tg.mounts != nil {
- tg.mounts.DecRef(t)
+ tg.mounts.DecRef(ctx)
}
}
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 7a3311a70..5b09b9feb 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -83,6 +83,7 @@ go_library(
],
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/abi/linux",
"//pkg/context",
"//pkg/log",
"//pkg/memutil",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 626d1eaa4..7c297fb9e 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -29,6 +29,7 @@ import (
"syscall"
"time"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
@@ -224,6 +225,18 @@ type usageInfo struct {
refs uint64
}
+// canCommit returns true if the tracked region can be committed.
+func (u *usageInfo) canCommit() bool {
+ // refs must be greater than 0 because we assume that reclaimable pages
+ // (that aren't already known to be committed) are not committed. This
+ // isn't necessarily true, even after the reclaimer does Decommit(),
+ // because the kernel may subsequently back the hugepage-sized region
+ // containing the decommitted page with a hugepage. However, it's
+ // consistent with our treatment of unallocated pages, which have the same
+ // property.
+ return !u.knownCommitted && u.refs != 0
+}
+
// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
type EvictableMemoryUser interface {
@@ -828,6 +841,11 @@ func (f *MemoryFile) UpdateUsage() error {
log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
return nil
}
+ // Linux updates usage values at CONFIG_HZ.
+ if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
+ log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
+ return nil
+ }
f.usageLast = time.Now()
err = f.updateUsageLocked(currentUsage, mincore)
@@ -841,7 +859,7 @@ func (f *MemoryFile) UpdateUsage() error {
// pages by invoking checkCommitted, which is a function that, for each page i
// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
//
-// Precondition: f.mu must be held.
+// Precondition: f.mu must be held; it may be unlocked and reacquired.
func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
// Track if anything changed to elide the merge. In the common case, we
// expect all segments to be committed and no merge to occur.
@@ -868,7 +886,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
} else if f.usageSwapped != 0 {
// We have more usage accounted for than the file itself.
// That's fine, we probably caught a race where pages were
- // being committed while the above loop was running. Just
+ // being committed while the below loop was running. Just
// report the higher number that we found and ignore swap.
usage.MemoryAccounting.Dec(f.usageSwapped, usage.System)
f.usageSwapped = 0
@@ -880,21 +898,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
// Iterate over all usage data. There will only be usage segments
// present when there is an associated reference.
- for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- val := seg.Value()
-
- // Already known to be committed; ignore.
- if val.knownCommitted {
- continue
- }
-
- // Assume that reclaimable pages (that aren't already known to be
- // committed) are not committed. This isn't necessarily true, even
- // after the reclaimer does Decommit(), because the kernel may
- // subsequently back the hugepage-sized region containing the
- // decommitted page with a hugepage. However, it's consistent with our
- // treatment of unallocated pages, which have the same property.
- if val.refs == 0 {
+ for seg := f.usage.FirstSegment(); seg.Ok(); {
+ if !seg.ValuePtr().canCommit() {
+ seg = seg.NextSegment()
continue
}
@@ -917,56 +923,53 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
}
// Query for new pages in core.
- if err := checkCommitted(s, buf); err != nil {
+ // NOTE(b/165896008): mincore (which is passed as checkCommitted)
+ // by f.UpdateUsage() might take a really long time. So unlock f.mu
+ // while checkCommitted runs.
+ f.mu.Unlock()
+ err := checkCommitted(s, buf)
+ f.mu.Lock()
+ if err != nil {
checkErr = err
return
}
// Scan each page and switch out segments.
- populatedRun := false
- populatedRunStart := 0
- for i := 0; i <= bufLen; i++ {
- // We run past the end of the slice here to
- // simplify the logic and only set populated if
- // we're still looking at elements.
- populated := false
- if i < bufLen {
- populated = buf[i]&0x1 != 0
- }
-
- switch {
- case populated == populatedRun:
- // Keep the run going.
- continue
- case populated && !populatedRun:
- // Begin the run.
- populatedRun = true
- populatedRunStart = i
- // Keep going.
+ seg := f.usage.LowerBoundSegment(r.Start)
+ for i := 0; i < bufLen; {
+ if buf[i]&0x1 == 0 {
+ i++
continue
- case !populated && populatedRun:
- // Finish the run by changing this segment.
- runRange := memmap.FileRange{
- Start: r.Start + uint64(populatedRunStart*usermem.PageSize),
- End: r.Start + uint64(i*usermem.PageSize),
+ }
+ // Scan to the end of this committed range.
+ j := i + 1
+ for ; j < bufLen; j++ {
+ if buf[j]&0x1 == 0 {
+ break
}
- seg = f.usage.Isolate(seg, runRange)
- seg.ValuePtr().knownCommitted = true
- // Advance the segment only if we still
- // have work to do in the context of
- // the original segment from the for
- // loop. Otherwise, the for loop itself
- // will advance the segment
- // appropriately.
- if runRange.End != r.End {
- seg = seg.NextSegment()
+ }
+ committedFR := memmap.FileRange{
+ Start: r.Start + uint64(i*usermem.PageSize),
+ End: r.Start + uint64(j*usermem.PageSize),
+ }
+ // Advance seg to committedFR.Start.
+ for seg.Ok() && seg.End() < committedFR.Start {
+ seg = seg.NextSegment()
+ }
+ // Mark pages overlapping committedFR as committed.
+ for seg.Ok() && seg.Start() < committedFR.End {
+ if seg.ValuePtr().canCommit() {
+ seg = f.usage.Isolate(seg, committedFR)
+ seg.ValuePtr().knownCommitted = true
+ amount := seg.Range().Length()
+ usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind)
+ f.usageExpected += amount
+ changedAny = true
}
- amount := runRange.Length()
- usage.MemoryAccounting.Inc(amount, val.kind)
- f.usageExpected += amount
- changedAny = true
- populatedRun = false
+ seg = seg.NextSegment()
}
+ // Continue scanning for committed pages.
+ i = j + 1
}
// Advance r.Start.
@@ -978,6 +981,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
if err != nil {
return err
}
+
+ // Continue with the first segment after r.End.
+ seg = f.usage.LowerBoundSegment(r.End)
}
return nil
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index ed5ae03d3..58f3d6fdd 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -39,6 +39,16 @@ var (
}
)
+// getTLS returns the value of TPIDR_EL0 register.
+//
+//go:nosplit
+func getTLS() (value uint64)
+
+// setTLS writes the TPIDR_EL0 value.
+//
+//go:nosplit
+func setTLS(value uint64)
+
// bluepillArchEnter is called during bluepillEnter.
//
//go:nosplit
@@ -51,6 +61,8 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
regs.Pstate = context.Pstate
regs.Pstate &^= uint64(ring0.PsrFlagsClear)
regs.Pstate |= ring0.KernelFlagsSet
+ regs.TPIDR_EL0 = getTLS()
+
return
}
@@ -65,6 +77,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
context.Pstate = regs.Pstate
context.Pstate &^= uint64(ring0.PsrFlagsClear)
context.Pstate |= ring0.UserFlagsSet
+ setTLS(regs.TPIDR_EL0)
lazyVfp := c.GetLazyVFP()
if lazyVfp != 0 {
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index 04efa0147..09c7e88e5 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -32,6 +32,18 @@
#define CONTEXT_PC 0x1B8
#define CONTEXT_R0 0xB8
+// getTLS returns the value of TPIDR_EL0 register.
+TEXT ·getTLS(SB),NOSPLIT,$0-8
+ MRS TPIDR_EL0, R1
+ MOVD R1, ret+0(FP)
+ RET
+
+// setTLS writes its uint64 argument to the TPIDR_EL0 register.
+TEXT ·setTLS(SB),NOSPLIT,$0-8
+ MOVD value+0(FP), R1 // Operand name matches the Go declaration setTLS(value uint64).
+ MSR R1, TPIDR_EL0
+ RET
+
// See bluepill.go.
TEXT ·bluepill(SB),NOSPLIT,$0
begin:
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 2f1abcb0f..d91a09de1 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -53,12 +53,6 @@ func LoadFloatingPoint(*byte)
// SaveFloatingPoint saves floating point state.
func SaveFloatingPoint(*byte)
-// GetTLS returns the value of TPIDR_EL0 register.
-func GetTLS() (value uint64)
-
-// SetTLS writes the TPIDR_EL0 value.
-func SetTLS(value uint64)
-
// Init sets function pointers based on architectural features.
//
// This must be called prior to using ring0.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 8aabf7d0e..da9d3cf55 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -29,16 +29,6 @@ TEXT ·FlushTlbAll(SB),NOSPLIT,$0
ISB $15
RET
-TEXT ·GetTLS(SB),NOSPLIT,$0-8
- MRS TPIDR_EL0, R1
- MOVD R1, ret+0(FP)
- RET
-
-TEXT ·SetTLS(SB),NOSPLIT,$0-8
- MOVD addr+0(FP), R1
- MSR R1, TPIDR_EL0
- RET
-
TEXT ·CPACREL1(SB),NOSPLIT,$0-8
WORD $0xd5381041 // MRS CPACR_EL1, R1
MOVD R1, ret+0(FP)
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 5ddcd4be5..3baad098b 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -16,6 +16,7 @@
package netlink
import (
+ "io"
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -748,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t
buf := make([]byte, src.NumBytes())
n, err := src.CopyIn(ctx, buf)
+ // io.EOF can only be returned if src is a file; this means that
+ // sendMsg was called from splice, and the error has to be ignored in
+ // this case.
+ if err == io.EOF {
+ err = nil
+ }
if err != nil {
// Don't partially consume messages.
return 0, syserr.FromError(err)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 87e30d742..211f07947 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -587,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
}
v := buffer.NewView(size)
if _, err := i.src.CopyIn(i.ctx, v); err != nil {
+ // io.EOF can only be returned if src is a file, which means this is
+ // a splice syscall; in that case the error has to be ignored.
+ if err == io.EOF {
+ return v, nil
+ }
return nil, tcpip.ErrBadAddress
}
return v, nil
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index f80011ce4..a4a76d0a3 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -573,13 +573,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
if dst.NumBytes() == 0 {
return 0, nil
}
- return dst.CopyOutFrom(ctx, &EndpointReader{
+ r := &EndpointReader{
Ctx: ctx,
Endpoint: s.ep,
NumRights: 0,
Peek: false,
From: nil,
- })
+ }
+ n, err := dst.CopyOutFrom(ctx, r)
+ // Drop control messages.
+ r.Control.Release(ctx)
+ return n, err
}
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 3345124cc..678355fb9 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -267,13 +267,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.
if dst.NumBytes() == 0 {
return 0, nil
}
- return dst.CopyOutFrom(ctx, &EndpointReader{
+ r := &EndpointReader{
Ctx: ctx,
Endpoint: s.ep,
NumRights: 0,
Peek: false,
From: nil,
- })
+ }
+ n, err := dst.CopyOutFrom(ctx, r)
+ // Drop control messages.
+ r.Control.Release(ctx)
+ return n, err
}
// PWrite implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 9feaca0da..9cd052c3d 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -1052,7 +1052,9 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
- if err != nil {
+ // Control messages should be released on error as well as for zero-length
+ // messages, which are discarded by the receiver.
+ if n == 0 || err != nil {
controlMessages.Release(t)
}
return uintptr(n), err
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
index 6320593f0..db3d924d9 100644
--- a/pkg/sentry/syscalls/linux/sys_sysinfo.go
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -21,7 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
)
-// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo.
+// Sysinfo implements Linux syscall sysinfo(2).
func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index d8b8d9783..36e89700e 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -145,16 +145,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(file.StatusFlags()), nil, nil
case linux.F_SETFL:
return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
- case linux.F_SETPIPE_SZ:
- pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
- if !ok {
- return 0, nil, syserror.EBADF
- }
- n, err := pipefile.SetPipeSize(int64(args[2].Int()))
- if err != nil {
- return 0, nil, err
- }
- return uintptr(n), nil, nil
case linux.F_GETOWN:
owner, hasOwner := getAsyncOwner(t, file)
if !hasOwner {
@@ -190,6 +180,16 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
+ case linux.F_SETPIPE_SZ:
+ pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+ if !ok {
+ return 0, nil, syserror.EBADF
+ }
+ n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(n), nil, nil
case linux.F_GETPIPE_SZ:
pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
if !ok {
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index bfae6b7e9..7b33b3f59 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -1055,7 +1055,9 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
- if err != nil {
+ // Control messages should be released on error as well as for zero-length
+ // messages, which are discarded by the receiver.
+ if n == 0 || err != nil {
controlMessages.Release(t)
}
return uintptr(n), err
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index bf5c1171f..035e2a6b0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -45,6 +45,9 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if count > int64(kernel.MAX_RW_COUNT) {
count = int64(kernel.MAX_RW_COUNT)
}
+ if count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
// Check for invalid flags.
if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
@@ -192,6 +195,9 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if count > int64(kernel.MAX_RW_COUNT) {
count = int64(kernel.MAX_RW_COUNT)
}
+ if count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
// Check for invalid flags.
if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index ab1d140d2..5ed6726ab 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -278,7 +278,7 @@ func TotalMemory(memSize, used uint64) uint64 {
}
if memSize < used {
memSize = used
- // Bump totalSize to the next largest power of 2, if one exists, so
+ // Bump memSize to the next largest power of 2, if one exists, so
// that MemFree isn't 0.
if msb := bits.MostSignificantOne64(memSize); msb < 63 {
memSize = uint64(1) << (uint(msb) + 1)
diff --git a/pkg/shim/v2/runtimeoptions/BUILD b/pkg/shim/v2/runtimeoptions/BUILD
index ba2ed1ea7..abb8c3be3 100644
--- a/pkg/shim/v2/runtimeoptions/BUILD
+++ b/pkg/shim/v2/runtimeoptions/BUILD
@@ -11,12 +11,12 @@ proto_library(
go_library(
name = "runtimeoptions",
- srcs = ["runtimeoptions.go"],
- visibility = ["//pkg/shim/v2:__pkg__"],
- deps = [
- ":api_go_proto",
- "@com_github_gogo_protobuf//proto:go_default_library",
+ srcs = [
+ "runtimeoptions.go",
+ "runtimeoptions_cri.go",
],
+ visibility = ["//pkg/shim/v2:__pkg__"],
+ deps = ["@com_github_gogo_protobuf//proto:go_default_library"],
)
go_test(
@@ -27,6 +27,6 @@ go_test(
deps = [
"@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
"@com_github_containerd_typeurl//:go_default_library",
- "@com_github_golang_protobuf//proto:go_default_library",
+ "@com_github_gogo_protobuf//proto:go_default_library",
],
)
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.go b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
index aaf17b87a..072dd87f0 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions.go
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
@@ -13,18 +13,5 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Package runtimeoptions contains the runtimeoptions proto.
package runtimeoptions
-
-import (
- proto "github.com/gogo/protobuf/proto"
- pb "gvisor.dev/gvisor/pkg/shim/v2/runtimeoptions/api_go_proto"
-)
-
-type Options = pb.Options
-
-func init() {
- // The generated proto file auto registers with "golang/protobuf/proto"
- // package. However, typeurl uses "golang/gogo/protobuf/proto". So registers
- // the type there too.
- proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
-}
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go
new file mode 100644
index 000000000..e6102b4cf
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go
@@ -0,0 +1,383 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package runtimeoptions
+
+import (
+ "fmt"
+ "io"
+ "reflect"
+ "strings"
+
+ proto "github.com/gogo/protobuf/proto"
+)
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the proto package it is being compiled against.
+// A compilation error at this line likely means your copy of the
+// proto package needs to be updated.
+const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package
+
+// Options is the message registered with the proto package under the name
+// "cri.runtimeoptions.v1.Options". It identifies a runtime config file and
+// the type of the content stored in it.
+type Options struct {
+ // TypeUrl specifies the type of the content inside the config file.
+ TypeUrl string `protobuf:"bytes,1,opt,name=type_url,json=typeUrl,proto3" json:"type_url,omitempty"`
+ // ConfigPath specifies the filesystem location of the config file
+ // used by the runtime.
+ ConfigPath string `protobuf:"bytes,2,opt,name=config_path,json=configPath,proto3" json:"config_path,omitempty"`
+}
+
+// Reset overwrites m with the zero-valued Options.
+func (m *Options) Reset() {
+	*m = Options{}
+}
+
+// ProtoMessage is an empty marker method.
+func (*Options) ProtoMessage() {
+}
+
+// Descriptor returns the gzipped file descriptor bytes together with the
+// index path ([0]) of this message within that descriptor.
+func (*Options) Descriptor() ([]byte, []int) {
+	path := []int{0}
+	return fileDescriptorApi, path
+}
+
+// GetTypeUrl returns m.TypeUrl, or the empty string when m is nil.
+func (m *Options) GetTypeUrl() string {
+	if m == nil {
+		return ""
+	}
+	return m.TypeUrl
+}
+
+// GetConfigPath returns m.ConfigPath, or the empty string when m is nil.
+func (m *Options) GetConfigPath() string {
+	if m == nil {
+		return ""
+	}
+	return m.ConfigPath
+}
+
+// init registers Options with the gogo proto package under its fully
+// qualified message name.
+func init() {
+ proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
+}
+
+// Marshal encodes m into a freshly allocated buffer of m.Size() bytes and
+// returns the portion of it that MarshalTo reported as written.
+func (m *Options) Marshal() (dAtA []byte, err error) {
+	buf := make([]byte, m.Size())
+	written, err := m.MarshalTo(buf)
+	if err != nil {
+		return nil, err
+	}
+	return buf[:written], nil
+}
+
+// MarshalTo writes the wire encoding of m into dAtA starting at offset 0 and
+// returns the number of bytes written. Empty string fields are omitted. The
+// returned error is always nil.
+func (m *Options) MarshalTo(dAtA []byte) (int, error) {
+	pos := 0
+	// putString emits one length-delimited field: the key byte, the varint
+	// length and then the raw string bytes.
+	putString := func(key byte, s string) {
+		dAtA[pos] = key
+		pos++
+		pos = encodeVarintApi(dAtA, pos, uint64(len(s)))
+		pos += copy(dAtA[pos:], s)
+	}
+	if m.TypeUrl != "" {
+		putString(0xa, m.TypeUrl)
+	}
+	if m.ConfigPath != "" {
+		putString(0x12, m.ConfigPath)
+	}
+	return pos, nil
+}
+
+// encodeVarintApi writes v into dAtA at offset as a base-128 varint (seven
+// payload bits per byte, high bit set on every byte except the last) and
+// returns the offset just past the final byte written.
+func encodeVarintApi(dAtA []byte, offset int, v uint64) int {
+	for ; v >= 0x80; v >>= 7 {
+		dAtA[offset] = byte(v&0x7f) | 0x80
+		offset++
+	}
+	dAtA[offset] = byte(v)
+	return offset + 1
+}
+
+// Size returns the number of bytes the wire encoding of m occupies. Each
+// non-empty string field contributes one key byte, its varint-encoded length
+// and the string bytes themselves.
+func (m *Options) Size() (n int) {
+	for _, field := range [...]string{m.TypeUrl, m.ConfigPath} {
+		if l := len(field); l > 0 {
+			n += 1 + l + sovApi(uint64(l))
+		}
+	}
+	return n
+}
+
+// sovApi returns how many bytes x occupies when encoded as a varint: one
+// byte, plus one more for every further group of seven significant bits.
+func sovApi(x uint64) (n int) {
+	n = 1
+	for x >>= 7; x != 0; x >>= 7 {
+		n++
+	}
+	return n
+}
+
+// sozApi returns the varint size of x after applying the zigzag transform
+// (x << 1) ^ (x >> 63, arithmetic shift).
+func sozApi(x uint64) (n int) {
+	zigzag := (x << 1) ^ uint64(int64(x)>>63)
+	return sovApi(zigzag)
+}
+
+// String renders the message as "&Options{TypeUrl:...,ConfigPath:...,}", or
+// "nil" for a nil receiver.
+func (this *Options) String() string {
+	if this == nil {
+		return "nil"
+	}
+	var sb strings.Builder
+	sb.WriteString(`&Options{`)
+	sb.WriteString(`TypeUrl:` + fmt.Sprintf("%v", this.TypeUrl) + `,`)
+	sb.WriteString(`ConfigPath:` + fmt.Sprintf("%v", this.ConfigPath) + `,`)
+	sb.WriteString(`}`)
+	return sb.String()
+}
+
+// valueToStringApi formats the value v points to as "*<value>", or returns
+// "nil" when reflect reports v as nil.
+func valueToStringApi(v interface{}) string {
+	if ptr := reflect.ValueOf(v); !ptr.IsNil() {
+		return fmt.Sprintf("*%v", reflect.Indirect(ptr).Interface())
+	}
+	return "nil"
+}
+
+// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
+//
+// Field 1 (TypeUrl) and field 2 (ConfigPath) must be length-delimited; every
+// other field number is stepped over with skipApi. Fields that do not occur
+// in dAtA are left unchanged.
+//
+// It returns io.ErrUnexpectedEOF for truncated input, ErrIntOverflowApi for a
+// varint longer than 64 bits, and ErrInvalidLengthApi for a length that is
+// negative or that overflows int when added to the current offset.
+func (m *Options) Unmarshal(dAtA []byte) error {
+	l := len(dAtA)
+	iNdEx := 0
+	for iNdEx < l {
+		preIndex := iNdEx
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return ErrIntOverflowApi
+			}
+			if iNdEx >= l {
+				return io.ErrUnexpectedEOF
+			}
+			b := dAtA[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		fieldNum := int32(wire >> 3)
+		wireType := int(wire & 0x7)
+		if wireType == 4 {
+			return fmt.Errorf("proto: Options: wiretype end group for non-group")
+		}
+		if fieldNum <= 0 {
+			return fmt.Errorf("proto: Options: illegal tag %d (wire type %d)", fieldNum, wire)
+		}
+		switch fieldNum {
+		case 1:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field TypeUrl", wireType)
+			}
+			var stringLen uint64
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				stringLen |= (uint64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			intStringLen := int(stringLen)
+			if intStringLen < 0 {
+				return ErrInvalidLengthApi
+			}
+			postIndex := iNdEx + intStringLen
+			// A length close to the maximum int wraps the sum negative, which
+			// would pass the "> l" check below and panic when slicing.
+			if postIndex < 0 {
+				return ErrInvalidLengthApi
+			}
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.TypeUrl = string(dAtA[iNdEx:postIndex])
+			iNdEx = postIndex
+		case 2:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field ConfigPath", wireType)
+			}
+			var stringLen uint64
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				stringLen |= (uint64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			intStringLen := int(stringLen)
+			if intStringLen < 0 {
+				return ErrInvalidLengthApi
+			}
+			postIndex := iNdEx + intStringLen
+			// Same overflow guard as for field 1.
+			if postIndex < 0 {
+				return ErrInvalidLengthApi
+			}
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.ConfigPath = string(dAtA[iNdEx:postIndex])
+			iNdEx = postIndex
+		default:
+			// Unknown field: rewind to the start of its tag and skip it whole.
+			iNdEx = preIndex
+			skippy, err := skipApi(dAtA[iNdEx:])
+			if err != nil {
+				return err
+			}
+			// Reject a negative skip as well as one that overflows the offset.
+			if skippy < 0 || iNdEx+skippy < 0 {
+				return ErrInvalidLengthApi
+			}
+			if (iNdEx + skippy) > l {
+				return io.ErrUnexpectedEOF
+			}
+			iNdEx += skippy
+		}
+	}
+
+	if iNdEx > l {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+
+// skipApi returns the number of bytes taken up by the first field (tag plus
+// value) at the start of dAtA, so that a caller can step over a field it does
+// not know.
+//
+// The returned count is not checked against len(dAtA) for fixed-width and
+// length-delimited values; callers must do that themselves. It returns
+// io.ErrUnexpectedEOF when dAtA is empty or ends inside a varint,
+// ErrIntOverflowApi for a varint longer than 64 bits, and ErrInvalidLengthApi
+// when a length is negative or pushes the offset past the range of int.
+func skipApi(dAtA []byte) (n int, err error) {
+	l := len(dAtA)
+	iNdEx := 0
+	for iNdEx < l {
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return 0, ErrIntOverflowApi
+			}
+			if iNdEx >= l {
+				return 0, io.ErrUnexpectedEOF
+			}
+			b := dAtA[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		wireType := int(wire & 0x7)
+		switch wireType {
+		case 0:
+			// Varint: consume bytes up to and including the first one with
+			// the continuation bit clear.
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return 0, ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return 0, io.ErrUnexpectedEOF
+				}
+				iNdEx++
+				if dAtA[iNdEx-1] < 0x80 {
+					break
+				}
+			}
+			return iNdEx, nil
+		case 1:
+			// 64-bit fixed-width value.
+			iNdEx += 8
+			return iNdEx, nil
+		case 2:
+			// Length-delimited value: varint length followed by that many bytes.
+			var length int
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return 0, ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return 0, io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				length |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			// Validate before advancing, then make sure the sum did not wrap.
+			if length < 0 {
+				return 0, ErrInvalidLengthApi
+			}
+			iNdEx += length
+			if iNdEx < 0 {
+				return 0, ErrInvalidLengthApi
+			}
+			return iNdEx, nil
+		case 3:
+			// Start of group: skip nested fields until the end-group tag.
+			for {
+				var innerWire uint64
+				start := iNdEx
+				for shift := uint(0); ; shift += 7 {
+					if shift >= 64 {
+						return 0, ErrIntOverflowApi
+					}
+					if iNdEx >= l {
+						return 0, io.ErrUnexpectedEOF
+					}
+					b := dAtA[iNdEx]
+					iNdEx++
+					innerWire |= (uint64(b) & 0x7F) << shift
+					if b < 0x80 {
+						break
+					}
+				}
+				innerWireType := int(innerWire & 0x7)
+				if innerWireType == 4 {
+					break
+				}
+				next, err := skipApi(dAtA[start:])
+				if err != nil {
+					return 0, err
+				}
+				iNdEx = start + next
+				// A wrapped offset would index dAtA with a negative value on
+				// the next iteration.
+				if iNdEx < 0 {
+					return 0, ErrInvalidLengthApi
+				}
+			}
+			return iNdEx, nil
+		case 4:
+			return iNdEx, nil
+		case 5:
+			// 32-bit fixed-width value.
+			iNdEx += 4
+			return iNdEx, nil
+		default:
+			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
+		}
+	}
+	// Only reached when dAtA is empty: there is no field to skip.
+	return 0, io.ErrUnexpectedEOF
+}
+
+// Errors returned by Unmarshal and skipApi for malformed input.
+var (
+ // ErrInvalidLengthApi reports a negative length in the encoded data.
+ ErrInvalidLengthApi = fmt.Errorf("proto: negative length found during unmarshaling")
+ // ErrIntOverflowApi reports a varint that does not fit in 64 bits.
+ ErrIntOverflowApi = fmt.Errorf("proto: integer overflow")
+)
+
+// init registers the gzipped file descriptor under the file name "api.proto".
+func init() { proto.RegisterFile("api.proto", fileDescriptorApi) }
+
+// fileDescriptorApi is the descriptor data returned by Options.Descriptor and
+// registered with the proto package as "api.proto".
+var fileDescriptorApi = []byte{
+ // 183 bytes of a gzipped FileDescriptorProto
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4c, 0x2c, 0xc8, 0xd4,
+ 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x12, 0x4d, 0x2e, 0xca, 0xd4, 0x2b, 0x2a, 0xcd, 0x2b, 0xc9,
+ 0xcc, 0x4d, 0xcd, 0x2f, 0x28, 0xc9, 0xcc, 0xcf, 0x2b, 0xd6, 0x2b, 0x33, 0x94, 0xd2, 0x4d, 0xcf,
+ 0x2c, 0xc9, 0x28, 0x4d, 0xd2, 0x4b, 0xce, 0xcf, 0xd5, 0x4f, 0xcf, 0x4f, 0xcf, 0xd7, 0x07, 0xab,
+ 0x4e, 0x2a, 0x4d, 0x03, 0xf3, 0xc0, 0x1c, 0x30, 0x0b, 0x62, 0x8a, 0x92, 0x2b, 0x17, 0xbb, 0x3f,
+ 0x44, 0xb3, 0x90, 0x24, 0x17, 0x47, 0x49, 0x65, 0x41, 0x6a, 0x7c, 0x69, 0x51, 0x8e, 0x04, 0xa3,
+ 0x02, 0xa3, 0x06, 0x67, 0x10, 0x3b, 0x88, 0x1f, 0x5a, 0x94, 0x23, 0x24, 0xcf, 0xc5, 0x9d, 0x9c,
+ 0x9f, 0x97, 0x96, 0x99, 0x1e, 0x5f, 0x90, 0x58, 0x92, 0x21, 0xc1, 0x04, 0x96, 0xe5, 0x82, 0x08,
+ 0x05, 0x24, 0x96, 0x64, 0x38, 0xc9, 0x9c, 0x78, 0x28, 0xc7, 0x78, 0xe3, 0xa1, 0x1c, 0x43, 0xc3,
+ 0x23, 0x39, 0xc6, 0x13, 0x8f, 0xe4, 0x18, 0x2f, 0x3c, 0x92, 0x63, 0x7c, 0xf0, 0x48, 0x8e, 0x71,
+ 0xc2, 0x63, 0x39, 0x86, 0x24, 0x36, 0xb0, 0x5d, 0xc6, 0x80, 0x00, 0x00, 0x00, 0xff, 0xff, 0x07,
+ 0x00, 0xf2, 0x18, 0xbe, 0x00, 0x00, 0x00,
+}
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
index f4c238a00..c59a2400e 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
@@ -15,11 +15,12 @@
package runtimeoptions
import (
+ "bytes"
"testing"
shim "github.com/containerd/containerd/runtime/v1/shim/v1"
"github.com/containerd/typeurl"
- "github.com/golang/protobuf/proto"
+ "github.com/gogo/protobuf/proto"
)
func TestCreateTaskRequest(t *testing.T) {
@@ -32,7 +33,11 @@ func TestCreateTaskRequest(t *testing.T) {
if err := proto.UnmarshalText(encodedText, got); err != nil {
t.Fatalf("unable to unmarshal text: %v", err)
}
- t.Logf("got: %s", proto.MarshalTextString(got))
+ var textBuffer bytes.Buffer
+ if err := proto.MarshalText(&textBuffer, got); err != nil {
+ t.Errorf("unable to marshal text: %v", err)
+ }
+ t.Logf("got: %s", string(textBuffer.Bytes()))
// Check the options.
wantOptions := &Options{}
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index d4d785cca..6f81b0164 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -178,6 +178,24 @@ func PayloadLen(payloadLength int) NetworkChecker {
}
}
+// IPPayload returns a checker that compares the payload of the first network
+// header against payload and reports any difference through t.Errorf.
+func IPPayload(payload []byte) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		got := h[0].Payload()
+
+		// A nil slice and an empty slice are treated as the same payload here,
+		// although cmp.Diff tells them apart.
+		bothEmpty := len(payload) == 0 && len(got) == 0
+		if bothEmpty {
+			return
+		}
+
+		diff := cmp.Diff(payload, got)
+		if diff == "" {
+			return
+		}
+		t.Errorf("payload mismatch (-want +got):\n%s", diff)
+	}
+}
+
// IPv4Options returns a checker that checks the options in an IPv4 packet.
func IPv4Options(want []byte) NetworkChecker {
return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/link/ethernet/BUILD b/pkg/tcpip/link/ethernet/BUILD
new file mode 100644
index 000000000..ec92ed623
--- /dev/null
+++ b/pkg/tcpip/link/ethernet/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "ethernet",
+ srcs = ["ethernet.go"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/nested",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/ethernet/ethernet.go b/pkg/tcpip/link/ethernet/ethernet.go
new file mode 100644
index 000000000..3eef7cd56
--- /dev/null
+++ b/pkg/tcpip/link/ethernet/ethernet.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ethernet provides an implementation of an ethernet link endpoint that
+// wraps an inner link endpoint.
+package ethernet
+
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/nested"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+var _ stack.NetworkDispatcher = (*Endpoint)(nil)
+var _ stack.LinkEndpoint = (*Endpoint)(nil)
+
+// New wraps the inner link endpoint ep in an ethernet Endpoint. The new
+// Endpoint is handed to the embedded nested endpoint's Init alongside ep.
+func New(ep stack.LinkEndpoint) *Endpoint {
+	e := new(Endpoint)
+	e.Endpoint.Init(ep, e)
+	return e
+}
+
+// Endpoint is an ethernet endpoint.
+//
+// It adds an ethernet header to packets before sending them out through its
+// inner link endpoint and consumes an ethernet header before sending the
+// packet to the stack.
+type Endpoint struct {
+ // Endpoint is the embedded nested endpoint that the methods of this type
+ // delegate to; New initializes it with the inner link endpoint.
+ nested.Endpoint
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.
+//
+// It consumes the ethernet header from pkt and passes the packet on only when
+// the destination is this endpoint's link address, the broadcast address or a
+// multicast address. The addresses and protocol handed on come from the
+// ethernet header; the corresponding arguments are ignored. Packets too short
+// to hold an ethernet header, or addressed elsewhere, are dropped.
+func (e *Endpoint) DeliverNetworkPacket(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
+	if !ok {
+		return
+	}
+
+	eth := header.Ethernet(hdr)
+	dst := eth.DestinationAddress()
+	switch {
+	case dst == e.Endpoint.LinkAddress():
+	case dst == header.EthernetBroadcastAddress:
+	case header.IsMulticastEthernetAddress(dst):
+	default:
+		// Not addressed to this endpoint.
+		return
+	}
+	e.Endpoint.DeliverNetworkPacket(eth.SourceAddress() /* remote */, dst /* local */, eth.Type() /* protocol */, pkt)
+}
+
+// Capabilities implements stack.LinkEndpoint.
+//
+// It returns the inner endpoint's capabilities with
+// stack.CapabilityResolutionRequired added.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	inner := e.Endpoint.Capabilities()
+	return stack.CapabilityResolutionRequired | inner
+}
+
+// WritePacket implements stack.LinkEndpoint.
+//
+// It pushes an ethernet header (local link address to r.RemoteLinkAddress)
+// onto pkt and then writes the packet through the inner endpoint.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	local := e.Endpoint.LinkAddress()
+	e.AddHeader(local, r.RemoteLinkAddress, proto, pkt)
+	return e.Endpoint.WritePacket(r, gso, proto, pkt)
+}
+
+// WritePackets implements stack.LinkEndpoint.
+//
+// It pushes an ethernet header onto every packet in pkts and then writes the
+// whole list through the inner endpoint.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// The local address is looked up once and reused for every packet.
+	local := e.Endpoint.LinkAddress()
+
+	pkt := pkts.Front()
+	for pkt != nil {
+		e.AddHeader(local, r.RemoteLinkAddress, proto, pkt)
+		pkt = pkt.Next()
+	}
+
+	return e.Endpoint.WritePackets(r, gso, pkts, proto)
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.
+//
+// It returns the inner endpoint's maximum header length plus
+// header.EthernetMinimumSize.
+func (e *Endpoint) MaxHeaderLength() uint16 {
+	inner := e.Endpoint.MaxHeaderLength()
+	return header.EthernetMinimumSize + inner
+}
+
+// ARPHardwareType implements stack.LinkEndpoint.
+//
+// It always returns header.ARPHardwareEther.
+func (*Endpoint) ARPHardwareType() header.ARPHardwareType {
+ return header.ARPHardwareEther
+}
+
+// AddHeader implements stack.LinkEndpoint.
+//
+// It pushes header.EthernetMinimumSize bytes onto pkt's link header and
+// encodes into them an ethernet header with local as source, remote as
+// destination and proto as type.
+func (*Endpoint) AddHeader(local, remote tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	buf := pkt.LinkHeader().Push(header.EthernetMinimumSize)
+	header.Ethernet(buf).Encode(&header.EthernetFields{
+		SrcAddr: local,
+		DstAddr: remote,
+		Type:    proto,
+	})
+}
diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go
index 76f563811..523b0d24b 100644
--- a/pkg/tcpip/link/pipe/pipe.go
+++ b/pkg/tcpip/link/pipe/pipe.go
@@ -26,27 +26,23 @@ import (
var _ stack.LinkEndpoint = (*Endpoint)(nil)
// New returns both ends of a new pipe.
-func New(linkAddr1, linkAddr2 tcpip.LinkAddress, capabilities stack.LinkEndpointCapabilities) (*Endpoint, *Endpoint) {
+func New(linkAddr1, linkAddr2 tcpip.LinkAddress) (*Endpoint, *Endpoint) {
ep1 := &Endpoint{
- linkAddr: linkAddr1,
- capabilities: capabilities,
+ linkAddr: linkAddr1,
}
ep2 := &Endpoint{
- linkAddr: linkAddr2,
- linked: ep1,
- capabilities: capabilities,
+ linkAddr: linkAddr2,
}
ep1.linked = ep2
+ ep2.linked = ep1
return ep1, ep2
}
// Endpoint is one end of a pipe.
type Endpoint struct {
- capabilities stack.LinkEndpointCapabilities
- linkAddr tcpip.LinkAddress
- dispatcher stack.NetworkDispatcher
- linked *Endpoint
- onWritePacket func(*stack.PacketBuffer)
+ dispatcher stack.NetworkDispatcher
+ linked *Endpoint
+ linkAddr tcpip.LinkAddress
}
// WritePacket implements stack.LinkEndpoint.
@@ -55,16 +51,11 @@ func (e *Endpoint) WritePacket(r *stack.Route, _ *stack.GSO, proto tcpip.Network
return nil
}
- // The pipe endpoint will accept all multicast/broadcast link traffic and only
- // unicast traffic destined to itself.
- if len(e.linked.linkAddr) != 0 &&
- r.RemoteLinkAddress != e.linked.linkAddr &&
- r.RemoteLinkAddress != header.EthernetBroadcastAddress &&
- !header.IsMulticastEthernetAddress(r.RemoteLinkAddress) {
- return nil
- }
-
- e.linked.dispatcher.DeliverNetworkPacket(e.linkAddr, r.RemoteLinkAddress, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{
+ // Note that the local address from the perspective of this endpoint is the
+ // remote address from the perspective of the other end of the pipe
+ // (e.linked). Similarly, the remote address from the perspective of this
+ // endpoint is the local address on the other end.
+ e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
}))
@@ -100,8 +91,8 @@ func (*Endpoint) MTU() uint32 {
}
// Capabilities implements stack.LinkEndpoint.
-func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
- return e.capabilities
+func (*Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return 0
}
// MaxHeaderLength implements stack.LinkEndpoint.
@@ -116,7 +107,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
// ARPHardwareType implements stack.LinkEndpoint.
func (*Endpoint) ARPHardwareType() header.ARPHardwareType {
- return header.ARPHardwareEther
+ return header.ARPHardwareNone
}
// AddHeader implements stack.LinkEndpoint.
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index 59710352b..c118a2929 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -12,6 +12,7 @@ go_test(
"//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
+ "//pkg/tcpip/checker",
"//pkg/tcpip/header",
"//pkg/tcpip/link/channel",
"//pkg/tcpip/link/loopback",
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index d436873b6..f20b94d97 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -15,11 +15,13 @@
package ip_test
import (
+ "strings"
"testing"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/checker"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
@@ -320,6 +322,7 @@ func TestSourceAddressValidation(t *testing.T) {
SrcAddr: src,
DstAddr: localIPv4Addr,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),
@@ -342,7 +345,6 @@ func TestSourceAddressValidation(t *testing.T) {
SrcAddr: src,
DstAddr: localIPv6Addr,
})
-
e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),
}))
@@ -579,6 +581,7 @@ func TestIPv4Receive(t *testing.T) {
SrcAddr: remoteIPv4Addr,
DstAddr: localIPv4Addr,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
// Make payload be non-zero.
for i := header.IPv4MinimumSize; i < totalLen; i++ {
@@ -660,6 +663,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
SrcAddr: "\x0a\x00\x00\xbb",
DstAddr: localIPv4Addr,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
// Create the ICMP header.
icmp := header.ICMPv4(view[header.IPv4MinimumSize:])
@@ -679,6 +683,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
SrcAddr: localIPv4Addr,
DstAddr: remoteIPv4Addr,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
// Make payload be non-zero.
for i := dataOffset; i < len(view); i++ {
@@ -732,6 +737,8 @@ func TestIPv4FragmentationReceive(t *testing.T) {
SrcAddr: remoteIPv4Addr,
DstAddr: localIPv4Addr,
})
+ ip1.SetChecksum(^ip1.CalculateChecksum())
+
// Make payload be non-zero.
for i := header.IPv4MinimumSize; i < totalLen; i++ {
frag1[i] = uint8(i)
@@ -748,6 +755,8 @@ func TestIPv4FragmentationReceive(t *testing.T) {
SrcAddr: remoteIPv4Addr,
DstAddr: localIPv4Addr,
})
+ ip2.SetChecksum(^ip2.CalculateChecksum())
+
// Make payload be non-zero.
for i := header.IPv4MinimumSize; i < totalLen; i++ {
frag2[i] = uint8(i)
@@ -1020,3 +1029,406 @@ func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer
_, _ = pkt.NetworkHeader().Consume(netHdrLen)
return pkt
}
+
+func TestWriteHeaderIncludedPacket(t *testing.T) {
+ const (
+ nicID = 1
+ transportProto = 5
+
+ dataLen = 4
+ optionsLen = 4
+ )
+
+ dataBuf := [dataLen]byte{1, 2, 3, 4}
+ data := dataBuf[:]
+
+ ipv4OptionsBuf := [optionsLen]byte{0, 1, 0, 1}
+ ipv4Options := ipv4OptionsBuf[:]
+
+ ipv6FragmentExtHdrBuf := [header.IPv6FragmentExtHdrLength]byte{transportProto, 0, 62, 4, 1, 2, 3, 4}
+ ipv6FragmentExtHdr := ipv6FragmentExtHdrBuf[:]
+
+ var ipv6PayloadWithExtHdrBuf [dataLen + header.IPv6FragmentExtHdrLength]byte
+ ipv6PayloadWithExtHdr := ipv6PayloadWithExtHdrBuf[:]
+ if n := copy(ipv6PayloadWithExtHdr, ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr))
+ }
+ if n := copy(ipv6PayloadWithExtHdr[header.IPv6FragmentExtHdrLength:], data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+
+ tests := []struct {
+ name string
+ protoFactory stack.NetworkProtocolFactory
+ protoNum tcpip.NetworkProtocolNumber
+ nicAddr tcpip.Address
+ remoteAddr tcpip.Address
+ pktGen func(*testing.T, tcpip.Address) buffer.View
+ checker func(*testing.T, *stack.PacketBuffer, tcpip.Address)
+ expectedErr *tcpip.Error
+ }{
+ {
+ name: "IPv4",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv4MinimumSize + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv4Any {
+ src = localIPv4Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv4MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize)
+ }
+
+ checker.IPv4(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv4Addr),
+ checker.IPv4HeaderLength(header.IPv4MinimumSize),
+ checker.IPFullLength(uint16(header.IPv4MinimumSize+len(data))),
+ checker.IPPayload(data),
+ )
+ },
+ },
+ {
+ name: "IPv4 with IHL too small",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv4MinimumSize + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize - 1,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ expectedErr: tcpip.ErrMalformedHeader,
+ },
+ {
+ name: "IPv4 too small",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv4(make([]byte, header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip[:len(ip)-1])
+ },
+ expectedErr: tcpip.ErrMalformedHeader,
+ },
+ {
+ name: "IPv4 minimum size",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv4(make([]byte, header.IPv4MinimumSize))
+ ip.Encode(&header.IPv4Fields{
+ IHL: header.IPv4MinimumSize,
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip)
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv4Any {
+ src = localIPv4Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv4MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize)
+ }
+
+ checker.IPv4(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv4Addr),
+ checker.IPv4HeaderLength(header.IPv4MinimumSize),
+ checker.IPFullLength(header.IPv4MinimumSize),
+ checker.IPPayload(nil),
+ )
+ },
+ },
+ {
+ name: "IPv4 with options",
+ protoFactory: ipv4.NewProtocol,
+ protoNum: ipv4.ProtocolNumber,
+ nicAddr: localIPv4Addr,
+ remoteAddr: remoteIPv4Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ipHdrLen := header.IPv4MinimumSize + len(ipv4Options)
+ totalLen := ipHdrLen + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv4(hdr.Prepend(ipHdrLen))
+ ip.Encode(&header.IPv4Fields{
+ IHL: uint8(ipHdrLen),
+ Protocol: transportProto,
+ TTL: ipv4.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ if n := copy(ip.Options(), ipv4Options); n != len(ipv4Options) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv4Options))
+ }
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv4Any {
+ src = localIPv4Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ hdrLen := header.IPv4MinimumSize + len(ipv4Options)
+ if len(netHdr.View()) != hdrLen {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen)
+ }
+
+ checker.IPv4(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv4Addr),
+ checker.IPv4HeaderLength(hdrLen),
+ checker.IPFullLength(uint16(hdrLen+len(data))),
+ checker.IPv4Options(ipv4Options),
+ checker.IPPayload(data),
+ )
+ },
+ },
+ {
+ name: "IPv6",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv6MinimumSize + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: transportProto,
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv6Any {
+ src = localIPv6Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv6MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv6Addr),
+ checker.IPFullLength(uint16(header.IPv6MinimumSize+len(data))),
+ checker.IPPayload(data),
+ )
+ },
+ },
+ {
+ name: "IPv6 with extension header",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ totalLen := header.IPv6MinimumSize + len(ipv6FragmentExtHdr) + len(data)
+ hdr := buffer.NewPrependable(totalLen)
+ if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+ }
+ if n := copy(hdr.Prepend(len(ipv6FragmentExtHdr)), ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) {
+ t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr))
+ }
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: uint8(header.IPv6FragmentExtHdrIdentifier),
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return hdr.View()
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv6Any {
+ src = localIPv6Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if want := header.IPv6MinimumSize + len(ipv6FragmentExtHdr); len(netHdr.View()) != want {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), want)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv6Addr),
+ checker.IPFullLength(uint16(header.IPv6MinimumSize+len(ipv6PayloadWithExtHdr))),
+ checker.IPPayload(ipv6PayloadWithExtHdr),
+ )
+ },
+ },
+ {
+ name: "IPv6 minimum size",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv6(make([]byte, header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: transportProto,
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip)
+ },
+ checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+ if src == header.IPv6Any {
+ src = localIPv6Addr
+ }
+
+ netHdr := pkt.NetworkHeader()
+
+ if len(netHdr.View()) != header.IPv6MinimumSize {
+ t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(netHdr),
+ checker.SrcAddr(src),
+ checker.DstAddr(remoteIPv6Addr),
+ checker.IPFullLength(header.IPv6MinimumSize),
+ checker.IPPayload(nil),
+ )
+ },
+ },
+ {
+ name: "IPv6 too small",
+ protoFactory: ipv6.NewProtocol,
+ protoNum: ipv6.ProtocolNumber,
+ nicAddr: localIPv6Addr,
+ remoteAddr: remoteIPv6Addr,
+ pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+ ip := header.IPv6(make([]byte, header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ NextHeader: transportProto,
+ HopLimit: ipv6.DefaultTTL,
+ SrcAddr: src,
+ DstAddr: header.IPv4Any,
+ })
+ return buffer.View(ip[:len(ip)-1])
+ },
+ expectedErr: tcpip.ErrMalformedHeader,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ subTests := []struct {
+ name string
+ srcAddr tcpip.Address
+ }{
+ {
+ name: "unspecified source",
+ srcAddr: tcpip.Address(strings.Repeat("\x00", len(test.nicAddr))),
+ },
+ {
+ name: "random source",
+ srcAddr: tcpip.Address(strings.Repeat("\xab", len(test.nicAddr))),
+ },
+ }
+
+ for _, subTest := range subTests {
+ t.Run(subTest.name, func(t *testing.T) {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{test.protoFactory},
+ })
+ e := channel.New(1, 1280, "")
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+ }
+ if err := s.AddAddress(nicID, test.protoNum, test.nicAddr); err != nil {
+ t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, test.protoNum, test.nicAddr, err)
+ }
+
+ s.SetRouteTable([]tcpip.Route{{Destination: test.remoteAddr.WithPrefix().Subnet(), NIC: nicID}})
+
+ r, err := s.FindRoute(nicID, test.nicAddr, test.remoteAddr, test.protoNum, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, test.remoteAddr, test.nicAddr, test.protoNum, err)
+ }
+ defer r.Release()
+
+ if err := r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{
+ Data: test.pktGen(t, subTest.srcAddr).ToVectorisedView(),
+ })); err != test.expectedErr {
+ t.Fatalf("got r.WriteHeaderIncludedPacket(_) = %s, want = %s", err, test.expectedErr)
+ }
+
+ if test.expectedErr != nil {
+ return
+ }
+
+ pkt, ok := e.Read()
+ if !ok {
+ t.Fatal("expected a packet to be written")
+ }
+ test.checker(t, pkt.Pkt, subTest.srcAddr)
+ })
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index c5ac7b8b5..e7c58ae0a 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -190,29 +190,6 @@ func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
return e.protocol.Number()
}
-// writePacketFragments fragments pkt and writes the results on the link
-// endpoint. The IP header must already present in the original packet. The mtu
-// is the maximum size of the packets.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu uint32, pkt *stack.PacketBuffer) *tcpip.Error {
- networkHeader := header.IPv4(pkt.NetworkHeader().View())
- fragMTU := int(calculateFragmentInnerMTU(mtu, pkt))
- pf := fragmentation.MakePacketFragmenter(pkt, fragMTU, pkt.AvailableHeaderBytes()+len(networkHeader))
-
- for {
- fragPkt, more := buildNextFragment(&pf, networkHeader)
- if err := e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt); err != nil {
- r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pf.RemainingFragmentCount() + 1))
- return err
- }
- r.Stats().IP.PacketsSent.Increment()
- if !more {
- break
- }
- }
-
- return nil
-}
-
func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
ip := header.IPv4(pkt.NetworkHeader().Push(header.IPv4MinimumSize))
length := uint16(pkt.Size())
@@ -234,10 +211,39 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
pkt.NetworkProtocolNumber = ProtocolNumber
}
+func (e *endpoint) packetMustBeFragmented(pkt *stack.PacketBuffer, gso *stack.GSO) bool {
+ return (gso == nil || gso.Type == stack.GSONone) && pkt.Size() > int(e.nic.MTU())
+}
+
+// handleFragments fragments pkt and calls the handler function on each
+// fragment. It returns the number of fragments handled and the number of
+// fragments left to be processed. The IP header must already be present in the
+// original packet. The mtu is the maximum size of the packets.
+func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, mtu uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) *tcpip.Error) (int, int, *tcpip.Error) {
+ fragMTU := int(calculateFragmentInnerMTU(mtu, pkt))
+ networkHeader := header.IPv4(pkt.NetworkHeader().View())
+ pf := fragmentation.MakePacketFragmenter(pkt, fragMTU, pkt.AvailableHeaderBytes()+len(networkHeader))
+
+ var n int
+ for {
+ fragPkt, more := buildNextFragment(&pf, networkHeader)
+ if err := handler(fragPkt); err != nil {
+ return n, pf.RemainingFragmentCount() + 1, err
+ }
+ n++
+ if !more {
+ return n, pf.RemainingFragmentCount(), nil
+ }
+ }
+}
+
// WritePacket writes a packet to the given destination address and protocol.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
e.addIPHeader(r, pkt, params)
+ return e.writePacket(r, gso, pkt)
+}
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer) *tcpip.Error {
// iptables filtering. All packets that reach here are locally
// generated.
nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
@@ -273,8 +279,18 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
if r.Loop&stack.PacketOut == 0 {
return nil
}
- if pkt.Size() > int(e.nic.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
- return e.writePacketFragments(r, gso, e.nic.MTU(), pkt)
+
+ if e.packetMustBeFragmented(pkt, gso) {
+ sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+ // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
+ // fragment one by one using WritePacket() (current strategy) or if we
+ // want to create a PacketBufferList from the fragments and feed it to
+ // WritePackets(). It'll be faster but cost more memory.
+ return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
+ })
+ r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+ r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(remain))
+ return err
}
if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
r.Stats().IP.OutgoingPacketErrors.Increment()
@@ -293,9 +309,23 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return pkts.Len(), nil
}
- for pkt := pkts.Front(); pkt != nil; {
+ for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
e.addIPHeader(r, pkt, params)
- pkt = pkt.Next()
+ if e.packetMustBeFragmented(pkt, gso) {
+ // Keep track of the packet that is about to be fragmented so it can be
+ // removed once the fragmentation is done.
+ originalPkt := pkt
+ if _, _, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+ // Modify the packet list in place with the new fragments.
+ pkts.InsertAfter(pkt, fragPkt)
+ pkt = fragPkt
+ return nil
+ }); err != nil {
+ panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", e.nic.MTU(), err))
+ }
+ // Remove the packet that was just fragmented and process the rest.
+ pkts.Remove(originalPkt)
+ }
}
nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
@@ -347,30 +377,27 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return n + len(dropped), nil
}
-// WriteHeaderIncludedPacket writes a packet already containing a network
-// header through the given route.
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
// The packet already has an IP header, but there are a few required
// checks.
h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
if !ok {
- return tcpip.ErrInvalidOptionValue
+ return tcpip.ErrMalformedHeader
}
ip := header.IPv4(h)
- if !ip.IsValid(pkt.Data.Size()) {
- return tcpip.ErrInvalidOptionValue
- }
// Always set the total length.
- ip.SetTotalLength(uint16(pkt.Data.Size()))
+ pktSize := pkt.Data.Size()
+ ip.SetTotalLength(uint16(pktSize))
// Set the source address when zero.
- if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) {
+ if ip.SourceAddress() == header.IPv4Any {
ip.SetSourceAddress(r.LocalAddress)
}
- // Set the destination. If the packet already included a destination,
- // it will be part of the route.
+ // Set the destination. If the packet already included a destination, it will
+ // be part of the route anyway.
ip.SetDestinationAddress(r.RemoteAddress)
// Set the packet ID when zero.
@@ -387,19 +414,17 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
ip.SetChecksum(0)
ip.SetChecksum(^ip.CalculateChecksum())
- if r.Loop&stack.PacketLoop != 0 {
- e.HandlePacket(r, pkt.Clone())
- }
- if r.Loop&stack.PacketOut == 0 {
- return nil
+ // Populate the packet buffer's network header and don't allow an invalid
+ // packet to be sent.
+ //
+ // Note that parsing only makes sure that the packet is well formed as per the
+ // wire format. We also want to check if the header's fields are valid before
+ // sending the packet.
+ if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
+ return tcpip.ErrMalformedHeader
}
- if err := e.nic.WritePacket(r, nil /* gso */, ProtocolNumber, pkt); err != nil {
- r.Stats().IP.OutgoingPacketErrors.Increment()
- return err
- }
- r.Stats().IP.PacketsSent.Increment()
- return nil
+ return e.writePacket(r, nil /* gso */, pkt)
}
// HandlePacket is called by the link layer when new ipv4 packets arrive for
@@ -415,6 +440,32 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
return
}
+ // There has been some confusion regarding verifying checksums. We need
+ // only look for negative 0 (0xffff) as the checksum, as it's not possible to
+ // get positive 0 (0) for the checksum. Some bad implementations could get it
+ // when doing entry replacement in the early days of the Internet,
+ // however the lore that one needs to check for both persists.
+ //
+ // RFC 1624 section 1 describes the source of this confusion as:
+ // [the partial recalculation method described in RFC 1071] computes a
+ // result for certain cases that differs from the one obtained from
+ // scratch (one's complement of one's complement sum of the original
+ // fields).
+ //
+ // However RFC 1624 section 5 clarifies that if using the verification method
+ // "recommended by RFC 1071, it does not matter if an intermediate system
+ // generated a -0 instead of +0".
+ //
+ // RFC1071 page 1 specifies the verification method as:
+ // (3) To check a checksum, the 1's complement sum is computed over the
+ // same set of octets, including the checksum field. If the result
+ // is all 1 bits (-0 in 1's complement arithmetic), the check
+ // succeeds.
+ if h.CalculateChecksum() != 0xffff {
+ r.Stats().IP.MalformedPacketsReceived.Increment()
+ return
+ }
+
// As per RFC 1122 section 3.2.1.3:
// When a host sends any datagram, the IP source address MUST
// be one of its own IP addresses (but not a broadcast or
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 9916d783f..fee11bb38 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -15,9 +15,9 @@
package ipv4_test
import (
- "bytes"
"context"
"encoding/hex"
+ "fmt"
"math"
"net"
"testing"
@@ -39,6 +39,8 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
+const extraHeaderReserve = 50
+
func TestExcludeBroadcast(t *testing.T) {
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
@@ -118,6 +120,7 @@ func TestIPv4Sanity(t *testing.T) {
tests := []struct {
name string
headerLength uint8 // value of 0 means "use correct size"
+ badHeaderChecksum bool
maxTotalLength uint16
transportProtocol uint8
TTL uint8
@@ -133,6 +136,14 @@ func TestIPv4Sanity(t *testing.T) {
transportProtocol: uint8(header.ICMPv4ProtocolNumber),
TTL: ttl,
},
+ {
+ name: "bad header checksum",
+ maxTotalLength: defaultMTU,
+ transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+ TTL: ttl,
+ badHeaderChecksum: true,
+ shouldFail: true,
+ },
// The TTL tests check that we are not rejecting an incoming packet
// with a zero or one TTL, which has been a point of confusion in the
// past as RFC 791 says: "If this field contains the value zero, then the
@@ -243,7 +254,7 @@ func TestIPv4Sanity(t *testing.T) {
// Default routes for IPv4 so ICMP can find a route to the remote
// node when attempting to send the ICMP Echo Reply.
s.SetRouteTable([]tcpip.Route{
- tcpip.Route{
+ {
Destination: header.IPv4EmptySubnet,
NIC: nicID,
},
@@ -288,6 +299,12 @@ func TestIPv4Sanity(t *testing.T) {
if test.headerLength != 0 {
ip.SetHeaderLength(test.headerLength)
}
+ ip.SetChecksum(0)
+ ipHeaderChecksum := ip.CalculateChecksum()
+ if test.badHeaderChecksum {
+ ipHeaderChecksum += 42
+ }
+ ip.SetChecksum(^ipHeaderChecksum)
requestPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),
})
@@ -369,11 +386,10 @@ func TestIPv4Sanity(t *testing.T) {
// comparePayloads compared the contents of all the packets against the contents
// of the source packet.
-func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) {
- t.Helper()
- // Make a complete array of the sourcePacketInfo packet.
- source := header.IPv4(packets[0].NetworkHeader().View()[:header.IPv4MinimumSize])
- vv := buffer.NewVectorisedView(sourcePacketInfo.Size(), sourcePacketInfo.Views())
+func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketBuffer, mtu uint32, wantFragments []fragmentInfo, proto tcpip.TransportProtocolNumber) error {
+ // Make a complete array of the sourcePacket packet.
+ source := header.IPv4(packets[0].NetworkHeader().View())
+ vv := buffer.NewVectorisedView(sourcePacket.Size(), sourcePacket.Views())
source = append(source, vv.ToView()...)
// Make a copy of the IP header, which will be modified in some fields to make
@@ -382,82 +398,147 @@ func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketI
sourceCopy.SetChecksum(0)
sourceCopy.SetFlagsFragmentOffset(0, 0)
sourceCopy.SetTotalLength(0)
- var offset uint16
// Build up an array of the bytes sent.
- var reassembledPayload []byte
+ var reassembledPayload buffer.VectorisedView
for i, packet := range packets {
// Confirm that the packet is valid.
allBytes := buffer.NewVectorisedView(packet.Size(), packet.Views())
- ip := header.IPv4(allBytes.ToView())
- if !ip.IsValid(len(ip)) {
- t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip))
+ fragmentIPHeader := header.IPv4(allBytes.ToView())
+ if !fragmentIPHeader.IsValid(len(fragmentIPHeader)) {
+ return fmt.Errorf("fragment #%d: IP packet is invalid:\n%s", i, hex.Dump(fragmentIPHeader))
}
- if got, want := ip.CalculateChecksum(), uint16(0xffff); got != want {
- t.Errorf("ip.CalculateChecksum() got %#x, want %#x", got, want)
+ if got := len(fragmentIPHeader); got > int(mtu) {
+ return fmt.Errorf("fragment #%d: got len(fragmentIPHeader) = %d, want <= %d", i, got, mtu)
}
- if got, want := len(ip), int(mtu); got > want {
- t.Errorf("fragment is too large, got %d want %d", got, want)
+ if got := fragmentIPHeader.TransportProtocol(); got != proto {
+ return fmt.Errorf("fragment #%d: got fragmentIPHeader.TransportProtocol() = %d, want = %d", i, got, uint8(proto))
}
- if got, want := packet.AvailableHeaderBytes(), sourcePacketInfo.AvailableHeaderBytes()-header.IPv4MinimumSize; got != want {
- t.Errorf("fragment #%d should have the same available space for prepending as source: got %d, want %d", i, got, want)
+ if got := packet.AvailableHeaderBytes(); got != extraHeaderReserve {
+ return fmt.Errorf("fragment #%d: got packet.AvailableHeaderBytes() = %d, want = %d", i, got, extraHeaderReserve)
}
- if got, want := packet.NetworkProtocolNumber, sourcePacketInfo.NetworkProtocolNumber; got != want {
- t.Errorf("fragment #%d has wrong network protocol number: got %d, want %d", i, got, want)
+ if got, want := packet.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber; got != want {
+ return fmt.Errorf("fragment #%d: got fragment.NetworkProtocolNumber = %d, want = %d", i, got, want)
}
- if i < len(packets)-1 {
- sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, offset)
+ if got, want := fragmentIPHeader.CalculateChecksum(), uint16(0xffff); got != want {
+ return fmt.Errorf("fragment #%d: got ip.CalculateChecksum() = %#x, want = %#x", i, got, want)
+ }
+ if wantFragments[i].more {
+ sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, wantFragments[i].offset)
} else {
- sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, offset)
+ sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, wantFragments[i].offset)
}
- reassembledPayload = append(reassembledPayload, ip.Payload()...)
- offset += ip.TotalLength() - uint16(ip.HeaderLength())
+ reassembledPayload.AppendView(packet.TransportHeader().View())
+ reassembledPayload.Append(packet.Data)
// Clear out the checksum and length from the ip because we can't compare
// it.
- sourceCopy.SetTotalLength(uint16(len(ip)))
+ sourceCopy.SetTotalLength(wantFragments[i].payloadSize + header.IPv4MinimumSize)
sourceCopy.SetChecksum(0)
sourceCopy.SetChecksum(^sourceCopy.CalculateChecksum())
- if !bytes.Equal(ip[:ip.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]) {
- t.Errorf("ip[:ip.HeaderLength()] got:\n%s\nwant:\n%s", hex.Dump(ip[:ip.HeaderLength()]), hex.Dump(sourceCopy[:sourceCopy.HeaderLength()]))
+ if diff := cmp.Diff(fragmentIPHeader[:fragmentIPHeader.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]); diff != "" {
+ return fmt.Errorf("fragment #%d: fragmentIPHeader mismatch (-got +want):\n%s", i, diff)
}
}
- expected := source[source.HeaderLength():]
- if !bytes.Equal(reassembledPayload, expected) {
- t.Errorf("reassembledPayload got:\n%s\nwant:\n%s", hex.Dump(reassembledPayload), hex.Dump(expected))
+
+ expected := buffer.View(source[source.HeaderLength():])
+ if diff := cmp.Diff(expected, reassembledPayload.ToView()); diff != "" {
+ return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
}
+
+ return nil
}
-func TestFragmentation(t *testing.T) {
- const ttl = 42
+type fragmentInfo struct {
+ offset uint16
+ more bool
+ payloadSize uint16
+}
- var manyPayloadViewsSizes [1000]int
- for i := range manyPayloadViewsSizes {
- manyPayloadViewsSizes[i] = 7
- }
- fragTests := []struct {
- description string
- mtu uint32
- gso *stack.GSO
- transportHeaderLength int
- extraHeaderReserveLength int
- payloadViewsSizes []int
- expectedFrags int
- }{
- {"No fragmentation", 2000, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 1},
- {"No fragmentation with big header", 2000, &stack.GSO{}, 16, header.IPv4MinimumSize, []int{1000}, 1},
- {"Fragmented", 800, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 2},
- {"Fragmented with gso nil", 800, nil, 0, header.IPv4MinimumSize, []int{1000}, 2},
- {"Fragmented with many views", 300, &stack.GSO{}, 0, header.IPv4MinimumSize, manyPayloadViewsSizes[:], 25},
- {"Fragmented with many views and prependable bytes", 300, &stack.GSO{}, 0, header.IPv4MinimumSize + 55, manyPayloadViewsSizes[:], 25},
- {"Fragmented with big header", 800, &stack.GSO{}, 20, header.IPv4MinimumSize, []int{1000}, 2},
- {"Fragmented with big header and prependable bytes", 800, &stack.GSO{}, 20, header.IPv4MinimumSize + 66, []int{1000}, 2},
- {"Fragmented with MTU smaller than header and prependable bytes", 300, &stack.GSO{}, 1000, header.IPv4MinimumSize + 77, []int{500}, 6},
- }
+var fragmentationTests = []struct {
+ description string
+ mtu uint32
+ gso *stack.GSO
+ transportHeaderLength int
+ payloadSize int
+ wantFragments []fragmentInfo
+}{
+ {
+ description: "No Fragmentation",
+ mtu: 1280,
+ gso: nil,
+ transportHeaderLength: 0,
+ payloadSize: 1000,
+ wantFragments: []fragmentInfo{
+ {offset: 0, payloadSize: 1000, more: false},
+ },
+ },
+ {
+ description: "Fragmented",
+ mtu: 1280,
+ gso: nil,
+ transportHeaderLength: 0,
+ payloadSize: 2000,
+ wantFragments: []fragmentInfo{
+ {offset: 0, payloadSize: 1256, more: true},
+ {offset: 1256, payloadSize: 744, more: false},
+ },
+ },
+ {
+ description: "No fragmentation with big header",
+ mtu: 2000,
+ gso: nil,
+ transportHeaderLength: 100,
+ payloadSize: 1000,
+ wantFragments: []fragmentInfo{
+ {offset: 0, payloadSize: 1100, more: false},
+ },
+ },
+ {
+ description: "Fragmented with gso none",
+ mtu: 1280,
+ gso: &stack.GSO{Type: stack.GSONone},
+ transportHeaderLength: 0,
+ payloadSize: 1400,
+ wantFragments: []fragmentInfo{
+ {offset: 0, payloadSize: 1256, more: true},
+ {offset: 1256, payloadSize: 144, more: false},
+ },
+ },
+ {
+ description: "Fragmented with big header",
+ mtu: 1280,
+ gso: nil,
+ transportHeaderLength: 100,
+ payloadSize: 1200,
+ wantFragments: []fragmentInfo{
+ {offset: 0, payloadSize: 1256, more: true},
+ {offset: 1256, payloadSize: 44, more: false},
+ },
+ },
+ {
+ description: "Fragmented with MTU smaller than header",
+ mtu: 300,
+ gso: nil,
+ transportHeaderLength: 1000,
+ payloadSize: 500,
+ wantFragments: []fragmentInfo{
+ {offset: 0, payloadSize: 280, more: true},
+ {offset: 280, payloadSize: 280, more: true},
+ {offset: 560, payloadSize: 280, more: true},
+ {offset: 840, payloadSize: 280, more: true},
+ {offset: 1120, payloadSize: 280, more: true},
+ {offset: 1400, payloadSize: 100, more: false},
+ },
+ },
+}
- for _, ft := range fragTests {
+func TestFragmentationWritePacket(t *testing.T) {
+ const ttl = 42
+
+ for _, ft := range fragmentationTests {
t.Run(ft.description, func(t *testing.T) {
ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
r := buildRoute(t, ep)
- pkt := testutil.MakeRandPkt(ft.transportHeaderLength, ft.extraHeaderReserveLength, ft.payloadViewsSizes, header.IPv4ProtocolNumber)
+ pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
source := pkt.Clone()
err := r.WritePacket(ft.gso, stack.NetworkHeaderParams{
Protocol: tcp.ProtocolNumber,
@@ -467,17 +548,101 @@ func TestFragmentation(t *testing.T) {
if err != nil {
t.Fatalf("r.WritePacket(_, _, _) = %s", err)
}
-
- if got := len(ep.WrittenPackets); got != ft.expectedFrags {
- t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, ft.expectedFrags)
+ if got := len(ep.WrittenPackets); got != len(ft.wantFragments) {
+ t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, len(ft.wantFragments))
}
- if got, want := len(ep.WrittenPackets), int(r.Stats().IP.PacketsSent.Value()); got != want {
- t.Errorf("no errors yet got len(ep.WrittenPackets) = %d, want = %d", got, want)
+ if got := int(r.Stats().IP.PacketsSent.Value()); got != len(ft.wantFragments) {
+ t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, len(ft.wantFragments))
}
if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 {
t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
}
- compareFragments(t, ep.WrittenPackets, source, ft.mtu)
+ if err := compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+ t.Error(err)
+ }
+ })
+ }
+}
+
+func TestFragmentationWritePackets(t *testing.T) {
+ const ttl = 42
+ writePacketsTests := []struct {
+ description string
+ insertBefore int
+ insertAfter int
+ }{
+ {
+ description: "Single packet",
+ insertBefore: 0,
+ insertAfter: 0,
+ },
+ {
+ description: "With packet before",
+ insertBefore: 1,
+ insertAfter: 0,
+ },
+ {
+ description: "With packet after",
+ insertBefore: 0,
+ insertAfter: 1,
+ },
+ {
+ description: "With packet before and after",
+ insertBefore: 1,
+ insertAfter: 1,
+ },
+ }
+ tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, extraHeaderReserve+header.IPv4MinimumSize, []int{1}, header.IPv4ProtocolNumber)
+
+ for _, test := range writePacketsTests {
+ t.Run(test.description, func(t *testing.T) {
+ for _, ft := range fragmentationTests {
+ t.Run(ft.description, func(t *testing.T) {
+ var pkts stack.PacketBufferList
+ for i := 0; i < test.insertBefore; i++ {
+ pkts.PushBack(tinyPacket.Clone())
+ }
+ pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
+ pkts.PushBack(pkt.Clone())
+ for i := 0; i < test.insertAfter; i++ {
+ pkts.PushBack(tinyPacket.Clone())
+ }
+
+ ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
+ r := buildRoute(t, ep)
+
+ wantTotalPackets := len(ft.wantFragments) + test.insertBefore + test.insertAfter
+ n, err := r.WritePackets(ft.gso, pkts, stack.NetworkHeaderParams{
+ Protocol: tcp.ProtocolNumber,
+ TTL: ttl,
+ TOS: stack.DefaultTOS,
+ })
+ if err != nil {
+ t.Errorf("got WritePackets(_, _, _) = (_, %s), want = (_, nil)", err)
+ }
+ if n != wantTotalPackets {
+ t.Errorf("got WritePackets(_, _, _) = (%d, _), want = (%d, _)", n, wantTotalPackets)
+ }
+ if got := len(ep.WrittenPackets); got != wantTotalPackets {
+ t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, wantTotalPackets)
+ }
+ if got := int(r.Stats().IP.PacketsSent.Value()); got != wantTotalPackets {
+ t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, wantTotalPackets)
+ }
+ if got := int(r.Stats().IP.OutgoingPacketErrors.Value()); got != 0 {
+ t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
+ }
+
+ if wantTotalPackets == 0 {
+ return
+ }
+
+ fragments := ep.WrittenPackets[test.insertBefore : len(ft.wantFragments)+test.insertBefore]
+ if err := compareFragments(fragments, pkt, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+ t.Error(err)
+ }
+ })
+ }
})
}
}
@@ -534,14 +699,14 @@ func TestFragmentationErrors(t *testing.T) {
t.Run(ft.description, func(t *testing.T) {
ep := testutil.NewMockLinkEndpoint(ft.mtu, expectedError, ft.allowPackets)
r := buildRoute(t, ep)
- pkt := testutil.MakeRandPkt(ft.transportHeaderLength, header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
+ pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
err := r.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
Protocol: tcp.ProtocolNumber,
TTL: ttl,
TOS: stack.DefaultTOS,
}, pkt)
if err != expectedError {
- t.Errorf("got WritePacket() = %s, want = %s", err, expectedError)
+ t.Errorf("got WritePacket(_, _, _) = %s, want = %s", err, expectedError)
}
if got, want := len(ep.WrittenPackets), int(r.Stats().IP.PacketsSent.Value()); err != nil && got != want {
t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, want)
@@ -1277,6 +1442,7 @@ func TestReceiveFragments(t *testing.T) {
SrcAddr: frag.srcAddr,
DstAddr: frag.dstAddr,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
vv := hdr.View().ToVectorisedView()
vv.AppendView(frag.payload)
@@ -1545,6 +1711,7 @@ func TestPacketQueing(t *testing.T) {
SrcAddr: host2IPv4Addr.AddressWithPrefix.Address,
DstAddr: host1IPv4Addr.AddressWithPrefix.Address,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
e.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),
}))
@@ -1588,6 +1755,7 @@ func TestPacketQueing(t *testing.T) {
SrcAddr: host2IPv4Addr.AddressWithPrefix.Address,
DstAddr: host1IPv4Addr.AddressWithPrefix.Address,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),
}))
@@ -1633,7 +1801,7 @@ func TestPacketQueing(t *testing.T) {
}
s.SetRouteTable([]tcpip.Route{
- tcpip.Route{
+ {
Destination: host1IPv4Addr.AddressWithPrefix.Subnet(),
NIC: nicID,
},
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index a454f6c34..ead6bedcb 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -252,26 +252,29 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
return
}
- it, err := ns.Options().Iter(false /* check */)
- if err != nil {
- // Options are not valid as per the wire format, silently drop the packet.
- received.Invalid.Increment()
- return
- }
+ var sourceLinkAddr tcpip.LinkAddress
+ {
+ it, err := ns.Options().Iter(false /* check */)
+ if err != nil {
+ // Options are not valid as per the wire format, silently drop the
+ // packet.
+ received.Invalid.Increment()
+ return
+ }
- sourceLinkAddr, ok := getSourceLinkAddr(it)
- if !ok {
- received.Invalid.Increment()
- return
+ sourceLinkAddr, ok = getSourceLinkAddr(it)
+ if !ok {
+ received.Invalid.Increment()
+ return
+ }
}
- unspecifiedSource := r.RemoteAddress == header.IPv6Any
-
// As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST
// NOT be included when the source IP address is the unspecified address.
// Otherwise, on link layers that have addresses this option MUST be
// included in multicast solicitations and SHOULD be included in unicast
// solicitations.
+ unspecifiedSource := r.RemoteAddress == header.IPv6Any
if len(sourceLinkAddr) == 0 {
if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource {
received.Invalid.Increment()
@@ -297,41 +300,51 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
return
}
- // ICMPv6 Neighbor Solicit messages are always sent to
- // specially crafted IPv6 multicast addresses. As a result, the
- // route we end up with here has as its LocalAddress such a
- // multicast address. It would be nonsense to claim that our
- // source address is a multicast address, so we manually set
- // the source address to the target address requested in the
- // solicit message. Since that requires mutating the route, we
- // must first clone it.
- r := r.Clone()
- defer r.Release()
- r.LocalAddress = targetAddr
-
- // As per RFC 4861 section 7.2.4, if the the source of the solicitation is
- // the unspecified address, the node MUST set the Solicited flag to zero and
- // multicast the advertisement to the all-nodes address.
- solicited := true
+ // As per RFC 4861 section 7.2.4:
+ //
+ // If the source of the solicitation is the unspecified address, the node
+ // MUST [...] and multicast the advertisement to the all-nodes address.
+ //
+ remoteAddr := r.RemoteAddress
if unspecifiedSource {
- solicited = false
- r.RemoteAddress = header.IPv6AllNodesMulticastAddress
+ remoteAddr = header.IPv6AllNodesMulticastAddress
+ }
+
+ // Even if we were able to receive a packet from some remote, we may not
+ // have a route to it - the remote may be blocked via routing rules. We must
+ // always consult our routing table and find a route to the remote before
+ // sending any packet.
+ r, err := e.protocol.stack.FindRoute(e.nic.ID(), targetAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ // If we cannot find a route to the destination, silently drop the packet.
+ return
}
+ defer r.Release()
- // If the NS has a source link-layer option, use the link address it
- // specifies as the remote link address for the response instead of the
- // source link address of the packet.
+ // If the NS has a source link-layer option, resolve the route immediately
+ // to avoid querying the neighbor table when the neighbor entry was updated,
+ // as probing the neighbor table for a link address will transition the
+ // entry's state from stale to delay.
+ //
+ // Note, if the source link address is unspecified and this is a unicast
+ // solicitation, we may need to perform neighbor discovery to send the
+ // neighbor advertisement response. This is expected as per RFC 4861 section
+ // 7.2.4:
+ //
+ // Because unicast Neighbor Solicitations are not required to include a
+ // Source Link-Layer Address, it is possible that a node sending a
+ // solicited Neighbor Advertisement does not have a corresponding link-
+ // layer address for its neighbor in its Neighbor Cache. In such
+ // situations, a node will first have to use Neighbor Discovery to
+ // determine the link-layer address of its neighbor (i.e., send out a
+ // multicast Neighbor Solicitation).
//
- // TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link
- // address cache for the right destination link address instead of manually
- // patching the route with the remote link address if one is specified in a
- // Source Link-Layer Address option.
if len(sourceLinkAddr) != 0 {
- r.RemoteLinkAddress = sourceLinkAddr
+ r.ResolveWith(sourceLinkAddr)
}
optsSerializer := header.NDPOptionsSerializer{
- header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress),
+ header.NDPTargetLinkLayerAddressOption(e.nic.LinkAddress()),
}
neighborAdvertSize := header.ICMPv6NeighborAdvertMinimumSize + optsSerializer.Length()
pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
@@ -341,7 +354,14 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize))
packet.SetType(header.ICMPv6NeighborAdvert)
na := header.NDPNeighborAdvert(packet.NDPPayload())
- na.SetSolicitedFlag(solicited)
+
+ // As per RFC 4861 section 7.2.4:
+ //
+ // If the source of the solicitation is the unspecified address, the node
+ // MUST set the Solicited flag to zero and [..]. Otherwise, the node MUST
+ // set the Solicited flag to one and [..].
+ //
+ na.SetSolicitedFlag(!unspecifiedSource)
na.SetOverrideFlag(true)
na.SetTargetAddress(targetAddr)
na.Options().Serialize(optsSerializer)
@@ -419,19 +439,19 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
// If the NA message has the target link layer option, update the link
// address cache with the link address for the target of the message.
- if len(targetLinkAddr) != 0 {
- if e.nud == nil {
+ if e.nud == nil {
+ if len(targetLinkAddr) != 0 {
e.linkAddrCache.AddLinkAddress(e.nic.ID(), targetAddr, targetLinkAddr)
- return
}
-
- e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{
- Solicited: na.SolicitedFlag(),
- Override: na.OverrideFlag(),
- IsRouter: na.RouterFlag(),
- })
+ return
}
+ e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{
+ Solicited: na.SolicitedFlag(),
+ Override: na.OverrideFlag(),
+ IsRouter: na.RouterFlag(),
+ })
+
case header.ICMPv6EchoRequest:
received.EchoRequest.Increment()
icmpHdr, ok := pkt.TransportHeader().Consume(header.ICMPv6EchoMinimumSize)
@@ -635,6 +655,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAdd
r := stack.Route{
LocalAddress: localAddr,
RemoteAddress: addr,
+ LocalLinkAddress: linkEP.LinkAddress(),
RemoteLinkAddress: remoteLinkAddr,
}
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 3affcc4e4..8dc33c560 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -101,14 +101,19 @@ func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtoco
func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) {
}
-type stubNUDHandler struct{}
+type stubNUDHandler struct {
+ probeCount int
+ confirmationCount int
+}
var _ stack.NUDHandler = (*stubNUDHandler)(nil)
-func (*stubNUDHandler) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes stack.LinkAddressResolver) {
+func (s *stubNUDHandler) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes stack.LinkAddressResolver) {
+ s.probeCount++
}
-func (*stubNUDHandler) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags stack.ReachabilityConfirmationFlags) {
+func (s *stubNUDHandler) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags stack.ReachabilityConfirmationFlags) {
+ s.confirmationCount++
}
func (*stubNUDHandler) HandleUpperLevelConfirmation(addr tcpip.Address) {
@@ -118,6 +123,12 @@ var _ stack.NetworkInterface = (*testInterface)(nil)
type testInterface struct {
stack.NetworkLinkEndpoint
+
+ linkAddr tcpip.LinkAddress
+}
+
+func (i *testInterface) LinkAddress() tcpip.LinkAddress {
+ return i.linkAddr
}
func (*testInterface) ID() tcpip.NICID {
@@ -1492,3 +1503,240 @@ func TestPacketQueing(t *testing.T) {
})
}
}
+
+func TestCallsToNeighborCache(t *testing.T) {
+ tests := []struct {
+ name string
+ createPacket func() header.ICMPv6
+ multicast bool
+ source tcpip.Address
+ destination tcpip.Address
+ wantProbeCount int
+ wantConfirmationCount int
+ }{
+ {
+ name: "Unicast Neighbor Solicitation without source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ // "The source link-layer address option SHOULD be included in unicast
+ // solicitations." - RFC 4861 section 4.3
+ //
+ // A Neighbor Advertisement needs to be sent in response, but the
+ // Neighbor Cache shouldn't be updated since we have no useful
+ // information about the sender.
+ wantProbeCount: 0,
+ },
+ {
+ name: "Unicast Neighbor Solicitation with source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ ns.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPSourceLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ wantProbeCount: 1,
+ },
+ {
+ name: "Multicast Neighbor Solicitation without source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.SolicitedNodeAddr(lladdr0),
+ // "The source link-layer address option MUST be included in multicast
+ // solicitations." - RFC 4861 section 4.3
+ wantProbeCount: 0,
+ },
+ {
+ name: "Multicast Neighbor Solicitation with source link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(nsSize))
+ icmp.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+ ns.SetTargetAddress(lladdr0)
+ ns.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPSourceLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.SolicitedNodeAddr(lladdr0),
+ wantProbeCount: 1,
+ },
+ {
+ name: "Unicast Neighbor Advertisement without target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ // "When responding to unicast solicitations, the target link-layer
+ // address option can be omitted since the sender of the solicitation has
+ // the correct link-layer address; otherwise, it would not be able to
+ // send the unicast solicitation in the first place."
+ // - RFC 4861 section 4.4
+ wantConfirmationCount: 1,
+ },
+ {
+ name: "Unicast Neighbor Advertisement with target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ na.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: lladdr0,
+ wantConfirmationCount: 1,
+ },
+ {
+ name: "Multicast Neighbor Advertisement without target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(false)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.IPv6AllNodesMulticastAddress,
+ // "Target link-layer address MUST be included for multicast solicitations
+ // in order to avoid infinite Neighbor Solicitation "recursion" when the
+ // peer node does not have a cache entry to return a Neighbor
+ // Advertisements message." - RFC 4861 section 4.4
+ wantConfirmationCount: 0,
+ },
+ {
+ name: "Multicast Neighbor Advertisement with target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(false)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ na.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.IPv6AllNodesMulticastAddress,
+ wantConfirmationCount: 1,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+ TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+ UseNeighborCache: true,
+ })
+ {
+ if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+ t.Fatalf("CreateNIC(_, _) = %s", err)
+ }
+ if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+ }
+ }
+ {
+ subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+ if err != nil {
+ t.Fatal(err)
+ }
+ s.SetRouteTable(
+ []tcpip.Route{{
+ Destination: subnet,
+ NIC: nicID,
+ }},
+ )
+ }
+
+ netProto := s.NetworkProtocolInstance(ProtocolNumber)
+ if netProto == nil {
+ t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+ }
+ nudHandler := &stubNUDHandler{}
+ ep := netProto.NewEndpoint(&testInterface{linkAddr: linkAddr0}, &stubLinkAddressCache{}, nudHandler, &stubDispatcher{})
+ defer ep.Close()
+
+ if err := ep.Enable(); err != nil {
+ t.Fatalf("ep.Enable(): %s", err)
+ }
+
+ r, err := s.FindRoute(nicID, lladdr0, test.source, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
+ }
+ defer r.Release()
+
+ // TODO(gvisor.dev/issue/4517): Remove the need for this manual patch.
+ r.LocalAddress = test.destination
+
+ icmp := test.createPacket()
+ icmp.SetChecksum(header.ICMPv6Checksum(icmp, r.RemoteAddress, r.LocalAddress, buffer.VectorisedView{}))
+ pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+ ReserveHeaderBytes: header.IPv6MinimumSize,
+ Data: buffer.View(icmp).ToVectorisedView(),
+ })
+ ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(len(icmp)),
+ NextHeader: uint8(header.ICMPv6ProtocolNumber),
+ HopLimit: header.NDPHopLimit,
+ SrcAddr: r.RemoteAddress,
+ DstAddr: r.LocalAddress,
+ })
+ ep.HandlePacket(&r, pkt)
+
+ // Confirm the endpoint calls the correct NUDHandler method.
+ if nudHandler.probeCount != test.wantProbeCount {
+ t.Errorf("got nudHandler.probeCount = %d, want = %d", nudHandler.probeCount, test.wantProbeCount)
+ }
+ if nudHandler.confirmationCount != test.wantConfirmationCount {
+ t.Errorf("got nudHandler.confirmationCount = %d, want = %d", nudHandler.confirmationCount, test.wantConfirmationCount)
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 2bd8f4ece..9670696c7 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -387,7 +387,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
}
func (e *endpoint) packetMustBeFragmented(pkt *stack.PacketBuffer, gso *stack.GSO) bool {
- return pkt.Size() > int(e.nic.MTU()) && (gso == nil || gso.Type == stack.GSONone)
+ return (gso == nil || gso.Type == stack.GSONone) && pkt.Size() > int(e.nic.MTU())
}
// handleFragments fragments pkt and calls the handler function on each
@@ -416,17 +416,18 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, mtu uint32, p
}
n++
if !more {
- break
+ return n, pf.RemainingFragmentCount(), nil
}
}
-
- return n, 0, nil
}
// WritePacket writes a packet to the given destination address and protocol.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
e.addIPHeader(r, pkt, params)
+ return e.writePacket(r, gso, pkt, params.Protocol)
+}
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber) *tcpip.Error {
// iptables filtering. All packets that reach here are locally
// generated.
nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
@@ -468,7 +469,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
}
if e.packetMustBeFragmented(pkt, gso) {
- sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, params.Protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+ sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
// fragment one by one using WritePacket() (current strategy) or if we
// want to create a PacketBufferList from the fragments and feed it to
@@ -501,21 +502,20 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
for pb := pkts.Front(); pb != nil; pb = pb.Next() {
e.addIPHeader(r, pb, params)
if e.packetMustBeFragmented(pb, gso) {
- current := pb
- _, _, err := e.handleFragments(r, gso, e.nic.MTU(), pb, params.Protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+ // Keep track of the packet that is about to be fragmented so it can be
+ // removed once the fragmentation is done.
+ originalPkt := pb
+ if _, _, err := e.handleFragments(r, gso, e.nic.MTU(), pb, params.Protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
// Modify the packet list in place with the new fragments.
- pkts.InsertAfter(current, fragPkt)
- current = current.Next()
+ pkts.InsertAfter(pb, fragPkt)
+ pb = fragPkt
return nil
- })
- if err != nil {
+ }); err != nil {
r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
return 0, err
}
- // The fragmented packet can be released. The rest of the packets can be
- // processed.
- pkts.Remove(pb)
- pb = current
+ // Remove the packet that was just fragmented and process the rest.
+ pkts.Remove(originalPkt)
}
}
@@ -569,11 +569,40 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return n + len(dropped), nil
}
-// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
-// supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
- // TODO(b/146666412): Support IPv6 header-included packets.
- return tcpip.ErrNotSupported
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+ // The packet already has an IP header, but there are a few required checks.
+ h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+ if !ok {
+ return tcpip.ErrMalformedHeader
+ }
+ ip := header.IPv6(h)
+
+ // Always set the payload length.
+ pktSize := pkt.Data.Size()
+ ip.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize))
+
+ // Set the source address when zero.
+ if ip.SourceAddress() == header.IPv6Any {
+ ip.SetSourceAddress(r.LocalAddress)
+ }
+
+ // Set the destination. If the packet already included a destination, it will
+ // be part of the route anyways.
+ ip.SetDestinationAddress(r.RemoteAddress)
+
+ // Populate the packet buffer's network header and don't allow an invalid
+ // packet to be sent.
+ //
+ // Note that parsing only makes sure that the packet is well formed as per the
+ // wire format. We also want to check if the header's fields are valid before
+ // sending the packet.
+ proto, _, _, _, ok := parse.IPv6(pkt)
+ if !ok || !header.IPv6(pkt.NetworkHeader().View()).IsValid(pktSize) {
+ return tcpip.ErrMalformedHeader
+ }
+
+ return e.writePacket(r, nil /* gso */, pkt, proto)
}
// HandlePacket is called by the link layer when new ipv6 packets arrive for
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index bee18d1a8..297868f24 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -49,6 +49,8 @@ const (
fragmentExtHdrID = uint8(header.IPv6FragmentExtHdrIdentifier)
destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier)
noNextHdrID = uint8(header.IPv6NoNextHeaderIdentifier)
+
+ extraHeaderReserve = 50
)
// testReceiveICMP tests receiving an ICMP packet from src to dst. want is the
@@ -181,6 +183,9 @@ func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketB
return fmt.Errorf("fragment #%d: fragmentIPHeader mismatch (-want +got):\n%s", i, diff)
}
+ if got := fragment.AvailableHeaderBytes(); got != extraHeaderReserve {
+ return fmt.Errorf("fragment #%d: got packet.AvailableHeaderBytes() = %d, want = %d", i, got, extraHeaderReserve)
+ }
if fragment.NetworkProtocolNumber != sourcePacket.NetworkProtocolNumber {
return fmt.Errorf("fragment #%d: got fragment.NetworkProtocolNumber = %d, want = %d", i, fragment.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber)
}
@@ -208,8 +213,7 @@ func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketB
reassembledPayload.Append(fragment.Data)
}
- result := reassembledPayload.ToView()
- if diff := cmp.Diff(result, buffer.View(source[sourceIPHeadersLen:])); diff != "" {
+ if diff := cmp.Diff(buffer.View(source[sourceIPHeadersLen:]), reassembledPayload.ToView()); diff != "" {
return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
}
@@ -2217,24 +2221,19 @@ type fragmentInfo struct {
payloadSize uint16
}
-type fragmentationTestCase struct {
+var fragmentationTests = []struct {
description string
mtu uint32
gso *stack.GSO
transHdrLen int
- extraHdrLen int
payloadSize int
wantFragments []fragmentInfo
- expectedFrags int
-}
-
-var fragmentationTests = []fragmentationTestCase{
+}{
{
description: "No Fragmentation",
mtu: 1280,
- gso: &stack.GSO{},
+ gso: nil,
transHdrLen: 0,
- extraHdrLen: header.IPv6MinimumSize,
payloadSize: 1000,
wantFragments: []fragmentInfo{
{offset: 0, payloadSize: 1000, more: false},
@@ -2243,9 +2242,8 @@ var fragmentationTests = []fragmentationTestCase{
{
description: "Fragmented",
mtu: 1280,
- gso: &stack.GSO{},
+ gso: nil,
transHdrLen: 0,
- extraHdrLen: header.IPv6MinimumSize,
payloadSize: 2000,
wantFragments: []fragmentInfo{
{offset: 0, payloadSize: 1240, more: true},
@@ -2255,20 +2253,18 @@ var fragmentationTests = []fragmentationTestCase{
{
description: "No fragmentation with big header",
mtu: 2000,
- gso: &stack.GSO{},
+ gso: nil,
transHdrLen: 100,
- extraHdrLen: header.IPv6MinimumSize,
payloadSize: 1000,
wantFragments: []fragmentInfo{
{offset: 0, payloadSize: 1100, more: false},
},
},
{
- description: "Fragmented with gso nil",
+ description: "Fragmented with gso none",
mtu: 1280,
- gso: nil,
+ gso: &stack.GSO{Type: stack.GSONone},
transHdrLen: 0,
- extraHdrLen: header.IPv6MinimumSize,
payloadSize: 1400,
wantFragments: []fragmentInfo{
{offset: 0, payloadSize: 1240, more: true},
@@ -2278,30 +2274,17 @@ var fragmentationTests = []fragmentationTestCase{
{
description: "Fragmented with big header",
mtu: 1280,
- gso: &stack.GSO{},
+ gso: nil,
transHdrLen: 100,
- extraHdrLen: header.IPv6MinimumSize,
payloadSize: 1200,
wantFragments: []fragmentInfo{
{offset: 0, payloadSize: 1240, more: true},
{offset: 154, payloadSize: 76, more: false},
},
},
- {
- description: "Fragmented with big header and prependable bytes",
- mtu: 1280,
- gso: &stack.GSO{},
- transHdrLen: 20,
- extraHdrLen: header.IPv6MinimumSize + 66,
- payloadSize: 1500,
- wantFragments: []fragmentInfo{
- {offset: 0, payloadSize: 1240, more: true},
- {offset: 154, payloadSize: 296, more: false},
- },
- },
}
-func TestFragmentation(t *testing.T) {
+func TestFragmentationWritePacket(t *testing.T) {
const (
ttl = 42
tos = stack.DefaultTOS
@@ -2310,7 +2293,7 @@ func TestFragmentation(t *testing.T) {
for _, ft := range fragmentationTests {
t.Run(ft.description, func(t *testing.T) {
- pkt := testutil.MakeRandPkt(ft.transHdrLen, ft.extraHdrLen, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
+ pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
source := pkt.Clone()
ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
r := buildRoute(t, ep)
@@ -2331,10 +2314,8 @@ func TestFragmentation(t *testing.T) {
if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 {
t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
}
- if len(ep.WrittenPackets) > 0 {
- if err := compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
- t.Error(err)
- }
+ if err := compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+ t.Error(err)
}
})
}
@@ -2368,7 +2349,7 @@ func TestFragmentationWritePackets(t *testing.T) {
insertAfter: 1,
},
}
- tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, header.IPv6MinimumSize, []int{1}, header.IPv6ProtocolNumber)
+ tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, extraHeaderReserve+header.IPv6MinimumSize, []int{1}, header.IPv6ProtocolNumber)
for _, test := range tests {
t.Run(test.description, func(t *testing.T) {
@@ -2378,7 +2359,7 @@ func TestFragmentationWritePackets(t *testing.T) {
for i := 0; i < test.insertBefore; i++ {
pkts.PushBack(tinyPacket.Clone())
}
- pkt := testutil.MakeRandPkt(ft.transHdrLen, ft.extraHdrLen, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
+ pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
source := pkt
pkts.PushBack(pkt.Clone())
for i := 0; i < test.insertAfter; i++ {
@@ -2480,7 +2461,7 @@ func TestFragmentationErrors(t *testing.T) {
for _, ft := range tests {
t.Run(ft.description, func(t *testing.T) {
- pkt := testutil.MakeRandPkt(ft.transHdrLen, header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
+ pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
ep := testutil.NewMockLinkEndpoint(ft.mtu, ft.mockError, ft.allowPackets)
r := buildRoute(t, ep)
err := r.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 9033a9ed5..ac20f217e 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -15,6 +15,7 @@
package ipv6
import (
+ "context"
"strings"
"testing"
"time"
@@ -398,16 +399,17 @@ func TestNeighorSolicitationResponse(t *testing.T) {
}
tests := []struct {
- name string
- nsOpts header.NDPOptionsSerializer
- nsSrcLinkAddr tcpip.LinkAddress
- nsSrc tcpip.Address
- nsDst tcpip.Address
- nsInvalid bool
- naDstLinkAddr tcpip.LinkAddress
- naSolicited bool
- naSrc tcpip.Address
- naDst tcpip.Address
+ name string
+ nsOpts header.NDPOptionsSerializer
+ nsSrcLinkAddr tcpip.LinkAddress
+ nsSrc tcpip.Address
+ nsDst tcpip.Address
+ nsInvalid bool
+ naDstLinkAddr tcpip.LinkAddress
+ naSolicited bool
+ naSrc tcpip.Address
+ naDst tcpip.Address
+ performsLinkResolution bool
}{
{
name: "Unspecified source to solicited-node multicast destination",
@@ -416,7 +418,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
nsSrc: header.IPv6Any,
nsDst: nicAddrSNMC,
nsInvalid: false,
- naDstLinkAddr: remoteLinkAddr0,
+ naDstLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllNodesMulticastAddress),
naSolicited: false,
naSrc: nicAddr,
naDst: header.IPv6AllNodesMulticastAddress,
@@ -449,7 +451,6 @@ func TestNeighorSolicitationResponse(t *testing.T) {
nsDst: nicAddr,
nsInvalid: true,
},
-
{
name: "Specified source with 1 source ll to multicast destination",
nsOpts: header.NDPOptionsSerializer{
@@ -509,6 +510,10 @@ func TestNeighorSolicitationResponse(t *testing.T) {
naSolicited: true,
naSrc: nicAddr,
naDst: remoteAddr,
+ // Since we send a unicast solicitation to a node without an entry for
+ // the remote, the node needs to perform neighbor discovery to get the
+ // remote's link address to send the advertisement response.
+ performsLinkResolution: true,
},
{
name: "Specified source with 1 source ll to unicast destination",
@@ -615,11 +620,78 @@ func TestNeighorSolicitationResponse(t *testing.T) {
t.Fatalf("got invalid = %d, want = 0", got)
}
- p, got := e.Read()
+ if test.performsLinkResolution {
+ p, got := e.ReadContext(context.Background())
+ if !got {
+ t.Fatal("expected an NDP NS response")
+ }
+
+ if p.Route.LocalAddress != nicAddr {
+ t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, nicAddr)
+ }
+ if p.Route.LocalLinkAddress != nicLinkAddr {
+ t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr)
+ }
+ respNSDst := header.SolicitedNodeAddr(test.nsSrc)
+ if p.Route.RemoteAddress != respNSDst {
+ t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst)
+ }
+ if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want {
+ t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want)
+ }
+
+ checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+ checker.SrcAddr(nicAddr),
+ checker.DstAddr(respNSDst),
+ checker.TTL(header.NDPHopLimit),
+ checker.NDPNS(
+ checker.NDPNSTargetAddress(test.nsSrc),
+ checker.NDPNSOptions([]header.NDPOption{
+ header.NDPSourceLinkLayerAddressOption(nicLinkAddr),
+ }),
+ ))
+
+ ser := header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ }
+ ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + ser.Length()
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+ pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+ pkt.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(pkt.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(true)
+ na.SetTargetAddress(test.nsSrc)
+ na.Options().Serialize(ser)
+ pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, nicAddr, buffer.VectorisedView{}))
+ payloadLength := hdr.UsedLength()
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(payloadLength),
+ NextHeader: uint8(header.ICMPv6ProtocolNumber),
+ HopLimit: header.NDPHopLimit,
+ SrcAddr: test.nsSrc,
+ DstAddr: nicAddr,
+ })
+ e.InjectLinkAddr(ProtocolNumber, "", stack.NewPacketBuffer(stack.PacketBufferOptions{
+ Data: hdr.View().ToVectorisedView(),
+ }))
+ }
+
+ p, got := e.ReadContext(context.Background())
if !got {
t.Fatal("expected an NDP NA response")
}
+ if p.Route.LocalAddress != test.naSrc {
+ t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, test.naSrc)
+ }
+ if p.Route.LocalLinkAddress != nicLinkAddr {
+ t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr)
+ }
+ if p.Route.RemoteAddress != test.naDst {
+ t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, test.naDst)
+ }
if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
}
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index eba97334e..d09ebe7fa 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -123,6 +123,7 @@ go_test(
"//pkg/tcpip/header",
"//pkg/tcpip/link/channel",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/network/arp",
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/ports",
diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go
index 4d69a4de1..be61a21af 100644
--- a/pkg/tcpip/stack/neighbor_entry.go
+++ b/pkg/tcpip/stack/neighbor_entry.go
@@ -406,9 +406,9 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
// INCOMPLETE state." - RFC 4861 section 7.2.5
case Reachable, Stale, Delay, Probe:
- sameLinkAddr := e.neigh.LinkAddr == linkAddr
+ isLinkAddrDifferent := len(linkAddr) != 0 && e.neigh.LinkAddr != linkAddr
- if !sameLinkAddr {
+ if isLinkAddrDifferent {
if !flags.Override {
if e.neigh.State == Reachable {
e.dispatchChangeEventLocked(Stale)
@@ -431,7 +431,7 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
}
}
- if flags.Solicited && (flags.Override || sameLinkAddr) {
+ if flags.Solicited && (flags.Override || !isLinkAddrDifferent) {
if e.neigh.State != Reachable {
e.dispatchChangeEventLocked(Reachable)
}
diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go
index e79abebca..3ee2a3b31 100644
--- a/pkg/tcpip/stack/neighbor_entry_test.go
+++ b/pkg/tcpip/stack/neighbor_entry_test.go
@@ -83,15 +83,18 @@ func eventDiffOptsWithSort() []cmp.Option {
// | Reachable | Stale | Reachable timer expired | | Changed |
// | Reachable | Stale | Probe or confirmation w/ different address | | Changed |
// | Stale | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
+// | Stale | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Stale | Stale | Override confirmation | Update LinkAddr | Changed |
// | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed |
// | Stale | Delay | Packet sent | | Changed |
// | Delay | Reachable | Upper-layer confirmation | | Changed |
// | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
+// | Delay | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Delay | Stale | Probe or confirmation w/ different address | | Changed |
// | Delay | Probe | Delay timer expired | Send probe | Changed |
// | Probe | Reachable | Solicited override confirmation | Update LinkAddr | Changed |
// | Probe | Reachable | Solicited confirmation w/ same address | Notify wakers | Changed |
+// | Probe | Reachable | Solicited confirmation w/o address | Notify wakers | Changed |
// | Probe | Stale | Probe or confirmation w/ different address | | Changed |
// | Probe | Probe | Retransmit timer expired | Send probe | Changed |
// | Probe | Failed | Max probes sent without reply | Notify wakers | Removed |
@@ -1370,6 +1373,77 @@ func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
nudDisp.mu.Unlock()
}
+func TestEntryStaleToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ e, nudDisp, linkRes, _ := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: false,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Stale {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Stale)
+ }
+ e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Reachable {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+ }
+ if e.neigh.LinkAddr != entryTestLinkAddr1 {
+ t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1)
+ }
+ e.mu.Unlock()
+
+ wantProbes := []entryTestProbeInfo{
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) {
c := DefaultNUDConfigurations()
e, nudDisp, linkRes, _ := entryTestSetup(c)
@@ -1752,6 +1826,100 @@ func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
nudDisp.mu.Unlock()
}
+func TestEntryDelayToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ c.MaxMulticastProbes = 1
+ // Eliminate random factors from ReachableTime computation so the transition
+ // from Reachable to Stale will only take BaseReachableTime duration.
+ c.MinRandomFactor = 1
+ c.MaxRandomFactor = 1
+
+ e, nudDisp, linkRes, clock := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: false,
+ Override: false,
+ IsRouter: false,
+ })
+ e.handlePacketQueuedLocked()
+ if e.neigh.State != Delay {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Delay)
+ }
+ e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Reachable {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+ }
+ if e.neigh.LinkAddr != entryTestLinkAddr1 {
+ t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1)
+ }
+ e.mu.Unlock()
+
+ wantProbes := []entryTestProbeInfo{
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ clock.Advance(c.BaseReachableTime)
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Delay,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) {
c := DefaultNUDConfigurations()
e, nudDisp, linkRes, _ := entryTestSetup(c)
@@ -2665,6 +2833,115 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin
nudDisp.mu.Unlock()
}
+func TestEntryProbeToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ // Eliminate random factors from ReachableTime computation so the transition
+ // from Reachable to Stale will only take BaseReachableTime duration.
+ c.MinRandomFactor = 1
+ c.MaxRandomFactor = 1
+
+ e, nudDisp, linkRes, clock := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: false,
+ Override: false,
+ IsRouter: false,
+ })
+ e.handlePacketQueuedLocked()
+ e.mu.Unlock()
+
+ clock.Advance(c.DelayFirstProbeTime)
+
+ wantProbes := []entryTestProbeInfo{
+ // The first probe is caused by the Unknown-to-Incomplete transition.
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ // The second probe is caused by the Delay-to-Probe transition.
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: entryTestLinkAddr1,
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ e.mu.Lock()
+ if e.neigh.State != Probe {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Probe)
+ }
+ e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if e.neigh.State != Reachable {
+ t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+ }
+ e.mu.Unlock()
+
+ clock.Advance(c.BaseReachableTime)
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Delay,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Probe,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Stale,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
func TestEntryProbeToFailed(t *testing.T) {
c := DefaultNUDConfigurations()
c.MaxMulticastProbes = 3
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 8828cc5fe..dcd4319bf 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sleep"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
)
@@ -686,7 +685,9 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
// packet to forward.
fwdPkt := NewPacketBuffer(PacketBufferOptions{
ReserveHeaderBytes: int(n.LinkEndpoint.MaxHeaderLength()),
- Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
+ // We need to do a deep copy of the IP packet because WritePacket (and
+ // friends) take ownership of the packet buffer, but we do not own it.
+ Data: PayloadSince(pkt.NetworkHeader()).ToVectorisedView(),
})
// TODO(b/143425874) Decrease the TTL field in forwarded packets.
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 105583c49..7f54a6de8 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -311,11 +311,25 @@ func (h PacketHeader) Consume(size int) (v buffer.View, consumed bool) {
}
// PayloadSince returns packet payload starting from and including a particular
-// header. This method isn't optimized and should be used in test only.
+// header.
+//
+// The returned View is owned by the caller - its backing buffer is separate
+// from the packet header's underlying packet buffer.
func PayloadSince(h PacketHeader) buffer.View {
- var v buffer.View
+ size := h.pk.Data.Size()
+ for _, hinfo := range h.pk.headers[h.typ:] {
+ size += len(hinfo.buf)
+ }
+
+ v := make(buffer.View, 0, size)
+
for _, hinfo := range h.pk.headers[h.typ:] {
v = append(v, hinfo.buf...)
}
- return append(v, h.pk.Data.ToView()...)
+
+ for _, view := range h.pk.Data.Views() {
+ v = append(v, view...)
+ }
+
+ return v
}
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 25f80c1f8..b76e2d37b 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -126,6 +126,12 @@ func (r *Route) GSOMaxSize() uint32 {
return 0
}
+// ResolveWith immediately resolves a route with the specified remote link
+// address.
+func (r *Route) ResolveWith(addr tcpip.LinkAddress) {
+ r.RemoteLinkAddress = addr
+}
+
// Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in
// case address resolution requires blocking, e.g. wait for ARP reply. Waker is
// notified when address resolution is complete (success or not).
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 38994cca1..e75f58c64 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -34,6 +34,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -3498,6 +3499,52 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
}
}
+func TestResolveWith(t *testing.T) {
+ const (
+ unspecifiedNICID = 0
+ nicID = 1
+ )
+
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol},
+ })
+ ep := channel.New(0, defaultMTU, "")
+ ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+ if err := s.CreateNIC(nicID, ep); err != nil {
+ t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+ }
+ addr := tcpip.ProtocolAddress{
+ Protocol: header.IPv4ProtocolNumber,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: tcpip.Address([]byte{192, 168, 1, 58}),
+ PrefixLen: 24,
+ },
+ }
+ if err := s.AddProtocolAddress(nicID, addr); err != nil {
+ t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, addr, err)
+ }
+
+ s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}})
+
+ remoteAddr := tcpip.Address([]byte{192, 168, 1, 59})
+ r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, remoteAddr, header.IPv4ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, remoteAddr, header.IPv4ProtocolNumber, err)
+ }
+ defer r.Release()
+
+ // Should initially require resolution.
+ if !r.IsResolutionRequired() {
+ t.Fatal("got r.IsResolutionRequired() = false, want = true")
+ }
+
+ // After being manually resolved, the route should no longer require resolution.
+ r.ResolveWith("\x01")
+ if r.IsResolutionRequired() {
+ t.Fatal("got r.IsResolutionRequired() = true, want = false")
+ }
+}
+
// TestRouteReleaseAfterAddrRemoval tests that releasing a Route after its
// associated address is removed should not cause a panic.
func TestRouteReleaseAfterAddrRemoval(t *testing.T) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index c42bb0991..d77848d61 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -111,6 +111,7 @@ var (
ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"}
ErrNotPermitted = &Error{msg: "operation not permitted"}
ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
+ ErrMalformedHeader = &Error{msg: "header is malformed"}
)
var messageToError map[string]*Error
@@ -159,6 +160,7 @@ func StringToError(s string) *Error {
ErrBroadcastDisabled,
ErrNotPermitted,
ErrAddressFamilyNotSupported,
+ ErrMalformedHeader,
}
messageToError = make(map[string]*Error)
diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD
index a4f141253..34aab32d0 100644
--- a/pkg/tcpip/tests/integration/BUILD
+++ b/pkg/tcpip/tests/integration/BUILD
@@ -16,6 +16,7 @@ go_test(
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
"//pkg/tcpip/link/channel",
+ "//pkg/tcpip/link/ethernet",
"//pkg/tcpip/link/loopback",
"//pkg/tcpip/link/pipe",
"//pkg/tcpip/network/arp",
diff --git a/pkg/tcpip/tests/integration/forward_test.go b/pkg/tcpip/tests/integration/forward_test.go
index ffd38ee1a..0dcef7b04 100644
--- a/pkg/tcpip/tests/integration/forward_test.go
+++ b/pkg/tcpip/tests/integration/forward_test.go
@@ -21,6 +21,7 @@ import (
"github.com/google/go-cmp/cmp"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -178,19 +179,19 @@ func TestForwarding(t *testing.T) {
routerStack := stack.New(stackOpts)
host2Stack := stack.New(stackOpts)
- host1NIC, routerNIC1 := pipe.New(host1NICLinkAddr, routerNIC1LinkAddr, stack.CapabilityResolutionRequired)
- routerNIC2, host2NIC := pipe.New(routerNIC2LinkAddr, host2NICLinkAddr, stack.CapabilityResolutionRequired)
+ host1NIC, routerNIC1 := pipe.New(host1NICLinkAddr, routerNIC1LinkAddr)
+ routerNIC2, host2NIC := pipe.New(routerNIC2LinkAddr, host2NICLinkAddr)
- if err := host1Stack.CreateNIC(host1NICID, host1NIC); err != nil {
+ if err := host1Stack.CreateNIC(host1NICID, ethernet.New(host1NIC)); err != nil {
t.Fatalf("host1Stack.CreateNIC(%d, _): %s", host1NICID, err)
}
- if err := routerStack.CreateNIC(routerNICID1, routerNIC1); err != nil {
+ if err := routerStack.CreateNIC(routerNICID1, ethernet.New(routerNIC1)); err != nil {
t.Fatalf("routerStack.CreateNIC(%d, _): %s", routerNICID1, err)
}
- if err := routerStack.CreateNIC(routerNICID2, routerNIC2); err != nil {
+ if err := routerStack.CreateNIC(routerNICID2, ethernet.New(routerNIC2)); err != nil {
t.Fatalf("routerStack.CreateNIC(%d, _): %s", routerNICID2, err)
}
- if err := host2Stack.CreateNIC(host2NICID, host2NIC); err != nil {
+ if err := host2Stack.CreateNIC(host2NICID, ethernet.New(host2NIC)); err != nil {
t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err)
}
diff --git a/pkg/tcpip/tests/integration/link_resolution_test.go b/pkg/tcpip/tests/integration/link_resolution_test.go
index bf3a6f6ee..6ddcda70c 100644
--- a/pkg/tcpip/tests/integration/link_resolution_test.go
+++ b/pkg/tcpip/tests/integration/link_resolution_test.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -126,12 +127,12 @@ func TestPing(t *testing.T) {
host1Stack := stack.New(stackOpts)
host2Stack := stack.New(stackOpts)
- host1NIC, host2NIC := pipe.New(host1NICLinkAddr, host2NICLinkAddr, stack.CapabilityResolutionRequired)
+ host1NIC, host2NIC := pipe.New(host1NICLinkAddr, host2NICLinkAddr)
- if err := host1Stack.CreateNIC(host1NICID, host1NIC); err != nil {
+ if err := host1Stack.CreateNIC(host1NICID, ethernet.New(host1NIC)); err != nil {
t.Fatalf("host1Stack.CreateNIC(%d, _): %s", host1NICID, err)
}
- if err := host2Stack.CreateNIC(host2NICID, host2NIC); err != nil {
+ if err := host2Stack.CreateNIC(host2NICID, ethernet.New(host2NIC)); err != nil {
t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err)
}
diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
index 4f2ca7f54..f1028823b 100644
--- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go
+++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
@@ -80,6 +80,7 @@ func TestPingMulticastBroadcast(t *testing.T) {
SrcAddr: remoteIPv4Addr,
DstAddr: dst,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),
@@ -250,6 +251,7 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) {
SrcAddr: remoteIPv4Addr,
DstAddr: dst,
})
+ ip.SetChecksum(^ip.CalculateChecksum())
e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: hdr.View().ToVectorisedView(),