From abbdcebc543242862fad0984db2db0a842021917 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 8 Mar 2021 20:37:14 -0800 Subject: Implement /proc/sys/net/ipv4/ip_local_port_range Speeds up the socket stress tests by a couple orders of magnitude. PiperOrigin-RevId: 361721050 --- pkg/sentry/fs/proc/sys_net.go | 121 +++++++++++++++++++++++++++++++++++- pkg/sentry/fsimpl/proc/tasks_sys.go | 78 +++++++++++++++++++++-- pkg/sentry/inet/inet.go | 8 +++ pkg/sentry/inet/test_stack.go | 12 ++++ pkg/sentry/socket/hostinet/stack.go | 11 ++++ pkg/sentry/socket/netstack/stack.go | 10 +++ 6 files changed, 233 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 52061175f..bbe282c03 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -17,6 +17,7 @@ package proc import ( "fmt" "io" + "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -26,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -498,6 +500,120 @@ func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IO return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled) } +// portRangeInode implements fs.InodeOperations. It provides and allows +// modification of the range of ephemeral ports that IPv4 and IPv6 sockets +// choose from. +// +// +stateify savable +type portRangeInode struct { + fsutil.SimpleFileInode + + stack inet.Stack `state:"wait"` + + // start and end store the port range. We must save/restore this here, + // since a netstack instance is created on restore. + start *uint16 + end *uint16 +} + +func newPortRangeInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ipf := &portRangeInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + stack: s, + } + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ctx, ipf, msrc, sattr) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*portRangeInode) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// +stateify savable +type portRangeFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + inode *portRangeInode +} + +// GetFile implements fs.InodeOperations.GetFile. +func (in *portRangeInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &portRangeFile{ + inode: in, + }), nil +} + +// Read implements fs.FileOperations.Read. +func (pf *portRangeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + if pf.inode.start == nil { + start, end := pf.inode.stack.PortRange() + pf.inode.start = &start + pf.inode.end = &end + } + + contents := fmt.Sprintf("%d %d\n", *pf.inode.start, *pf.inode.end) + n, err := dst.CopyOut(ctx, []byte(contents)) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +// +// Offset is ignored, multiple writes are not supported. +func (pf *portRangeFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Only consider size of one memory page for input for performance + // reasons. + src = src.TakeFirst(usermem.PageSize - 1) + + ports := make([]int32, 2) + n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts) + if err != nil { + return 0, err + } + + // Port numbers must be uint16s. + if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { + return 0, syserror.EINVAL + } + + if err := pf.inode.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { + return 0, err + } + if pf.inode.start == nil { + pf.inode.start = new(uint16) + pf.inode.end = new(uint16) + } + *pf.inode.start = uint16(ports[0]) + *pf.inode.end = uint16(ports[1]) + return n, nil +} + func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { contents := map[string]*fs.Inode{ // Add tcp_sack. @@ -506,12 +622,15 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine // Add ip_forward. "ip_forward": newIPForwardingInode(ctx, msrc, s), + // Allow for configurable ephemeral port ranges. Note that this + // controls ports for both IPv4 and IPv6 sockets. + "ip_local_port_range": newPortRangeInode(ctx, msrc, s), + // The following files are simple stubs until they are // implemented in netstack, most of these files are // configuration related. We use the value closest to the // actual netstack behavior or any empty file, all of these // files will have mode 0444 (read-only for all users). - "ip_local_port_range": newStaticProcInode(ctx, msrc, []byte("16000 65535")), "ip_local_reserved_ports": newStaticProcInode(ctx, msrc, []byte("")), "ipfrag_time": newStaticProcInode(ctx, msrc, []byte("30")), "ip_nonlocal_bind": newStaticProcInode(ctx, msrc, []byte("0")), diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index fd7823daa..fb274b78e 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -17,6 +17,7 @@ package proc import ( "bytes" "fmt" + "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -69,17 +70,17 @@ func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]kernfs.Inode{ "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ - "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), - "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), - "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), - "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), - "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), + "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), + "ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}), + "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), + "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), + "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), + "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). - "ip_local_port_range": fs.newInode(ctx, root, 0444, newStaticFile("16000 65535")), "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")), "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")), "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")), @@ -421,3 +422,68 @@ func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offs } return n, nil } + +// portRange implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/ip_local_port_range. +// +// +stateify savable +type portRange struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` + + // start and end store the port range. We must save/restore this here, + // since a netstack instance is created on restore. + start *uint16 + end *uint16 +} + +var _ vfs.WritableDynamicBytesSource = (*portRange)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { + if pr.start == nil { + start, end := pr.stack.PortRange() + pr.start = &start + pr.end = &end + } + _, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end) + return err +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is + // large. + src = src.TakeFirst(usermem.PageSize - 1) + + ports := make([]int32, 2) + n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts) + if err != nil { + return 0, err + } + + // Port numbers must be uint16s. + if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { + return 0, syserror.EINVAL + } + + if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { + return 0, err + } + if pr.start == nil { + pr.start = new(uint16) + pr.end = new(uint16) + } + *pr.start = uint16(ports[0]) + *pr.end = uint16(ports[1]) + return n, nil +} diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index f31277d30..6b71bd3a9 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -93,6 +93,14 @@ type Stack interface { // SetForwarding enables or disables packet forwarding between NICs. SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error + + // PortRange returns the UDP and TCP inclusive range of ephemeral ports + // used in both IPv4 and IPv6. + PortRange() (uint16, uint16) + + // SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range + // (inclusive). + SetPortRange(start uint16, end uint16) error } // Interface contains information about a network interface. diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 9ebeba8a3..03e2608c2 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -164,3 +164,15 @@ func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable b s.IPForwarding = enable return nil } + +// PortRange implements inet.Stack.PortRange. +func (*TestStack) PortRange() (uint16, uint16) { + // Use the default Linux values per net/ipv4/af_inet.c:inet_init_net(). + return 32768, 28232 +} + +// SetPortRange implements inet.Stack.SetPortRange. +func (*TestStack) SetPortRange(start uint16, end uint16) error { + // No-op. + return nil +} diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index e6323244c..5bcf92e14 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -504,3 +504,14 @@ func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { return syserror.EACCES } + +// PortRange implements inet.Stack.PortRange. +func (*Stack) PortRange() (uint16, uint16) { + // Use the default Linux values per net/ipv4/af_inet.c:inet_init_net(). + return 32768, 28232 +} + +// SetPortRange implements inet.Stack.SetPortRange. +func (*Stack) SetPortRange(start uint16, end uint16) error { + return syserror.EACCES +} diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 71c3bc034..b215067cf 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -478,3 +478,13 @@ func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) } return nil } + +// PortRange implements inet.Stack.PortRange. +func (s *Stack) PortRange() (uint16, uint16) { + return s.Stack.PortRange() +} + +// SetPortRange implements inet.Stack.SetPortRange. +func (s *Stack) SetPortRange(start uint16, end uint16) error { + return syserr.TranslateNetstackError(s.Stack.SetPortRange(start, end)).ToError() +} -- cgit v1.2.3