From 9143fcd7fd38243dd40f927dafaeb75f6ef8ef49 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 21 Jan 2020 14:47:17 -0800 Subject: Add UDP matchers. --- pkg/abi/linux/netfilter.go | 92 +++++++++++++++++ pkg/sentry/socket/netfilter/netfilter.go | 164 ++++++++++++++++++++++++++----- pkg/tcpip/iptables/BUILD | 2 + pkg/tcpip/iptables/iptables.go | 3 + pkg/tcpip/iptables/tcp_matcher.go | 122 +++++++++++++++++++++++ pkg/tcpip/iptables/types.go | 17 ++++ pkg/tcpip/iptables/udp_matcher.go | 127 ++++++++++++++++++++++++ pkg/tcpip/network/ipv4/ipv4.go | 10 +- 8 files changed, 509 insertions(+), 28 deletions(-) create mode 100644 pkg/tcpip/iptables/tcp_matcher.go create mode 100644 pkg/tcpip/iptables/udp_matcher.go (limited to 'pkg') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index 33fcc6c95..fb4588272 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -340,3 +340,95 @@ func goString(cstring []byte) string { } return string(cstring) } + +// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp +// in include/uapi/linux/netfilter/xt_tcpudp.h. +type XTTCP struct { + // SourcePortStart specifies the inclusive start of the range of source + // ports to which the matcher applies. + SourcePortStart uint16 + + // SourcePortEnd specifies the inclusive end of the range of source ports + // to which the matcher applies. + SourcePortEnd uint16 + + // DestinationPortStart specifies the start of the destination port + // range to which the matcher applies. + DestinationPortStart uint16 + + // DestinationPortEnd specifies the start of the destination port + // range to which the matcher applies. + DestinationPortEnd uint16 + + // Option specifies that a particular TCP option must be set. + Option uint8 + + // FlagMask masks the FlagCompare byte when comparing to the TCP flag + // fields. + FlagMask uint8 + + // FlagCompare is binary and-ed with the TCP flag fields. + FlagCompare uint8 + + // InverseFlags flips the meaning of certain fields. See the + // TX_TCP_INV_* flags. + InverseFlags uint8 +} + +// SizeOfXTTCP is the size of an XTTCP. +const SizeOfXTTCP = 12 + +// Flags in XTTCP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter/xt_tcpudp.h. +const ( + // Invert the meaning of SourcePortStart/End. + XT_TCP_INV_SRCPT = 0x01 + // Invert the meaning of DestinationPortStart/End. + XT_TCP_INV_DSTPT = 0x02 + // Invert the meaning of FlagCompare. + XT_TCP_INV_FLAGS = 0x04 + // Invert the meaning of Option. + XT_TCP_INV_OPTION = 0x08 + // Enable all flags. + XT_TCP_INV_MASK = 0x0F +) + +// XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp +// in include/uapi/linux/netfilter/xt_tcpudp.h. +type XTUDP struct { + // SourcePortStart specifies the inclusive start of the range of source + // ports to which the matcher applies. + SourcePortStart uint16 + + // SourcePortEnd specifies the inclusive end of the range of source ports + // to which the matcher applies. + SourcePortEnd uint16 + + // DestinationPortStart specifies the start of the destination port + // range to which the matcher applies. + DestinationPortStart uint16 + + // DestinationPortEnd specifies the start of the destination port + // range to which the matcher applies. + DestinationPortEnd uint16 + + // InverseFlags flips the meaning of certain fields. See the + // TX_UDP_INV_* flags. + InverseFlags uint8 + + _ uint8 +} + +// SizeOfXTUDP is the size of an XTUDP. +const SizeOfXTUDP = 10 + +// Flags in XTUDP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter/xt_tcpudp.h. +const ( + // Invert the meaning of SourcePortStart/End. + XT_UDP_INV_SRCPT = 0x01 + // Invert the meaning of DestinationPortStart/End. + XT_UDP_INV_DSTPT = 0x02 + // Enable all flags. + XT_UDP_INV_MASK = 0x03 +) diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index e1f2bacce..45296b339 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -131,6 +131,7 @@ func FillDefaultIPTables(stack *stack.Stack) { stack.SetIPTables(ipt) } +// TODO: Return proto. // convertNetstackToBinary converts the iptables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a bit, reading some offsets, @@ -318,10 +319,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { } var entry linux.IPTEntry buf := optVal[:linux.SizeOfIPTEntry] - optVal = optVal[linux.SizeOfIPTEntry:] binary.Unmarshal(buf, usermem.ByteOrder, &entry) - if entry.TargetOffset != linux.SizeOfIPTEntry { - // TODO(gvisor.dev/issue/170): Support matchers. + initialOptValLen := len(optVal) + optVal = optVal[linux.SizeOfIPTEntry:] + + if entry.TargetOffset < linux.SizeOfIPTEntry { + log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset) return syserr.ErrInvalidArgument } @@ -332,19 +335,41 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { return err } + // TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables. + // Get matchers. + matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry + if len(optVal) < int(matchersSize) { + log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + } + matchers, err := parseMatchers(filter, optVal[:matchersSize]) + if err != nil { + log.Warningf("netfilter: failed to parse matchers: %v", err) + return err + } + optVal = optVal[matchersSize:] + // Get the target of the rule. - target, consumed, err := parseTarget(optVal) + targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + } + target, err := parseTarget(optVal[:targetSize]) if err != nil { return err } - optVal = optVal[consumed:] + optVal = optVal[targetSize:] table.Rules = append(table.Rules, iptables.Rule{ - Filter: filter, - Target: target, + Filter: filter, + Target: target, + Matchers: matchers, }) offsets = append(offsets, offset) - offset += linux.SizeOfIPTEntry + consumed + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + } } // Go through the list of supported hooks for this table and, for each @@ -401,12 +426,105 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { return nil } -// parseTarget parses a target from the start of optVal and returns the target -// along with the number of bytes it occupies in optVal. -func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { +// parseMatchers parses 0 or more matchers from optVal. optVal should contain +// only the matchers. +func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) { + var matchers []iptables.Matcher + for len(optVal) > 0 { + log.Infof("parseMatchers: optVal has len %d", len(optVal)) + // Get the XTEntryMatch. + if len(optVal) < linux.SizeOfXTEntryMatch { + log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + var match linux.XTEntryMatch + buf := optVal[:linux.SizeOfXTEntryMatch] + binary.Unmarshal(buf, usermem.ByteOrder, &match) + log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match) + + // Check some invariants. + if match.MatchSize < linux.SizeOfXTEntryMatch { + log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) + return nil, syserr.ErrInvalidArgument + } + if len(optVal) < int(match.MatchSize) { + log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + + buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize] + var matcher iptables.Matcher + var err error + switch match.Name.String() { + case "tcp": + if len(buf) < linux.SizeOfXTTCP { + log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + var matchData linux.XTTCP + // For alignment reasons, the match's total size may exceed what's + // strictly necessary to hold matchData. + binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) + log.Infof("parseMatchers: parsed XTTCP: %+v", matchData) + matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{ + SourcePortStart: matchData.SourcePortStart, + SourcePortEnd: matchData.SourcePortEnd, + DestinationPortStart: matchData.DestinationPortStart, + DestinationPortEnd: matchData.DestinationPortEnd, + Option: matchData.Option, + FlagMask: matchData.FlagMask, + FlagCompare: matchData.FlagCompare, + InverseFlags: matchData.InverseFlags, + }) + if err != nil { + log.Warningf("netfilter: failed to create TCP matcher: %v", err) + return nil, syserr.ErrInvalidArgument + } + + case "udp": + if len(buf) < linux.SizeOfXTUDP { + log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + var matchData linux.XTUDP + // For alignment reasons, the match's total size may exceed what's + // strictly necessary to hold matchData. + binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) + log.Infof("parseMatchers: parsed XTUDP: %+v", matchData) + matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{ + SourcePortStart: matchData.SourcePortStart, + SourcePortEnd: matchData.SourcePortEnd, + DestinationPortStart: matchData.DestinationPortStart, + DestinationPortEnd: matchData.DestinationPortEnd, + InverseFlags: matchData.InverseFlags, + }) + if err != nil { + log.Warningf("netfilter: failed to create UDP matcher: %v", err) + return nil, syserr.ErrInvalidArgument + } + + default: + log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String()) + return nil, syserr.ErrInvalidArgument + } + + matchers = append(matchers, matcher) + + // TODO: Support revision. + // TODO: Support proto -- matchers usually specify which proto(s) they work with. + optVal = optVal[match.MatchSize:] + } + + // TODO: Check that optVal is exhausted. + return matchers, nil +} + +// parseTarget parses a target from optVal. optVal should contain only the +// target. +func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { if len(optVal) < linux.SizeOfXTEntryTarget { log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal)) - return nil, 0, syserr.ErrInvalidArgument + return nil, syserr.ErrInvalidArgument } var target linux.XTEntryTarget buf := optVal[:linux.SizeOfXTEntryTarget] @@ -414,9 +532,9 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { switch target.Name.String() { case "": // Standard target. - if len(optVal) < linux.SizeOfXTStandardTarget { - log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal)) - return nil, 0, syserr.ErrInvalidArgument + if len(optVal) != linux.SizeOfXTStandardTarget { + log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal)) + return nil, syserr.ErrInvalidArgument } var standardTarget linux.XTStandardTarget buf = optVal[:linux.SizeOfXTStandardTarget] @@ -424,22 +542,22 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { verdict, err := translateToStandardVerdict(standardTarget.Verdict) if err != nil { - return nil, 0, err + return nil, err } switch verdict { case iptables.Accept: - return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil + return iptables.UnconditionalAcceptTarget{}, nil case iptables.Drop: - return iptables.UnconditionalDropTarget{}, linux.SizeOfXTStandardTarget, nil + return iptables.UnconditionalDropTarget{}, nil default: panic(fmt.Sprintf("Unknown verdict: %v", verdict)) } case errorTargetName: // Error target. - if len(optVal) < linux.SizeOfXTErrorTarget { + if len(optVal) != linux.SizeOfXTErrorTarget { log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal)) - return nil, 0, syserr.ErrInvalidArgument + return nil, syserr.ErrInvalidArgument } var errorTarget linux.XTErrorTarget buf = optVal[:linux.SizeOfXTErrorTarget] @@ -454,16 +572,16 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { // rules have an error with the name of the chain. switch errorTarget.Name.String() { case errorTargetName: - return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil + return iptables.ErrorTarget{}, nil default: log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String()) - return nil, 0, syserr.ErrInvalidArgument + return nil, syserr.ErrInvalidArgument } } // Unknown target. log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) - return nil, 0, syserr.ErrInvalidArgument + return nil, syserr.ErrInvalidArgument } func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) { diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD index 297eaccaf..ff4e3c932 100644 --- a/pkg/tcpip/iptables/BUILD +++ b/pkg/tcpip/iptables/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "iptables.go", "targets.go", + "tcp_matcher.go", "types.go", + "udp_matcher.go", ], importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables", visibility = ["//visibility:public"], diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go index fc06b5b87..accedba1e 100644 --- a/pkg/tcpip/iptables/iptables.go +++ b/pkg/tcpip/iptables/iptables.go @@ -138,6 +138,8 @@ func EmptyFilterTable() Table { // Check runs pkt through the rules for hook. It returns true when the packet // should continue traversing the network stack and false when it should be // dropped. +// +// Precondition: pkt.NetworkHeader is set. func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool { // TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because // we're missing features. Jumps, the call stack, etc. aren't checked @@ -163,6 +165,7 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool { return true } +// Precondition: pkt.NetworkHeader is set. func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go new file mode 100644 index 000000000..6acbd6eb9 --- /dev/null +++ b/pkg/tcpip/iptables/tcp_matcher.go @@ -0,0 +1,122 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +type TCPMatcher struct { + data TCPMatcherData + + // tablename string + // unsigned int matchsize; + // unsigned int usersize; + // #ifdef CONFIG_COMPAT + // unsigned int compatsize; + // #endif + // unsigned int hooks; + // unsigned short proto; + // unsigned short family; +} + +// TODO: Delete? +// MatchCheckEntryParams + +type TCPMatcherData struct { + // Filter IPHeaderFilter + + SourcePortStart uint16 + SourcePortEnd uint16 + DestinationPortStart uint16 + DestinationPortEnd uint16 + Option uint8 + FlagMask uint8 + FlagCompare uint8 + InverseFlags uint8 +} + +func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) { + // TODO: We currently only support source port and destination port. + log.Infof("Adding rule with TCPMatcherData: %+v", data) + + if data.Option != 0 || + data.FlagMask != 0 || + data.FlagCompare != 0 || + data.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported TCP matcher flags set") + } + + if filter.Protocol != header.TCPProtocolNumber { + log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber) + } + + return &TCPMatcher{data: data}, nil +} + +// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments). +func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + // TODO: Do we check proto here or elsewhere? I think elsewhere (check + // codesearch). + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.") + return false, true + } + return false, false + } + + // Now we need the transport header. However, this may not have been set + // yet. + // TODO + var tcpHeader header.TCP + if pkt.TransportHeader != nil { + tcpHeader = header.TCP(pkt.TransportHeader) + } else { + // The TCP header hasn't been parsed yet. We have to do it here. + if len(pkt.Data.First()) < header.TCPMinimumSize { + // There's no valid TCP header here, so we hotdrop the + // packet. + // TODO: Stats. + log.Warningf("Dropping TCP packet: size to small.") + return false, true + } + tcpHeader = header.TCP(pkt.Data.First()) + } + + // Check whether the source and destination ports are within the + // matching range. + sourcePort := tcpHeader.SourcePort() + destinationPort := tcpHeader.DestinationPort() + if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort { + return false, false + } + if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go index a0bfc8b41..54e66f09a 100644 --- a/pkg/tcpip/iptables/types.go +++ b/pkg/tcpip/iptables/types.go @@ -169,12 +169,29 @@ type IPHeaderFilter struct { Protocol tcpip.TransportProtocolNumber } +// TODO: Should these be able to marshal/unmarshal themselves? +// TODO: Something has to map the name to the matcher. // A Matcher is the interface for matching packets. type Matcher interface { // Match returns whether the packet matches and whether the packet // should be "hotdropped", i.e. dropped immediately. This is usually // used for suspicious packets. + // + // Precondition: packet.NetworkHeader is set. Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool) + + // TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry? + // CheckEntry(params MatchCheckEntryParams) bool +} + +// TODO: Unused? +type MatchCheckEntryParams struct { + Table string // TODO: Tables should be an enum... + Filter IPHeaderFilter + Info interface{} // TODO: Type unsafe. + // HookMask uint8 + // Family uint8 + // NFTCompat bool } // A Target is the interface for taking an action for a packet. diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go new file mode 100644 index 000000000..ce4368a3d --- /dev/null +++ b/pkg/tcpip/iptables/udp_matcher.go @@ -0,0 +1,127 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "fmt" + "runtime/debug" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +type UDPMatcher struct { + data UDPMatcherData + + // tablename string + // unsigned int matchsize; + // unsigned int usersize; + // #ifdef CONFIG_COMPAT + // unsigned int compatsize; + // #endif + // unsigned int hooks; + // unsigned short proto; + // unsigned short family; +} + +// TODO: Delete? +// MatchCheckEntryParams + +type UDPMatcherData struct { + // Filter IPHeaderFilter + + SourcePortStart uint16 + SourcePortEnd uint16 + DestinationPortStart uint16 + DestinationPortEnd uint16 + InverseFlags uint8 +} + +func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) { + // TODO: We currently only support source port and destination port. + log.Infof("Adding rule with UDPMatcherData: %+v", data) + + if data.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported UDP matcher flags set") + } + + if filter.Protocol != header.UDPProtocolNumber { + log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) + } + + return &UDPMatcher{data: data}, nil +} + +// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments). +func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { + log.Infof("UDPMatcher called from: %s", string(debug.Stack())) + netHeader := header.IPv4(pkt.NetworkHeader) + + // TODO: Do we check proto here or elsewhere? I think elsewhere (check + // codesearch). + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + log.Infof("UDPMatcher: wrong protocol number") + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + log.Infof("UDPMatcher: it's a fragment") + if frag == 1 { + return false, true + } + log.Warningf("Dropping UDP packet: malicious fragmented packet.") + return false, false + } + + // Now we need the transport header. However, this may not have been set + // yet. + // TODO + var udpHeader header.UDP + if pkt.TransportHeader != nil { + log.Infof("UDPMatcher: transport header is not nil") + udpHeader = header.UDP(pkt.TransportHeader) + } else { + log.Infof("UDPMatcher: transport header is nil") + log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil) + // The UDP header hasn't been parsed yet. We have to do it here. + if len(pkt.Data.First()) < header.UDPMinimumSize { + // There's no valid UDP header here, so we hotdrop the + // packet. + // TODO: Stats. + log.Warningf("Dropping UDP packet: size to small.") + return false, true + } + udpHeader = header.UDP(pkt.Data.First()) + } + + // Check whether the source and destination ports are within the + // matching range. + sourcePort := udpHeader.SourcePort() + destinationPort := udpHeader.DestinationPort() + log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)", + udpHeader.SourcePort(), udpHeader.DestinationPort(), + tm.data.SourcePortStart, tm.data.SourcePortEnd, + tm.data.DestinationPortStart, tm.data.DestinationPortEnd) + if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort { + return false, false + } + if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 85512f9b2..6597e6781 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -353,6 +353,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) { } pkt.NetworkHeader = headerView[:h.HeaderLength()] + hlen := int(h.HeaderLength()) + tlen := int(h.TotalLength()) + pkt.Data.TrimFront(hlen) + pkt.Data.CapLength(tlen - hlen) + // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. ipt := e.stack.IPTables() @@ -361,11 +366,6 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) { return } - hlen := int(h.HeaderLength()) - tlen := int(h.TotalLength()) - pkt.Data.TrimFront(hlen) - pkt.Data.CapLength(tlen - hlen) - more := (h.Flags() & header.IPv4FlagMoreFragments) != 0 if more || h.FragmentOffset() != 0 { if pkt.Data.Size() == 0 { -- cgit v1.2.3 From 2661101ad470548cb15dce0afc694296668d780a Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 21 Jan 2020 14:51:28 -0800 Subject: Removed TCP work (saved in ipt-tcp-match). --- pkg/abi/linux/netfilter.go | 52 ------------- pkg/sentry/socket/netfilter/netfilter.go | 26 ------- pkg/tcpip/iptables/tcp_matcher.go | 122 ------------------------------- 3 files changed, 200 deletions(-) delete mode 100644 pkg/tcpip/iptables/tcp_matcher.go (limited to 'pkg') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index fb4588272..f0e544f9c 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -341,58 +341,6 @@ func goString(cstring []byte) string { return string(cstring) } -// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp -// in include/uapi/linux/netfilter/xt_tcpudp.h. -type XTTCP struct { - // SourcePortStart specifies the inclusive start of the range of source - // ports to which the matcher applies. - SourcePortStart uint16 - - // SourcePortEnd specifies the inclusive end of the range of source ports - // to which the matcher applies. - SourcePortEnd uint16 - - // DestinationPortStart specifies the start of the destination port - // range to which the matcher applies. - DestinationPortStart uint16 - - // DestinationPortEnd specifies the start of the destination port - // range to which the matcher applies. - DestinationPortEnd uint16 - - // Option specifies that a particular TCP option must be set. - Option uint8 - - // FlagMask masks the FlagCompare byte when comparing to the TCP flag - // fields. - FlagMask uint8 - - // FlagCompare is binary and-ed with the TCP flag fields. - FlagCompare uint8 - - // InverseFlags flips the meaning of certain fields. See the - // TX_TCP_INV_* flags. - InverseFlags uint8 -} - -// SizeOfXTTCP is the size of an XTTCP. -const SizeOfXTTCP = 12 - -// Flags in XTTCP.InverseFlags. Corresponding constants are in -// include/uapi/linux/netfilter/xt_tcpudp.h. -const ( - // Invert the meaning of SourcePortStart/End. - XT_TCP_INV_SRCPT = 0x01 - // Invert the meaning of DestinationPortStart/End. - XT_TCP_INV_DSTPT = 0x02 - // Invert the meaning of FlagCompare. - XT_TCP_INV_FLAGS = 0x04 - // Invert the meaning of Option. - XT_TCP_INV_OPTION = 0x08 - // Enable all flags. - XT_TCP_INV_MASK = 0x0F -) - // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp // in include/uapi/linux/netfilter/xt_tcpudp.h. type XTUDP struct { diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 45296b339..f8ed1acbc 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -131,7 +131,6 @@ func FillDefaultIPTables(stack *stack.Stack) { stack.SetIPTables(ipt) } -// TODO: Return proto. // convertNetstackToBinary converts the iptables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a bit, reading some offsets, @@ -456,31 +455,6 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma var matcher iptables.Matcher var err error switch match.Name.String() { - case "tcp": - if len(buf) < linux.SizeOfXTTCP { - log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal)) - return nil, syserr.ErrInvalidArgument - } - var matchData linux.XTTCP - // For alignment reasons, the match's total size may exceed what's - // strictly necessary to hold matchData. - binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) - log.Infof("parseMatchers: parsed XTTCP: %+v", matchData) - matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{ - SourcePortStart: matchData.SourcePortStart, - SourcePortEnd: matchData.SourcePortEnd, - DestinationPortStart: matchData.DestinationPortStart, - DestinationPortEnd: matchData.DestinationPortEnd, - Option: matchData.Option, - FlagMask: matchData.FlagMask, - FlagCompare: matchData.FlagCompare, - InverseFlags: matchData.InverseFlags, - }) - if err != nil { - log.Warningf("netfilter: failed to create TCP matcher: %v", err) - return nil, syserr.ErrInvalidArgument - } - case "udp": if len(buf) < linux.SizeOfXTUDP { log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal)) diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go deleted file mode 100644 index 6acbd6eb9..000000000 --- a/pkg/tcpip/iptables/tcp_matcher.go +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package iptables - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/header" -) - -type TCPMatcher struct { - data TCPMatcherData - - // tablename string - // unsigned int matchsize; - // unsigned int usersize; - // #ifdef CONFIG_COMPAT - // unsigned int compatsize; - // #endif - // unsigned int hooks; - // unsigned short proto; - // unsigned short family; -} - -// TODO: Delete? -// MatchCheckEntryParams - -type TCPMatcherData struct { - // Filter IPHeaderFilter - - SourcePortStart uint16 - SourcePortEnd uint16 - DestinationPortStart uint16 - DestinationPortEnd uint16 - Option uint8 - FlagMask uint8 - FlagCompare uint8 - InverseFlags uint8 -} - -func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) { - // TODO: We currently only support source port and destination port. - log.Infof("Adding rule with TCPMatcherData: %+v", data) - - if data.Option != 0 || - data.FlagMask != 0 || - data.FlagCompare != 0 || - data.InverseFlags != 0 { - return nil, fmt.Errorf("unsupported TCP matcher flags set") - } - - if filter.Protocol != header.TCPProtocolNumber { - log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber) - } - - return &TCPMatcher{data: data}, nil -} - -// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments). -func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader) - - // TODO: Do we check proto here or elsewhere? I think elsewhere (check - // codesearch). - if netHeader.TransportProtocol() != header.TCPProtocolNumber { - return false, false - } - - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.") - return false, true - } - return false, false - } - - // Now we need the transport header. However, this may not have been set - // yet. - // TODO - var tcpHeader header.TCP - if pkt.TransportHeader != nil { - tcpHeader = header.TCP(pkt.TransportHeader) - } else { - // The TCP header hasn't been parsed yet. We have to do it here. - if len(pkt.Data.First()) < header.TCPMinimumSize { - // There's no valid TCP header here, so we hotdrop the - // packet. - // TODO: Stats. - log.Warningf("Dropping TCP packet: size to small.") - return false, true - } - tcpHeader = header.TCP(pkt.Data.First()) - } - - // Check whether the source and destination ports are within the - // matching range. - sourcePort := tcpHeader.SourcePort() - destinationPort := tcpHeader.DestinationPort() - if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort { - return false, false - } - if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort { - return false, false - } - - return true, false -} -- cgit v1.2.3 From 421b6ff18154f80ea8cbbfd8340042ab458bf813 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 21 Jan 2020 14:54:39 -0800 Subject: Passes all filter table UDP tests. --- pkg/tcpip/iptables/BUILD | 1 - 1 file changed, 1 deletion(-) (limited to 'pkg') diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD index ff4e3c932..e41c645ed 100644 --- a/pkg/tcpip/iptables/BUILD +++ b/pkg/tcpip/iptables/BUILD @@ -7,7 +7,6 @@ go_library( srcs = [ "iptables.go", "targets.go", - "tcp_matcher.go", "types.go", "udp_matcher.go", ], -- cgit v1.2.3 From 538053538dfb378aa8bc512d484ea305177e617b Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 21 Jan 2020 16:51:17 -0800 Subject: Adding serialization. --- pkg/sentry/socket/netfilter/netfilter.go | 29 ++++++++++++++++++++++++++++- pkg/tcpip/iptables/udp_matcher.go | 14 +++++++------- 2 files changed, 35 insertions(+), 8 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index f8ed1acbc..3caabca9a 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -196,7 +196,9 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern } func marshalMatcher(matcher iptables.Matcher) []byte { - switch matcher.(type) { + switch m := matcher.(type) { + case *iptables.UDPMatcher: + return marshalUDPMatcher(m) default: // TODO(gvisor.dev/issue/170): We don't support any matchers // yet, so any call to marshalMatcher will panic. @@ -204,6 +206,31 @@ func marshalMatcher(matcher iptables.Matcher) []byte { } } +func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { + type udpMatch struct { + linux.XTEntryMatch + linux.XTUDP + } + linuxMatcher := udpMatch{ + XTEntryMatch: linux.XTEntryMatch{ + MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP, + // Name: "udp", + }, + XTUDP: linux.XTUDP{ + SourcePortStart: matcher.Data.SourcePortStart, + SourcePortEnd: matcher.Data.SourcePortEnd, + DestinationPortStart: matcher.Data.DestinationPortStart, + DestinationPortEnd: matcher.Data.DestinationPortEnd, + InverseFlags: matcher.Data.InverseFlags, + }, + } + copy(linuxMatcher.Name[:], "udp") + + var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte + binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher) + return buf[:] +} + func marshalTarget(target iptables.Target) []byte { switch target.(type) { case iptables.UnconditionalAcceptTarget: diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go index ce4368a3d..fca457199 100644 --- a/pkg/tcpip/iptables/udp_matcher.go +++ b/pkg/tcpip/iptables/udp_matcher.go @@ -24,7 +24,7 @@ import ( ) type UDPMatcher struct { - data UDPMatcherData + Data UDPMatcherData // tablename string // unsigned int matchsize; @@ -62,11 +62,11 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) } - return &UDPMatcher{data: data}, nil + return &UDPMatcher{Data: data}, nil } // TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments). -func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { +func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { log.Infof("UDPMatcher called from: %s", string(debug.Stack())) netHeader := header.IPv4(pkt.NetworkHeader) @@ -114,12 +114,12 @@ func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str destinationPort := udpHeader.DestinationPort() log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)", udpHeader.SourcePort(), udpHeader.DestinationPort(), - tm.data.SourcePortStart, tm.data.SourcePortEnd, - tm.data.DestinationPortStart, tm.data.DestinationPortEnd) - if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort { + um.Data.SourcePortStart, um.Data.SourcePortEnd, + um.Data.DestinationPortStart, um.Data.DestinationPortEnd) + if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort { return false, false } - if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort { + if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort { return false, false } -- cgit v1.2.3 From b7853f688b4bcd3465c0c3087fcbd8d53bdf26ae Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 22 Jan 2020 14:46:15 -0800 Subject: Error marshalling the matcher. The iptables binary is looking for libxt_.so when it should be looking for libxt_udp.so, so it's having an issue reading the data in xt_match_entry. I think it may be an alignment issue. Trying to fix this is leading to me fighting with the metadata struct, so I'm gonna go kill that. --- pkg/abi/linux/netfilter.go | 5 +++++ pkg/sentry/socket/netfilter/netfilter.go | 35 ++++++++++++++++++++------------ pkg/tcpip/iptables/udp_matcher.go | 2 +- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'pkg') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index f0e544f9c..effed7976 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -198,6 +198,11 @@ type XTEntryMatch struct { // SizeOfXTEntryMatch is the size of an XTEntryMatch. const SizeOfXTEntryMatch = 32 +type KernelXTEntryMatch struct { + XTEntryMatch + Data []byte +} + // XTEntryTarget holds a target for a rule. For example, it can specify that // packets matching the rule should DROP, ACCEPT, or use an extension target. // iptables-extension(8) has a list of possible targets. diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 3caabca9a..b49fe5b3e 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -207,26 +207,34 @@ func marshalMatcher(matcher iptables.Matcher) []byte { } func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { - type udpMatch struct { - linux.XTEntryMatch - linux.XTUDP - } - linuxMatcher := udpMatch{ + linuxMatcher := linux.KernelXTEntryMatch{ XTEntryMatch: linux.XTEntryMatch{ MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP, // Name: "udp", }, - XTUDP: linux.XTUDP{ - SourcePortStart: matcher.Data.SourcePortStart, - SourcePortEnd: matcher.Data.SourcePortEnd, - DestinationPortStart: matcher.Data.DestinationPortStart, - DestinationPortEnd: matcher.Data.DestinationPortEnd, - InverseFlags: matcher.Data.InverseFlags, - }, + Data: make([]byte, linux.SizeOfXTUDP+22), } + // copy(linuxMatcher.Name[:], "udp") copy(linuxMatcher.Name[:], "udp") - var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte + // TODO: Must be aligned. + xtudp := linux.XTUDP{ + SourcePortStart: matcher.Data.SourcePortStart, + SourcePortEnd: matcher.Data.SourcePortEnd, + DestinationPortStart: matcher.Data.DestinationPortStart, + DestinationPortEnd: matcher.Data.DestinationPortEnd, + InverseFlags: matcher.Data.InverseFlags, + } + binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp) + + if binary.Size(linuxMatcher)%64 != 0 { + panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher))) + } + + var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte + if len(buf)%64 != 0 { + panic(fmt.Sprintf("len is actually: %d", len(buf))) + } binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher) return buf[:] } @@ -245,6 +253,7 @@ func marshalTarget(target iptables.Target) []byte { } func marshalStandardTarget(verdict iptables.Verdict) []byte { + // TODO: Must be aligned. // The target's name will be the empty string. target := linux.XTStandardTarget{ Target: linux.XTEntryTarget{ diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go index fca457199..65ae7f9e0 100644 --- a/pkg/tcpip/iptables/udp_matcher.go +++ b/pkg/tcpip/iptables/udp_matcher.go @@ -59,7 +59,7 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) } if filter.Protocol != header.UDPProtocolNumber { - log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) + return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) } return &UDPMatcher{Data: data}, nil -- cgit v1.2.3 From 2946fe81627afa223853769ed736e2a56e0144b7 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 24 Jan 2020 17:12:03 -0800 Subject: We can now actually write out the udp matcher. --- pkg/sentry/socket/netfilter/netfilter.go | 78 ++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 20 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 3ca22932d..6c88a50a6 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -36,7 +36,7 @@ const errorTargetName = "ERROR" // metadata is opaque to netstack. It holds data that we need to translate // between Linux's and netstack's iptables representations. -// TODO(gvisor.dev/issue/170): This might be removable. +// TODO(gvisor.dev/issue/170): Use metadata to check correctness. type metadata struct { HookEntry [linux.NF_INET_NUMHOOKS]uint32 Underflow [linux.NF_INET_NUMHOOKS]uint32 @@ -44,6 +44,14 @@ type metadata struct { Size uint32 } +const enableDebugLog = true + +func nflog(format string, args ...interface{}) { + if enableDebugLog { + log.Infof("netfilter: "+format, args...) + } +} + // GetInfo returns information about iptables. func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. @@ -72,6 +80,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT info.NumEntries = metadata.NumEntries info.Size = metadata.Size + nflog("GetInfo returning info: %+v", info) + return info, nil } @@ -80,21 +90,26 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen // Read in the struct and table name. var userEntries linux.IPTGetEntries if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } // Find the appropriate table. table, err := findTable(stack, userEntries.Name) if err != nil { + log.Warningf("netfilter: couldn't find table %q", userEntries.Name) return linux.KernelIPTGetEntries{}, err } // Convert netstack's iptables rules to something that the iptables // tool can understand. - entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table) + entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table) if err != nil { return linux.KernelIPTGetEntries{}, err } + if meta != table.Metadata().(metadata) { + panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta)) + } if binary.Size(entries) > uintptr(outLen) { log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument @@ -148,15 +163,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern copy(entries.Name[:], tablename) for ruleIdx, rule := range table.Rules { + nflog("Current offset: %d", entries.Size) + // Is this a chain entry point? for hook, hookRuleIdx := range table.BuiltinChains { if hookRuleIdx == ruleIdx { + nflog("Found hook %d at offset %d", hook, entries.Size) meta.HookEntry[hook] = entries.Size } } // Is this a chain underflow point? for underflow, underflowRuleIdx := range table.Underflows { if underflowRuleIdx == ruleIdx { + nflog("Found underflow %d at offset %d", underflow, entries.Size) meta.Underflow[underflow] = entries.Size } } @@ -176,6 +195,10 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern // Serialize the matcher and add it to the // entry. serialized := marshalMatcher(matcher) + nflog("matcher serialized as: %v", serialized) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) + } entry.Elems = append(entry.Elems, serialized...) entry.NextOffset += uint16(len(serialized)) entry.TargetOffset += uint16(len(serialized)) @@ -183,18 +206,25 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern // Serialize and append the target. serialized := marshalTarget(rule.Target) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) + } entry.Elems = append(entry.Elems, serialized...) entry.NextOffset += uint16(len(serialized)) + nflog("Adding entry: %+v", entry) + entries.Size += uint32(entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) meta.NumEntries++ } + nflog("Finished with an marshalled size of %d", meta.Size) meta.Size = entries.Size return entries, meta, nil } +// TODO: SOMEHOW THIS IS NOT GETTING APPENDED! func marshalMatcher(matcher iptables.Matcher) []byte { switch m := matcher.(type) { case *iptables.UDPMatcher: @@ -207,17 +237,17 @@ func marshalMatcher(matcher iptables.Matcher) []byte { } func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { + nflog("Marshalling UDP matcher: %+v", matcher) + linuxMatcher := linux.KernelXTEntryMatch{ XTEntryMatch: linux.XTEntryMatch{ - MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP, + MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6, // Name: "udp", }, - Data: make([]byte, linux.SizeOfXTUDP+22), + Data: make([]byte, 0, linux.SizeOfXTUDP), } - // copy(linuxMatcher.Name[:], "udp") copy(linuxMatcher.Name[:], "udp") - // TODO: Must be aligned. xtudp := linux.XTUDP{ SourcePortStart: matcher.Data.SourcePortStart, SourcePortEnd: matcher.Data.SourcePortEnd, @@ -225,17 +255,17 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { DestinationPortEnd: matcher.Data.DestinationPortEnd, InverseFlags: matcher.Data.InverseFlags, } - binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp) - - if binary.Size(linuxMatcher)%64 != 0 { - panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher))) - } - - var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte - if len(buf)%64 != 0 { - panic(fmt.Sprintf("len is actually: %d", len(buf))) - } - binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher) + nflog("marshalUDPMatcher: xtudp: %+v", xtudp) + linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp) + nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher) + + // We have to pad this struct size to a multiple of 8 bytes, so we make + // this a little longer than it needs to be. + buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6) + buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher) + buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...) + nflog("Marshalled into matcher of size %d", len(buf)) + nflog("marshalUDPMatcher: buf is: %v", buf) return buf[:] } @@ -253,6 +283,8 @@ func marshalTarget(target iptables.Target) []byte { } func marshalStandardTarget(verdict iptables.Verdict) []byte { + nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget) + // TODO: Must be aligned. // The target's name will be the empty string. target := linux.XTStandardTarget{ @@ -321,7 +353,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) { // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { - printReplace(optVal) + // printReplace(optVal) // Get the basic rules data (struct ipt_replace). if len(optVal) < linux.SizeOfIPTReplace { @@ -343,10 +375,14 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { return syserr.ErrInvalidArgument } + nflog("Setting entries in table %q", replace.Name.String()) + // Convert input into a list of rules and their offsets. var offset uint32 var offsets []uint32 for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { + nflog("Processing entry at offset %d", offset) + // Get the struct ipt_entry. if len(optVal) < linux.SizeOfIPTEntry { log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal)) @@ -464,9 +500,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) { + nflog("Parsing matchers of size %d", len(optVal)) var matchers []iptables.Matcher for len(optVal) > 0 { - log.Infof("parseMatchers: optVal has len %d", len(optVal)) + nflog("parseMatchers: optVal has len %d", len(optVal)) // Get the XTEntryMatch. if len(optVal) < linux.SizeOfXTEntryMatch { log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal)) @@ -475,7 +512,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma var match linux.XTEntryMatch buf := optVal[:linux.SizeOfXTEntryMatch] binary.Unmarshal(buf, usermem.ByteOrder, &match) - log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match) + nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match) // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { @@ -532,6 +569,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma // parseTarget parses a target from optVal. optVal should contain only the // target. func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { + nflog("Parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal)) return nil, syserr.ErrInvalidArgument -- cgit v1.2.3 From 29316e66adfc49c158425554761e34c12338f1d9 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 27 Jan 2020 12:27:04 -0800 Subject: Cleanup for GH review. --- pkg/abi/linux/netfilter.go | 14 +-- pkg/sentry/socket/netfilter/netfilter.go | 144 ++++++++++++------------------- pkg/tcpip/iptables/types.go | 15 ---- pkg/tcpip/iptables/udp_matcher.go | 62 ++++++------- test/iptables/filter_input.go | 6 +- 5 files changed, 88 insertions(+), 153 deletions(-) (limited to 'pkg') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index effed7976..8e40bcc62 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -198,6 +198,8 @@ type XTEntryMatch struct { // SizeOfXTEntryMatch is the size of an XTEntryMatch. const SizeOfXTEntryMatch = 32 +// KernelXTEntryMatch is identical to XTEntryMatch, but contains +// variable-length Data field. type KernelXTEntryMatch struct { XTEntryMatch Data []byte @@ -349,19 +351,19 @@ func goString(cstring []byte) string { // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp // in include/uapi/linux/netfilter/xt_tcpudp.h. type XTUDP struct { - // SourcePortStart specifies the inclusive start of the range of source - // ports to which the matcher applies. + // SourcePortStart is the inclusive start of the range of source ports + // to which the matcher applies. SourcePortStart uint16 - // SourcePortEnd specifies the inclusive end of the range of source ports - // to which the matcher applies. + // SourcePortEnd is the inclusive end of the range of source ports to + // which the matcher applies. SourcePortEnd uint16 - // DestinationPortStart specifies the start of the destination port + // DestinationPortStart is the inclusive start of the destination port // range to which the matcher applies. DestinationPortStart uint16 - // DestinationPortEnd specifies the start of the destination port + // DestinationPortEnd is the inclusive end of the destination port // range to which the matcher applies. DestinationPortEnd uint16 diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 6c88a50a6..b8848f08a 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -34,9 +34,16 @@ import ( // shouldn't be reached - an error has occurred if we fall through to one. const errorTargetName = "ERROR" -// metadata is opaque to netstack. It holds data that we need to translate -// between Linux's and netstack's iptables representations. -// TODO(gvisor.dev/issue/170): Use metadata to check correctness. +const ( + matcherNameUDP = "udp" +) + +// Metadata is used to verify that we are correctly serializing and +// deserializing iptables into structs consumable by the iptables tool. We save +// a metadata struct when the tables are written, and when they are read out we +// verify that certain fields are the same. +// +// metadata is opaque to netstack. type metadata struct { HookEntry [linux.NF_INET_NUMHOOKS]uint32 Underflow [linux.NF_INET_NUMHOOKS]uint32 @@ -44,10 +51,12 @@ type metadata struct { Size uint32 } -const enableDebugLog = true +const enableDebug = false +// nflog logs messages related to the writing and reading of iptables, but only +// when enableDebug is true. func nflog(format string, args ...interface{}) { - if enableDebugLog { + if enableDebug { log.Infof("netfilter: "+format, args...) } } @@ -80,7 +89,7 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT info.NumEntries = metadata.NumEntries info.Size = metadata.Size - nflog("GetInfo returning info: %+v", info) + nflog("returning info: %+v", info) return info, nil } @@ -163,19 +172,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern copy(entries.Name[:], tablename) for ruleIdx, rule := range table.Rules { - nflog("Current offset: %d", entries.Size) + nflog("convert to binary: current offset: %d", entries.Size) // Is this a chain entry point? for hook, hookRuleIdx := range table.BuiltinChains { if hookRuleIdx == ruleIdx { - nflog("Found hook %d at offset %d", hook, entries.Size) + nflog("convert to binary: found hook %d at offset %d", hook, entries.Size) meta.HookEntry[hook] = entries.Size } } // Is this a chain underflow point? for underflow, underflowRuleIdx := range table.Underflows { if underflowRuleIdx == ruleIdx { - nflog("Found underflow %d at offset %d", underflow, entries.Size) + nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size) meta.Underflow[underflow] = entries.Size } } @@ -195,7 +204,7 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern // Serialize the matcher and add it to the // entry. serialized := marshalMatcher(matcher) - nflog("matcher serialized as: %v", serialized) + nflog("convert to binary: matcher serialized as: %v", serialized) if len(serialized)%8 != 0 { panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) } @@ -212,14 +221,14 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern entry.Elems = append(entry.Elems, serialized...) entry.NextOffset += uint16(len(serialized)) - nflog("Adding entry: %+v", entry) + nflog("convert to binary: adding entry: %+v", entry) entries.Size += uint32(entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) meta.NumEntries++ } - nflog("Finished with an marshalled size of %d", meta.Size) + nflog("convert to binary: finished with an marshalled size of %d", meta.Size) meta.Size = entries.Size return entries, meta, nil } @@ -237,16 +246,18 @@ func marshalMatcher(matcher iptables.Matcher) []byte { } func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { - nflog("Marshalling UDP matcher: %+v", matcher) + nflog("convert to binary: marshalling UDP matcher: %+v", matcher) + + // We have to pad this struct size to a multiple of 8 bytes. + const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6 linuxMatcher := linux.KernelXTEntryMatch{ XTEntryMatch: linux.XTEntryMatch{ - MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6, - // Name: "udp", + MatchSize: size, }, Data: make([]byte, 0, linux.SizeOfXTUDP), } - copy(linuxMatcher.Name[:], "udp") + copy(linuxMatcher.Name[:], matcherNameUDP) xtudp := linux.XTUDP{ SourcePortStart: matcher.Data.SourcePortStart, @@ -255,17 +266,12 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { DestinationPortEnd: matcher.Data.DestinationPortEnd, InverseFlags: matcher.Data.InverseFlags, } - nflog("marshalUDPMatcher: xtudp: %+v", xtudp) linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp) - nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher) - // We have to pad this struct size to a multiple of 8 bytes, so we make - // this a little longer than it needs to be. - buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6) + buf := make([]byte, 0, size) buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher) buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...) - nflog("Marshalled into matcher of size %d", len(buf)) - nflog("marshalUDPMatcher: buf is: %v", buf) + nflog("convert to binary: marshalled UDP matcher into %v", buf) return buf[:] } @@ -283,9 +289,8 @@ func marshalTarget(target iptables.Target) []byte { } func marshalStandardTarget(verdict iptables.Verdict) []byte { - nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget) + nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget) - // TODO: Must be aligned. // The target's name will be the empty string. target := linux.XTStandardTarget{ Target: linux.XTEntryTarget{ @@ -353,8 +358,6 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) { // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { - // printReplace(optVal) - // Get the basic rules data (struct ipt_replace). if len(optVal) < linux.SizeOfIPTReplace { log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal)) @@ -375,13 +378,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { return syserr.ErrInvalidArgument } - nflog("Setting entries in table %q", replace.Name.String()) + nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. var offset uint32 var offsets []uint32 for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { - nflog("Processing entry at offset %d", offset) + nflog("set entries: processing entry at offset %d", offset) // Get the struct ipt_entry. if len(optVal) < linux.SizeOfIPTEntry { @@ -406,11 +409,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { return err } - // TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables. + // TODO(gvisor.dev/issue/170): Matchers and targets can specify + // that they only work for certiain protocols, hooks, tables. // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry if len(optVal) < int(matchersSize) { log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + return syserr.ErrInvalidArgument } matchers, err := parseMatchers(filter, optVal[:matchersSize]) if err != nil { @@ -423,6 +428,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { targetSize := entry.NextOffset - entry.TargetOffset if len(optVal) < int(targetSize) { log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return syserr.ErrInvalidArgument } target, err := parseTarget(optVal[:targetSize]) if err != nil { @@ -500,10 +506,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) { - nflog("Parsing matchers of size %d", len(optVal)) + nflog("set entries: parsing matchers of size %d", len(optVal)) var matchers []iptables.Matcher for len(optVal) > 0 { - nflog("parseMatchers: optVal has len %d", len(optVal)) + nflog("set entries: optVal has len %d", len(optVal)) + // Get the XTEntryMatch. if len(optVal) < linux.SizeOfXTEntryMatch { log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal)) @@ -512,7 +519,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma var match linux.XTEntryMatch buf := optVal[:linux.SizeOfXTEntryMatch] binary.Unmarshal(buf, usermem.ByteOrder, &match) - nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match) + nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match) // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { @@ -528,17 +535,17 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma var matcher iptables.Matcher var err error switch match.Name.String() { - case "udp": + case matcherNameUDP: if len(buf) < linux.SizeOfXTUDP { log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal)) return nil, syserr.ErrInvalidArgument } + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. var matchData linux.XTUDP - // For alignment reasons, the match's total size may exceed what's - // strictly necessary to hold matchData. binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) log.Infof("parseMatchers: parsed XTUDP: %+v", matchData) - matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{ + matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{ SourcePortStart: matchData.SourcePortStart, SourcePortEnd: matchData.SourcePortEnd, DestinationPortStart: matchData.DestinationPortStart, @@ -557,19 +564,22 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma matchers = append(matchers, matcher) - // TODO: Support revision. - // TODO: Support proto -- matchers usually specify which proto(s) they work with. + // TODO(gvisor.dev/issue/170): Check the revision field. optVal = optVal[match.MatchSize:] } - // TODO: Check that optVal is exhausted. + if len(optVal) != 0 { + log.Warningf("netfilter: optVal should be exhausted after parsing matchers") + return nil, syserr.ErrInvalidArgument + } + return matchers, nil } // parseTarget parses a target from optVal. optVal should contain only the // target. func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { - nflog("Parsing target of size %d", len(optVal)) + nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal)) return nil, syserr.ErrInvalidArgument @@ -598,7 +608,8 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { case iptables.Drop: return iptables.UnconditionalDropTarget{}, nil default: - panic(fmt.Sprintf("Unknown verdict: %v", verdict)) + log.Warningf("Unknown verdict: %v", verdict) + return nil, syserr.ErrInvalidArgument } case errorTargetName: @@ -673,52 +684,3 @@ func hookFromLinux(hook int) iptables.Hook { } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } - -// printReplace prints information about the struct ipt_replace in optVal. It -// is only for debugging. -func printReplace(optVal []byte) { - // Basic replace info. - var replace linux.IPTReplace - replaceBuf := optVal[:linux.SizeOfIPTReplace] - optVal = optVal[linux.SizeOfIPTReplace:] - binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace) - log.Infof("Replacing table %q: %+v", replace.Name.String(), replace) - - // Read in the list of entries at the end of replace. - var totalOffset uint16 - for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { - var entry linux.IPTEntry - entryBuf := optVal[:linux.SizeOfIPTEntry] - binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry) - log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry) - - totalOffset += entry.NextOffset - if entry.TargetOffset == linux.SizeOfIPTEntry { - log.Infof("Entry has no matches.") - } else { - log.Infof("Entry has matches.") - } - - var target linux.XTEntryTarget - targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget] - binary.Unmarshal(targetBuf, usermem.ByteOrder, &target) - log.Infof("Target named %q: %+v", target.Name.String(), target) - - switch target.Name.String() { - case "": - var standardTarget linux.XTStandardTarget - stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget] - binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget) - log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict) - case errorTargetName: - var errorTarget linux.XTErrorTarget - etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget] - binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget) - log.Infof("Error target with name %q.", errorTarget.Name.String()) - default: - log.Infof("Unknown target type.") - } - - optVal = optVal[entry.NextOffset:] - } -} diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go index d47447d40..ba5ed75b4 100644 --- a/pkg/tcpip/iptables/types.go +++ b/pkg/tcpip/iptables/types.go @@ -169,8 +169,6 @@ type IPHeaderFilter struct { Protocol tcpip.TransportProtocolNumber } -// TODO: Should these be able to marshal/unmarshal themselves? -// TODO: Something has to map the name to the matcher. // A Matcher is the interface for matching packets. type Matcher interface { // Match returns whether the packet matches and whether the packet @@ -179,19 +177,6 @@ type Matcher interface { // // Precondition: packet.NetworkHeader is set. Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool) - - // TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry? - // CheckEntry(params MatchCheckEntryParams) bool -} - -// TODO: Unused? -type MatchCheckEntryParams struct { - Table string // TODO: Tables should be an enum... - Filter IPHeaderFilter - Info interface{} // TODO: Type unsafe. - // HookMask uint8 - // Family uint8 - // NFTCompat bool } // A Target is the interface for taking an action for a packet. diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go index 65ae7f9e0..f59ca2027 100644 --- a/pkg/tcpip/iptables/udp_matcher.go +++ b/pkg/tcpip/iptables/udp_matcher.go @@ -16,33 +16,28 @@ package iptables import ( "fmt" - "runtime/debug" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) +// TODO(gvisor.dev/issue/170): The following per-matcher params should be +// supported: +// - Table name +// - Match size +// - User size +// - Hooks +// - Proto +// - Family + +// UDPMatcher matches UDP packets and their headers. It implements Matcher. type UDPMatcher struct { - Data UDPMatcherData - - // tablename string - // unsigned int matchsize; - // unsigned int usersize; - // #ifdef CONFIG_COMPAT - // unsigned int compatsize; - // #endif - // unsigned int hooks; - // unsigned short proto; - // unsigned short family; + Data UDPMatcherParams } -// TODO: Delete? -// MatchCheckEntryParams - -type UDPMatcherData struct { - // Filter IPHeaderFilter - +// UDPMatcherParams are the parameters used to create a UDPMatcher. +type UDPMatcherParams struct { SourcePortStart uint16 SourcePortEnd uint16 DestinationPortStart uint16 @@ -50,12 +45,12 @@ type UDPMatcherData struct { InverseFlags uint8 } -func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) { - // TODO: We currently only support source port and destination port. - log.Infof("Adding rule with UDPMatcherData: %+v", data) +// NewUDPMatcher returns a new instance of UDPMatcher. +func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) { + log.Infof("Adding rule with UDPMatcherParams: %+v", data) if data.InverseFlags != 0 { - return nil, fmt.Errorf("unsupported UDP matcher flags set") + return nil, fmt.Errorf("unsupported UDP matcher inverse flags set") } if filter.Protocol != header.UDPProtocolNumber { @@ -65,21 +60,18 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) return &UDPMatcher{Data: data}, nil } -// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments). +// Match implements Matcher.Match. func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { - log.Infof("UDPMatcher called from: %s", string(debug.Stack())) netHeader := header.IPv4(pkt.NetworkHeader) - // TODO: Do we check proto here or elsewhere? I think elsewhere (check - // codesearch). + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the iptables.Check codepath as matchers are added. if netHeader.TransportProtocol() != header.UDPProtocolNumber { - log.Infof("UDPMatcher: wrong protocol number") return false, false } // We dont't match fragments. if frag := netHeader.FragmentOffset(); frag != 0 { - log.Infof("UDPMatcher: it's a fragment") if frag == 1 { return false, true } @@ -89,20 +81,18 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str // Now we need the transport header. However, this may not have been set // yet. - // TODO + // TODO(gvisor.dev/issue/170): Parsing the transport header should + // ultimately be moved into the iptables.Check codepath as matchers are + // added. var udpHeader header.UDP if pkt.TransportHeader != nil { - log.Infof("UDPMatcher: transport header is not nil") udpHeader = header.UDP(pkt.TransportHeader) } else { - log.Infof("UDPMatcher: transport header is nil") - log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil) // The UDP header hasn't been parsed yet. We have to do it here. if len(pkt.Data.First()) < header.UDPMinimumSize { // There's no valid UDP header here, so we hotdrop the // packet. - // TODO: Stats. - log.Warningf("Dropping UDP packet: size to small.") + log.Warningf("Dropping UDP packet: size too small.") return false, true } udpHeader = header.UDP(pkt.Data.First()) @@ -112,10 +102,6 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str // matching range. sourcePort := udpHeader.SourcePort() destinationPort := udpHeader.DestinationPort() - log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)", - udpHeader.SourcePort(), udpHeader.DestinationPort(), - um.Data.SourcePortStart, um.Data.SourcePortEnd, - um.Data.DestinationPortStart, um.Data.DestinationPortEnd) if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort { return false, false } diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go index bc963d40e..e9f0978eb 100644 --- a/test/iptables/filter_input.go +++ b/test/iptables/filter_input.go @@ -264,9 +264,9 @@ func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error { if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil { return err } - // if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil { - // return err - // } + if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil { + return err + } return filterTable("-L") } -- cgit v1.2.3 From e889f95671a9b7b1c6f65cb6fbc1b865a896e827 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 27 Jan 2020 12:30:06 -0800 Subject: More cleanup. --- pkg/tcpip/iptables/udp_matcher.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg') diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go index f59ca2027..3bb076f9c 100644 --- a/pkg/tcpip/iptables/udp_matcher.go +++ b/pkg/tcpip/iptables/udp_matcher.go @@ -73,9 +73,9 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str // We dont't match fragments. if frag := netHeader.FragmentOffset(); frag != 0 { if frag == 1 { + log.Warningf("Dropping UDP packet: malicious fragmented packet.") return false, true } - log.Warningf("Dropping UDP packet: malicious fragmented packet.") return false, false } -- cgit v1.2.3 From d6a2e01d3e57e0837c7e5cfda3b56c4dcfbb4627 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 27 Jan 2020 16:40:46 -0800 Subject: Address GH comments. --- pkg/sentry/socket/netfilter/netfilter.go | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index b8848f08a..a06562743 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -43,7 +43,7 @@ const ( // a metadata struct when the tables are written, and when they are read out we // verify that certain fields are the same. // -// metadata is opaque to netstack. +// metadata is used by this serialization/deserializing code, not netstack. type metadata struct { HookEntry [linux.NF_INET_NUMHOOKS]uint32 Underflow [linux.NF_INET_NUMHOOKS]uint32 @@ -51,14 +51,10 @@ type metadata struct { Size uint32 } -const enableDebug = false - // nflog logs messages related to the writing and reading of iptables, but only // when enableDebug is true. func nflog(format string, args ...interface{}) { - if enableDebug { - log.Infof("netfilter: "+format, args...) - } + log.Infof("netfilter: "+format, args...) } // GetInfo returns information about iptables. @@ -233,14 +229,12 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern return entries, meta, nil } -// TODO: SOMEHOW THIS IS NOT GETTING APPENDED! func marshalMatcher(matcher iptables.Matcher) []byte { switch m := matcher.(type) { case *iptables.UDPMatcher: return marshalUDPMatcher(m) default: - // TODO(gvisor.dev/issue/170): We don't support any matchers - // yet, so any call to marshalMatcher will panic. + // TODO(gvisor.dev/issue/170): Support other matchers. panic(fmt.Errorf("unknown matcher of type %T", matcher)) } } @@ -249,11 +243,11 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { nflog("convert to binary: marshalling UDP matcher: %+v", matcher) // We have to pad this struct size to a multiple of 8 bytes. - const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6 + size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8) linuxMatcher := linux.KernelXTEntryMatch{ XTEntryMatch: linux.XTEntryMatch{ - MatchSize: size, + MatchSize: uint16(size), }, Data: make([]byte, 0, linux.SizeOfXTUDP), } @@ -270,7 +264,7 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { buf := make([]byte, 0, size) buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher) - buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...) + buf = append(buf, make([]byte, size-len(buf))...) nflog("convert to binary: marshalled UDP matcher into %v", buf) return buf[:] } @@ -410,7 +404,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { } // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certiain protocols, hooks, tables. + // that they only work for certain protocols, hooks, tables. // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry if len(optVal) < int(matchersSize) { @@ -684,3 +678,8 @@ func hookFromLinux(hook int) iptables.Hook { } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } + +// alignUp rounds a length up to an alignment. align must be a power of 2. +func alignUp(length int, align uint) int { + return (length + int(align) - 1) & ^(int(align) - 1) +} -- cgit v1.2.3 From 6adbdfe232c3da42a7f6f3a7d882d140196e4068 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 29 Jan 2020 07:50:34 -0500 Subject: supporting sError in guest kernel on Arm64 For test case 'TestBounce', we use KVM_SET_VCPU_EVENTS to trigger sError to leave guest. Signed-off-by: Bin Lu --- pkg/sentry/platform/ring0/aarch64.go | 6 +++--- pkg/sentry/platform/ring0/entry_arm64.s | 14 +++++++++++++- pkg/sentry/platform/ring0/offsets_arm64.go | 1 + 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go index 6b078cd1e..f6da41c27 100644 --- a/pkg/sentry/platform/ring0/aarch64.go +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -88,14 +88,14 @@ const ( El0Sync_undef El0Sync_dbg El0Sync_inv - VirtualizationException _NR_INTERRUPTS ) // System call vectors. const ( - Syscall Vector = El0Sync_svc - PageFault Vector = El0Sync_da + Syscall Vector = El0Sync_svc + PageFault Vector = El0Sync_da + VirtualizationException Vector = El0Error ) // VirtualAddressBits returns the number bits available for virtual addresses. diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index 679842288..baa6c4910 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -601,7 +601,19 @@ TEXT ·El0_fiq(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_error(SB),NOSPLIT,$0 - B ·Shutdown(SB) + KERNEL_ENTRY_FROM_EL0 + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + WORD $0xd538601a //MRS FAR_EL1, R26 + + MOVD R26, CPU_FAULT_ADDR(RSV_REG) + + MOVD $1, R3 + MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. + + MOVD $VirtualizationException, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + B ·Halt(SB) TEXT ·El0_sync_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index 8c960c749..057fb5c69 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -85,6 +85,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) p := &syscall.PtraceRegs{} fmt.Fprintf(w, "\n// Ptrace registers.\n") -- cgit v1.2.3 From 8dcedc953a610b97efe9f68ac8fecf5e15a7e26b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 29 Jan 2020 10:08:25 -0800 Subject: Add //pkg/sentry/devices/memdev. PiperOrigin-RevId: 292165063 --- pkg/abi/linux/dev.go | 3 ++ pkg/sentry/devices/memdev/BUILD | 28 +++++++++++ pkg/sentry/devices/memdev/full.go | 75 ++++++++++++++++++++++++++++++ pkg/sentry/devices/memdev/memdev.go | 59 ++++++++++++++++++++++++ pkg/sentry/devices/memdev/null.go | 76 ++++++++++++++++++++++++++++++ pkg/sentry/devices/memdev/random.go | 92 +++++++++++++++++++++++++++++++++++++ pkg/sentry/devices/memdev/zero.go | 88 +++++++++++++++++++++++++++++++++++ 7 files changed, 421 insertions(+) create mode 100644 pkg/sentry/devices/memdev/BUILD create mode 100644 pkg/sentry/devices/memdev/full.go create mode 100644 pkg/sentry/devices/memdev/memdev.go create mode 100644 pkg/sentry/devices/memdev/null.go create mode 100644 pkg/sentry/devices/memdev/random.go create mode 100644 pkg/sentry/devices/memdev/zero.go (limited to 'pkg') diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 421e11256..89f9a793f 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -36,6 +36,9 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) { // // See Documentations/devices.txt and uapi/linux/major.h. const ( + // MEM_MAJOR is the major device number for "memory" character devices. + MEM_MAJOR = 1 + // TTYAUX_MAJOR is the major device number for alternate TTY devices. TTYAUX_MAJOR = 5 diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD new file mode 100644 index 000000000..abe58f818 --- /dev/null +++ b/pkg/sentry/devices/memdev/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "memdev", + srcs = [ + "full.go", + "memdev.go", + "null.go", + "random.go", + "zero.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/rand", + "//pkg/safemem", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go new file mode 100644 index 000000000..c7e197691 --- /dev/null +++ b/pkg/sentry/devices/memdev/full.go @@ -0,0 +1,75 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const fullDevMinor = 7 + +// fullDevice implements vfs.Device for /dev/full. +type fullDevice struct{} + +// Open implements vfs.Device.Open. +func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &fullFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// fullFD implements vfs.FileDescriptionImpl for /dev/full. +type fullFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fullFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSPC +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSPC +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} diff --git a/pkg/sentry/devices/memdev/memdev.go b/pkg/sentry/devices/memdev/memdev.go new file mode 100644 index 000000000..5759900c4 --- /dev/null +++ b/pkg/sentry/devices/memdev/memdev.go @@ -0,0 +1,59 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memdev implements "mem" character devices, as implemented in Linux +// by drivers/char/mem.c and drivers/char/random.c. +package memdev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Register registers all devices implemented by this package in vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + for minor, dev := range map[uint32]vfs.Device{ + nullDevMinor: nullDevice{}, + zeroDevMinor: zeroDevice{}, + fullDevMinor: fullDevice{}, + randomDevMinor: randomDevice{}, + urandomDevMinor: randomDevice{}, + } { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MEM_MAJOR, minor, dev, &vfs.RegisterDeviceOptions{ + GroupName: "mem", + }); err != nil { + return err + } + } + return nil +} + +// CreateDevtmpfsFiles creates device special files in dev representing all +// devices implemented by this package. +func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { + for minor, name := range map[uint32]string{ + nullDevMinor: "null", + zeroDevMinor: "zero", + fullDevMinor: "full", + randomDevMinor: "random", + urandomDevMinor: "urandom", + } { + if err := dev.CreateDeviceFile(ctx, name, vfs.CharDevice, linux.MEM_MAJOR, minor, 0666 /* mode */); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go new file mode 100644 index 000000000..33d060d02 --- /dev/null +++ b/pkg/sentry/devices/memdev/null.go @@ -0,0 +1,76 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "io" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +const nullDevMinor = 3 + +// nullDevice implements vfs.Device for /dev/null. +type nullDevice struct{} + +// Open implements vfs.Device.Open. +func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &nullFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// nullFD implements vfs.FileDescriptionImpl for /dev/null. +type nullFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *nullFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *nullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, io.EOF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *nullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, io.EOF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *nullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *nullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *nullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go new file mode 100644 index 000000000..acfa23149 --- /dev/null +++ b/pkg/sentry/devices/memdev/random.go @@ -0,0 +1,92 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + randomDevMinor = 8 + urandomDevMinor = 9 +) + +// randomDevice implements vfs.Device for /dev/random and /dev/urandom. +type randomDevice struct{} + +// Open implements vfs.Device.Open. +func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &randomFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// randomFD implements vfs.FileDescriptionImpl for /dev/random. +type randomFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // off is the "file offset". off is accessed using atomic memory + // operations. + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *randomFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *randomFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *randomFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) + atomic.AddInt64(&fd.off, n) + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *randomFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // In Linux, this mixes the written bytes into the entropy pool; we just + // throw them away. + return src.NumBytes(), nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *randomFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + atomic.AddInt64(&fd.off, src.NumBytes()) + return src.NumBytes(), nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *randomFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Linux: drivers/char/random.c:random_fops.llseek == urandom_fops.llseek + // == noop_llseek + return atomic.LoadInt64(&fd.off), nil +} diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go new file mode 100644 index 000000000..3b1372b9e --- /dev/null +++ b/pkg/sentry/devices/memdev/zero.go @@ -0,0 +1,88 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +const zeroDevMinor = 5 + +// zeroDevice implements vfs.Device for /dev/zero. +type zeroDevice struct{} + +// Open implements vfs.Device.Open. +func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &zeroFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// zeroFD implements vfs.FileDescriptionImpl for /dev/zero. +type zeroFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *zeroFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *zeroFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *zeroFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *zeroFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *zeroFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if err != nil { + return err + } + opts.MappingIdentity = m + opts.Mappable = m + return nil +} -- cgit v1.2.3 From 37bb502670caefd4113da062495b4e318ea0f72e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 29 Jan 2020 10:35:32 -0800 Subject: sentry: rename SetRSEQInterruptedIP to SetOldRSeqInterruptedIP for arm64 For amd64, this has been done on cl/288342928. PiperOrigin-RevId: 292170856 --- pkg/sentry/arch/arch_arm64.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 94f1a808f..ac98897b5 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -137,8 +137,8 @@ func (c *context64) SetTLS(value uintptr) bool { return false } -// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. -func (c *context64) SetRSEQInterruptedIP(value uintptr) { +// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. +func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { c.Regs.Regs[3] = uint64(value) } -- cgit v1.2.3 From 148fda60e8dee29f2df85e3104e3d5de1a225bcf Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 29 Jan 2020 11:15:59 -0800 Subject: Add plumbing for file locks in VFS2. Updates #1480 PiperOrigin-RevId: 292180192 --- pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/file_description.go | 21 ++++++++++++++++++++- pkg/sentry/vfs/file_description_impl_util.go | 21 +++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'pkg') diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index ced9d07b1..14b39eb9d 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -44,6 +44,7 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sync", diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index badacb55e..5bac660c7 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" @@ -393,7 +394,25 @@ type FileDescriptionImpl interface { // Removexattr removes the given extended attribute from the file. Removexattr(ctx context.Context, name string) error - // TODO: file locking + // LockBSD tries to acquire a BSD-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): BSD-style file locking + LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error + + // LockBSD releases a BSD-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): BSD-style file locking + UnlockBSD(ctx context.Context, uid lock.UniqueID) error + + // LockPOSIX tries to acquire a POSIX-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): POSIX-style file locking + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error + + // UnlockPOSIX releases a POSIX-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): POSIX-style file locking + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error } // Dirent holds the information contained in struct linux_dirent64. diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index a4900c170..c2a52ec1b 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -152,6 +153,26 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) return syserror.ENOTSUP } +// LockBSD implements FileDescriptionImpl.LockBSD. +func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { + return syserror.EBADF +} + +// UnlockBSD implements FileDescriptionImpl.UnlockBSD. +func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { + return syserror.EBADF +} + +// LockPOSIX implements FileDescriptionImpl.LockPOSIX. +func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { + return syserror.EBADF +} + +// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. +func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { + return syserror.EBADF +} + // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. -- cgit v1.2.3 From 51b783505b1ec164b02b48a0fd234509fba01a73 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Wed, 29 Jan 2020 15:41:51 -0800 Subject: Add support for TCP_DEFER_ACCEPT. PiperOrigin-RevId: 292233574 --- pkg/sentry/socket/netstack/netstack.go | 22 ++++ pkg/tcpip/tcpip.go | 6 ++ pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/accept.go | 25 ++--- pkg/tcpip/transport/tcp/connect.go | 53 +++++++++- pkg/tcpip/transport/tcp/endpoint.go | 26 ++++- pkg/tcpip/transport/tcp/forwarder.go | 4 +- pkg/tcpip/transport/tcp/tcp_test.go | 126 ++++++++++++++++++++++ test/syscalls/linux/socket_inet_loopback.cc | 158 ++++++++++++++++++++++++++++ test/syscalls/linux/tcp_socket.cc | 53 ++++++++++ 10 files changed, 451 insertions(+), 23 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 8619cc506..049d04bf2 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -1260,6 +1260,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil + case linux.TCP_DEFER_ACCEPT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPDeferAcceptOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + default: emitUnimplementedEventTCP(t, name) } @@ -1713,6 +1725,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)))) + case linux.TCP_DEFER_ACCEPT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < 0 { + v = 0 + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 59c9b3fb0..0fa141d58 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -626,6 +626,12 @@ type TCPLingerTimeoutOption time.Duration // before being marked closed. type TCPTimeWaitTimeoutOption time.Duration +// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a +// accept to return a completed connection only when there is data to be +// read. This usually means the listening socket will drop the final ACK +// for a handshake till the specified timeout until a segment with data arrives. +type TCPDeferAcceptOption time.Duration + // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default // TTL value for multicast messages. The default is 1. type MulticastTTLOption uint8 diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 4acd9fb9a..7b4a87a2d 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -57,6 +57,7 @@ go_library( imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ + "//pkg/log", "//pkg/rand", "//pkg/sleep", "//pkg/sync", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d469758eb..6101f2945 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -222,13 +222,13 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu // createConnectingEndpoint creates a new endpoint in a connecting state, with // the connection parameters given by the arguments. -func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) { +func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { // Create a new endpoint. netProto := l.netProto if netProto == 0 { netProto = s.route.NetProto } - n := newEndpoint(l.stack, netProto, nil) + n := newEndpoint(l.stack, netProto, queue) n.v6only = l.v6only n.ID = s.id n.boundNICID = s.route.NICID() @@ -273,16 +273,17 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i // createEndpoint creates a new endpoint in connected state and then performs // the TCP 3-way handshake. -func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) { +func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber isn := generateSecureISN(s.id, l.stack.Seed()) - ep, err := l.createConnectingEndpoint(s, isn, irs, opts) + ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue) if err != nil { return nil, err } // listenEP is nil when listenContext is used by tcp.Forwarder. + deferAccept := time.Duration(0) if l.listenEP != nil { l.listenEP.mu.Lock() if l.listenEP.EndpointState() != StateListen { @@ -290,13 +291,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head return nil, tcpip.ErrConnectionAborted } l.addPendingEndpoint(ep) + deferAccept = l.listenEP.deferAccept l.listenEP.mu.Unlock() } // Perform the 3-way handshake. - h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow())) - - h.resetToSynRcvd(isn, irs, opts) + h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept) if err := h.execute(); err != nil { ep.Close() if l.listenEP != nil { @@ -377,16 +377,14 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header defer e.decSynRcvdCount() defer s.decRef() - n, err := ctx.createEndpointAndPerformHandshake(s, opts) + n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}) if err != nil { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() return } ctx.removePendingEndpoint(n) - // Start the protocol goroutine. - wq := &waiter.Queue{} - n.startAcceptedLoop(wq) + n.startAcceptedLoop() e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() e.deliverAccepted(n) @@ -546,7 +544,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr } - n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions) + n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{}) if err != nil { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() @@ -576,8 +574,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { // space available in the backlog. // Start the protocol goroutine. - wq := &waiter.Queue{} - n.startAcceptedLoop(wq) + n.startAcceptedLoop() e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() go e.deliverAccepted(n) } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 4e3c5419c..9ff7ac261 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -86,6 +86,19 @@ type handshake struct { // rcvWndScale is the receive window scale, as defined in RFC 1323. rcvWndScale int + + // startTime is the time at which the first SYN/SYN-ACK was sent. + startTime time.Time + + // deferAccept if non-zero will drop the final ACK for a passive + // handshake till an ACK segment with data is received or the timeout is + // hit. + deferAccept time.Duration + + // acked is true if the the final ACK for a 3-way handshake has + // been received. This is required to stop retransmitting the + // original SYN-ACK when deferAccept is enabled. + acked bool } func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake { @@ -112,6 +125,12 @@ func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake { return h } +func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake { + h := newHandshake(ep, rcvWnd) + h.resetToSynRcvd(isn, irs, opts, deferAccept) + return h +} + // FindWndScale determines the window scale to use for the given maximum window // size. func FindWndScale(wnd seqnum.Size) int { @@ -181,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 { // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD // state. -func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) { +func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) { h.active = false h.state = handshakeSynRcvd h.flags = header.TCPFlagSyn | header.TCPFlagAck @@ -189,6 +208,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea h.ackNum = irs + 1 h.mss = opts.MSS h.sndWndScale = opts.WS + h.deferAccept = deferAccept h.ep.mu.Lock() h.ep.setEndpointState(StateSynRecv) h.ep.mu.Unlock() @@ -352,6 +372,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { // We have previously received (and acknowledged) the peer's SYN. If the // peer acknowledges our SYN, the handshake is completed. if s.flagIsSet(header.TCPFlagAck) { + // If deferAccept is not zero and this is a bare ACK and the + // timeout is not hit then drop the ACK. + if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept { + h.acked = true + h.ep.stack.Stats().DroppedPackets.Increment() + return nil + } + // If the timestamp option is negotiated and the segment does // not carry a timestamp option then the segment must be dropped // as per https://tools.ietf.org/html/rfc7323#section-3.2. @@ -365,10 +393,16 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) } h.state = handshakeCompleted + h.ep.mu.Lock() h.ep.transitionToStateEstablishedLocked(h) + // If the segment has data then requeue it for the receiver + // to process it again once main loop is started. + if s.data.Size() > 0 { + s.incRef() + h.ep.enqueueSegment(s) + } h.ep.mu.Unlock() - return nil } @@ -471,6 +505,7 @@ func (h *handshake) execute() *tcpip.Error { } } + h.startTime = time.Now() // Initialize the resend timer. resendWaker := sleep.Waker{} timeOut := time.Duration(time.Second) @@ -524,11 +559,21 @@ func (h *handshake) execute() *tcpip.Error { switch index, _ := s.Fetch(true); index { case wakerForResend: timeOut *= 2 - if timeOut > 60*time.Second { + if timeOut > MaxRTO { return tcpip.ErrTimeout } rt.Reset(timeOut) - h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) + // Resend the SYN/SYN-ACK only if the following conditions hold. + // - It's an active handshake (deferAccept does not apply) + // - It's a passive handshake and we have not yet got the final-ACK. + // - It's a passive handshake and we got an ACK but deferAccept is + // enabled and we are now past the deferAccept duration. + // The last is required to provide a way for the peer to complete + // the connection with another ACK or data (as ACKs are never + // retransmitted on their own). + if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept { + h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) + } case wakerForNotification: n := h.ep.fetchNotifications() diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 13718ff55..8d52414b7 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -498,6 +498,13 @@ type endpoint struct { // without any data being acked. userTimeout time.Duration + // deferAccept if non-zero specifies a user specified time during + // which the final ACK of a handshake will be dropped provided the + // ACK is a bare ACK and carries no data. If the timeout is crossed then + // the bare ACK is accepted and the connection is delivered to the + // listener. + deferAccept time.Duration + // pendingAccepted is a synchronization primitive used to track number // of connections that are queued up to be delivered to the accepted // channel. We use this to ensure that all goroutines blocked on writing @@ -1574,6 +1581,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case tcpip.TCPDeferAcceptOption: + e.mu.Lock() + if time.Duration(v) > MaxRTO { + v = tcpip.TCPDeferAcceptOption(MaxRTO) + } + e.deferAccept = time.Duration(v) + e.mu.Unlock() + return nil + default: return nil } @@ -1798,6 +1814,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case *tcpip.TCPDeferAcceptOption: + e.mu.Lock() + *o = tcpip.TCPDeferAcceptOption(e.deferAccept) + e.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -2149,9 +2171,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. -func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) { +func (e *endpoint) startAcceptedLoop() { e.mu.Lock() - e.waiterQueue = waiterQueue e.workerRunning = true e.mu.Unlock() wakerInitDone := make(chan struct{}) @@ -2177,7 +2198,6 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { default: return nil, nil, tcpip.ErrWouldBlock } - return n, n.waiterQueue, nil } diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 7eb613be5..c9ee5bf06 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -157,13 +157,13 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, TSVal: r.synOptions.TSVal, TSEcr: r.synOptions.TSEcr, SACKPermitted: r.synOptions.SACKPermitted, - }) + }, queue) if err != nil { return nil, err } // Start the protocol goroutine. - ep.startAcceptedLoop(queue) + ep.startAcceptedLoop() return ep, nil } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index df2fb1071..a12336d47 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -6787,3 +6787,129 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) { ), ) } + +func TestTCPDeferAccept(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.Create(-1) + + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatal("Bind failed:", err) + } + + if err := c.EP.Listen(10); err != nil { + t.Fatal("Listen failed:", err) + } + + const tcpDeferAccept = 1 * time.Second + if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil { + t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err) + } + + irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) + + if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock { + t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock) + } + + // Send data. This should result in an acceptable endpoint. + c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + }) + + // Receive ACK for the data we sent. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck), + checker.SeqNum(uint32(iss+1)), + checker.AckNum(uint32(irs+5)))) + + // Give a bit of time for the socket to be delivered to the accept queue. + time.Sleep(50 * time.Millisecond) + aep, _, err := c.EP.Accept() + if err != nil { + t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err) + } + + aep.Close() + // Closing aep without reading the data should trigger a RST. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck), + checker.SeqNum(uint32(iss+1)), + checker.AckNum(uint32(irs+5)))) +} + +func TestTCPDeferAcceptTimeout(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.Create(-1) + + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatal("Bind failed:", err) + } + + if err := c.EP.Listen(10); err != nil { + t.Fatal("Listen failed:", err) + } + + const tcpDeferAccept = 1 * time.Second + if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil { + t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err) + } + + irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) + + if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock { + t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock) + } + + // Sleep for a little of the tcpDeferAccept timeout. + time.Sleep(tcpDeferAccept + 100*time.Millisecond) + + // On timeout expiry we should get a SYN-ACK retransmission. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.SrcPort(context.StackPort), + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn), + checker.AckNum(uint32(irs)+1))) + + // Send data. This should result in an acceptable endpoint. + c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + }) + + // Receive ACK for the data we sent. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.SrcPort(context.StackPort), + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck), + checker.SeqNum(uint32(iss+1)), + checker.AckNum(uint32(irs+5)))) + + // Give sometime for the endpoint to be delivered to the accept queue. + time.Sleep(50 * time.Millisecond) + aep, _, err := c.EP.Accept() + if err != nil { + t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err) + } + + aep.Close() + // Closing aep without reading the data should trigger a RST. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.SrcPort(context.StackPort), + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck), + checker.SeqNum(uint32(iss+1)), + checker.AckNum(uint32(irs+5)))) +} diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 2f9821555..3bf7081b9 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -828,6 +828,164 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) { EXPECT_EQ(get, kUserTimeout); } +// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not +// saved. Enable S/R once issue is fixed. +TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) { + // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not + // saved. Enable S/R issue is fixed. + DisableSave ds; + + auto const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast(&listen_addr), &addrlen), + SyscallSucceeds()); + + const uint16_t port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + // Set the TCP_DEFER_ACCEPT on the listening socket. + constexpr int kTCPDeferAccept = 3; + ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, + &kTCPDeferAccept, sizeof(kTCPDeferAccept)), + SyscallSucceeds()); + + // Connect to the listening socket. + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), + reinterpret_cast(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Set the listening socket to nonblock so that we can verify that there is no + // connection in queue despite the connect above succeeding since the peer has + // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the + // FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds()); + + ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Set FD back to blocking. + opts &= ~O_NONBLOCK; + ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds()); + + // Now write some data to the socket. + int data = 0; + ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)), + SyscallSucceedsWithValue(sizeof(data))); + + // This should now cause the connection to complete and be delivered to the + // accept socket. + + // Accept the connection. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + + // Verify that the accepted socket returns the data written. + int get = -1; + ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0), + SyscallSucceedsWithValue(sizeof(get))); + + EXPECT_EQ(get, data); +} + +// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not +// saved. Enable S/R once issue is fixed. +TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) { + // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not + // saved. Enable S/R once issue is fixed. + DisableSave ds; + + auto const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast(&listen_addr), &addrlen), + SyscallSucceeds()); + + const uint16_t port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + // Set the TCP_DEFER_ACCEPT on the listening socket. + constexpr int kTCPDeferAccept = 3; + ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, + &kTCPDeferAccept, sizeof(kTCPDeferAccept)), + SyscallSucceeds()); + + // Connect to the listening socket. + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), + reinterpret_cast(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Set the listening socket to nonblock so that we can verify that there is no + // connection in queue despite the connect above succeeding since the peer has + // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the + // FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds()); + + // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT + // timeout is hit. + absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1)); + ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), + SyscallFailsWithErrno(EWOULDBLOCK)); + + // Set FD back to blocking. + opts &= ~O_NONBLOCK; + ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds()); + + // Now sleep for a little over the TCP_DEFER_ACCEPT duration. When the timeout + // is hit a SYN-ACK should be retransmitted by the listener as a last ditch + // attempt to complete the connection with or without data. + absl::SleepFor(absl::Seconds(2)); + + // Verify that we have a connection that can be accepted even though no + // data was written. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); +} + INSTANTIATE_TEST_SUITE_P( All, SocketInetLoopbackTest, ::testing::Values( diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 33a5ac66c..525ccbd88 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -1286,6 +1286,59 @@ TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) { EXPECT_EQ(get, kTCPUserTimeout); } +TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + // -ve TCP_DEFER_ACCEPT is same as setting it to zero. + constexpr int kNeg = -1; + EXPECT_THAT( + setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)), + SyscallSucceeds()); + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 0); +} + +TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 0); +} + +TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) { + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + // kTCPDeferAccept is in seconds. + // NOTE: linux translates seconds to # of retries and back from + // #of retries to seconds. Which means only certain values + // translate back exactly. That's why we use 3 here, a value of + // 5 will result in us getting back 7 instead of 5 in the + // getsockopt. + constexpr int kTCPDeferAccept = 3; + ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, + &kTCPDeferAccept, sizeof(kTCPDeferAccept)), + SyscallSucceeds()); + int get = -1; + socklen_t get_len = sizeof(get); + ASSERT_THAT( + getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len), + SyscallSucceeds()); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kTCPDeferAccept); +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest, ::testing::Values(AF_INET, AF_INET6)); -- cgit v1.2.3 From 6f841c304d7bd9af6167d7d049bd5c594358a1b9 Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Wed, 29 Jan 2020 19:54:11 -0800 Subject: Do not spawn a goroutine when calling stack.NDPDispatcher's methods Do not start a new goroutine when calling stack.NDPDispatcher.OnDuplicateAddressDetectionStatus. PiperOrigin-RevId: 292268574 --- pkg/tcpip/stack/ndp.go | 8 ++++---- pkg/tcpip/stack/ndp_test.go | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 245694118..281ae786d 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -167,8 +167,8 @@ type NDPDispatcher interface { // reason, such as the address being removed). If an error occured // during DAD, err will be set and resolved must be ignored. // - // This function is permitted to block indefinitely without interfering - // with the stack's operation. + // This function is not permitted to block indefinitely. This function + // is also not permitted to call into the stack. OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) // OnDefaultRouterDiscovered will be called when a new default router is @@ -607,8 +607,8 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) { delete(ndp.dad, addr) // Let the integrator know DAD did not resolve. - if ndp.nic.stack.ndpDisp != nil { - go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil) + if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil) } } diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 726468e41..8c76e80f2 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -497,7 +497,7 @@ func TestDADFail(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { ndpDisp := ndpDispatcher{ - dadC: make(chan ndpDADEvent), + dadC: make(chan ndpDADEvent, 1), } ndpConfigs := stack.DefaultNDPConfigurations() opts := stack.Options{ @@ -576,7 +576,7 @@ func TestDADFail(t *testing.T) { // removed. func TestDADStop(t *testing.T) { ndpDisp := ndpDispatcher{ - dadC: make(chan ndpDADEvent), + dadC: make(chan ndpDADEvent, 1), } ndpConfigs := stack.NDPConfigurations{ RetransmitTimer: time.Second, -- cgit v1.2.3 From ec0679737e8f9ab31ef6c7c3adb5a0005586b5a7 Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Thu, 30 Jan 2020 07:12:04 -0800 Subject: Do not include the Source Link Layer option with an unspecified source address When sending NDP messages with an unspecified source address, the Source Link Layer address must not be included. Test: stack_test.TestDADResolve PiperOrigin-RevId: 292341334 --- pkg/tcpip/stack/ndp.go | 22 ++-------------------- pkg/tcpip/stack/ndp_test.go | 12 ++++++++---- 2 files changed, 10 insertions(+), 24 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 281ae786d..31294345d 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -538,29 +538,11 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error { r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false) defer r.Release() - linkAddr := ndp.nic.linkEP.LinkAddress() - isValidLinkAddr := header.IsValidUnicastEthernetAddress(linkAddr) - ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize - if isValidLinkAddr { - // Only include a Source Link Layer Address option if the NIC has a valid - // link layer address. - // - // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by - // LinkEndpoint.LinkAddress) before reaching this point. - ndpNSSize += header.NDPLinkLayerAddressSize - } - - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + ndpNSSize) - pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize)) pkt.SetType(header.ICMPv6NeighborSolicit) ns := header.NDPNeighborSolicit(pkt.NDPPayload()) ns.SetTargetAddress(addr) - - if isValidLinkAddr { - ns.Options().Serialize(header.NDPOptionsSerializer{ - header.NDPSourceLinkLayerAddressOption(linkAddr), - }) - } pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) sent := r.Stats().ICMP.V6PacketsSent diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 8c76e80f2..bc7cfbcb4 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -413,14 +413,18 @@ func TestDADResolve(t *testing.T) { t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber) } - // Check NDP packet. + // Check NDP NS packet. + // + // As per RFC 4861 section 4.3, a possible option is the Source Link + // Layer option, but this option MUST NOT be included when the source + // address of the packet is the unspecified address. checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(), + checker.SrcAddr(header.IPv6Any), + checker.DstAddr(header.SolicitedNodeAddr(addr1)), checker.TTL(header.NDPHopLimit), checker.NDPNS( checker.NDPNSTargetAddress(addr1), - checker.NDPNSOptions([]header.NDPOption{ - header.NDPSourceLinkLayerAddressOption(linkAddr1), - }), + checker.NDPNSOptions(nil), )) } }) -- cgit v1.2.3 From ede8dfab3760afc8063c3418f217e52f7ec70d42 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 30 Jan 2020 09:13:36 -0800 Subject: Enforce splice offset limits Splice must not allow negative offsets. Writes also must not allow offset + size to overflow int64. Reads are similarly broken, but not just in splice (b/148095030). Reported-by: syzbot+0e1ff0b95fb2859b4190@syzkaller.appspotmail.com PiperOrigin-RevId: 292361208 --- pkg/sentry/fs/tmpfs/inode_file.go | 10 ++++-- pkg/sentry/syscalls/linux/sys_splice.go | 16 ++++------ test/syscalls/linux/splice.cc | 56 +++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 12 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index dabc10662..25abbc151 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -17,6 +17,7 @@ package tmpfs import ( "fmt" "io" + "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -444,10 +445,15 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) defer rw.f.dataMu.Unlock() // Compute the range to write. - end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) - if end == rw.offset { // srcs.NumBytes() == 0? + if srcs.NumBytes() == 0 { + // Nothing to do. return 0, nil } + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == math.MaxInt64 { + // Overflow. + return 0, syserror.EINVAL + } // Check if seals prevent either file growth or all writes. switch { diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index f43d6c155..fd642834b 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -25,6 +25,10 @@ import ( // doSplice implements a blocking splice operation. func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { + if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 { + return 0, syserror.EINVAL + } + var ( total int64 n int64 @@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) - // Don't send a negative number of bytes. - if count < 0 { - return 0, nil, syserror.EINVAL - } - // Get files. inFile := t.GetFile(inFD) if inFile == nil { @@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - // The offset must be valid. - if offset < 0 { - return 0, nil, syserror.EINVAL - } - // Do the splice. n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ Length: count, @@ -227,6 +221,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if _, err := t.CopyIn(outOffset, &offset); err != nil { return 0, nil, err } + // Use the destination offset. opts.DstOffset = true opts.DstStart = offset @@ -244,6 +239,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if _, err := t.CopyIn(inOffset, &offset); err != nil { return 0, nil, err } + // Use the source offset. opts.SrcOffset = true opts.SrcStart = offset diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc index 85232cb1f..faa1247f6 100644 --- a/test/syscalls/linux/splice.cc +++ b/test/syscalls/linux/splice.cc @@ -60,6 +60,62 @@ TEST(SpliceTest, TwoRegularFiles) { SyscallFailsWithErrno(EINVAL)); } +int memfd_create(const std::string& name, unsigned int flags) { + return syscall(__NR_memfd_create, name.c_str(), flags); +} + +TEST(SpliceTest, NegativeOffset) { + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill the pipe. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Open the output file as write only. + int fd; + EXPECT_THAT(fd = memfd_create("negative", 0), SyscallSucceeds()); + const FileDescriptor out_fd(fd); + + loff_t out_offset = 0xffffffffffffffffull; + constexpr int kSize = 2; + EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// Write offset + size overflows int64. +// +// This is a regression test for b/148041624. +TEST(SpliceTest, WriteOverflow) { + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill the pipe. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Open the output file. + int fd; + EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds()); + const FileDescriptor out_fd(fd); + + // out_offset + kSize overflows INT64_MAX. + loff_t out_offset = 0x7ffffffffffffffeull; + constexpr int kSize = 3; + EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + TEST(SpliceTest, SamePipe) { // Create a new pipe. int fds[2]; -- cgit v1.2.3 From 4ee64a248ec16fcc9e526a457a66648546611bfb Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Thu, 30 Jan 2020 11:48:36 -0800 Subject: Fix for panic in endpoint.Close(). When sending a RST on shutdown we need to double check the state after acquiring the work mutex as the endpoint could have transitioned out of a connected state from the time we checked it and we acquired the workMutex. I added two tests but sadly neither reproduce the panic. I am going to leave the tests in as they are good to have anyway. PiperOrigin-RevId: 292393800 --- pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/endpoint.go | 10 ++++- pkg/tcpip/transport/tcp/tcp_test.go | 55 ++++++++++++++++++++++++++++ test/syscalls/linux/BUILD | 1 + test/syscalls/linux/socket_ip_tcp_generic.cc | 33 +++++++++++++++++ 5 files changed, 98 insertions(+), 2 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 7b4a87a2d..272e8f570 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -91,6 +91,7 @@ go_test( tags = ["flaky"], deps = [ ":tcp", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/checker", diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 8d52414b7..b5a8e15ee 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2047,8 +2047,14 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { // work mutex is available. if e.workMu.TryLock() { e.mu.Lock() - e.resetConnectionLocked(tcpip.ErrConnectionAborted) - e.notifyProtocolGoroutine(notifyTickleWorker) + // We need to double check here to make + // sure worker has not transitioned the + // endpoint out of a connected state + // before trying to send a reset. + if e.EndpointState().connected() { + e.resetConnectionLocked(tcpip.ErrConnectionAborted) + e.notifyProtocolGoroutine(notifyTickleWorker) + } e.mu.Unlock() e.workMu.Unlock() } else { diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index a12336d47..2c1505067 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -21,6 +21,7 @@ import ( "testing" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" @@ -6913,3 +6914,57 @@ func TestTCPDeferAcceptTimeout(t *testing.T) { checker.SeqNum(uint32(iss+1)), checker.AckNum(uint32(irs+5)))) } + +func TestResetDuringClose(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + iss := seqnum.Value(789) + c.CreateConnected(iss, 30000, -1 /* epRecvBuf */) + // Send some data to make sure there is some unread + // data to trigger a reset on c.Close. + irs := c.IRS + c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: iss.Add(1), + AckNum: irs.Add(1), + RcvWnd: 30000, + }) + + // Receive ACK for the data we sent. + checker.IPv4(t, c.GetPacket(), checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck), + checker.SeqNum(uint32(irs.Add(1))), + checker.AckNum(uint32(iss.Add(5))))) + + // Close in a separate goroutine so that we can trigger + // a race with the RST we send below. This should not + // panic due to the route being released depeding on + // whether Close() sends an active RST or the RST sent + // below is processed by the worker first. + var wg sync.WaitGroup + + wg.Add(1) + go func() { + defer wg.Done() + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + SeqNum: iss.Add(5), + AckNum: c.IRS.Add(5), + RcvWnd: 30000, + Flags: header.TCPFlagRst, + }) + }() + + wg.Add(1) + go func() { + defer wg.Done() + c.EP.Close() + }() + + wg.Wait() +} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 74bf068ec..7958fd0d7 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2173,6 +2173,7 @@ cc_library( ":socket_test_util", "//test/util:test_util", "//test/util:thread_util", + "@com_google_absl//absl/memory", "@com_google_absl//absl/time", "@com_google_googletest//:gtest", ], diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 57ce8e169..27779e47c 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -24,6 +24,7 @@ #include #include "gtest/gtest.h" +#include "absl/memory/memory.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/socket_test_util.h" @@ -875,5 +876,37 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) { EXPECT_EQ(get, kAbove); } +TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) { + DisableSave ds; // Too many syscalls. + constexpr int kThreadCount = 1000; + std::unique_ptr instances[kThreadCount]; + for (int i = 0; i < kThreadCount; i++) { + instances[i] = absl::make_unique([&]() { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ScopedThread t([&]() { + // Close one end to trigger sending of a FIN. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0}; + // Wait up to 20 seconds for the data. + constexpr int kPollTimeoutMs = 20000; + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds()); + }); + + // Send some data then close. + constexpr char kStr[] = "abc"; + ASSERT_THAT(write(sockets->first_fd(), kStr, 3), + SyscallSucceedsWithValue(3)); + absl::SleepFor(absl::Milliseconds(10)); + ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds()); + t.Join(); + }); + } + for (int i = 0; i < kThreadCount; i++) { + instances[i]->Join(); + } +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From bc3a24d62788bd6b881afeda230f1a5550a5709a Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 31 Jan 2020 11:49:37 -0800 Subject: Internal change. PiperOrigin-RevId: 292587459 --- kokoro/kythe/generate_xrefs.sh | 2 +- pkg/sentry/fs/g3doc/inotify.md | 16 ++++++++-------- pkg/sentry/mm/README.md | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'pkg') diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh index 7a0fbb3cd..323b0f77b 100644 --- a/kokoro/kythe/generate_xrefs.sh +++ b/kokoro/kythe/generate_xrefs.sh @@ -23,7 +23,7 @@ bazel version python3 -V -readonly KYTHE_VERSION='v0.0.39' +readonly KYTHE_VERSION='v0.0.41' readonly WORKDIR="$(mktemp -d)" readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}" if [[ -n "$KOKORO_GIT_COMMIT" ]]; then diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md index 71a577d9d..85063d4e6 100644 --- a/pkg/sentry/fs/g3doc/inotify.md +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -112,11 +112,11 @@ attempts to queue a new event, it is already holding `fs.Watches.mu`. If we used `Inotify.mu` to also protect the event queue, this would violate the above lock ordering. -[dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go -[event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go -[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go -[inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go -[inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go -[inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go -[syscall_dir]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/syscalls/linux/ -[watch]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_watch.go +[dirent]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/dirent.go +[event]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_event.go +[fd_table]: https://github.com/google/gvisor/blob/master/pkg/sentry/kernel/fd_table.go +[inode]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode.go +[inode_watches]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode_inotify.go +[inotify]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify.go +[syscall_dir]: https://github.com/google/gvisor/blob/master/pkg/sentry/syscalls/linux/ +[watch]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_watch.go diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md index e1322e373..f4d43d927 100644 --- a/pkg/sentry/mm/README.md +++ b/pkg/sentry/mm/README.md @@ -274,7 +274,7 @@ In the sentry: methods [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. -[memmap]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/memmap/memmap.go -[mm]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/mm/mm.go -[pgalloc]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/pgalloc/pgalloc.go -[platform]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/platform/platform.go +[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go +[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go +[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go +[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go -- cgit v1.2.3 From 528dd1ec72fee1dd63c734fe92d1b972b5735b8f Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Fri, 31 Jan 2020 13:24:48 -0800 Subject: Extract multicast IP to Ethernet address mapping Test: header.TestEthernetAddressFromMulticastIPAddress PiperOrigin-RevId: 292604649 --- pkg/tcpip/header/eth.go | 41 +++++++++++++++++++++++++++++++++++++++++ pkg/tcpip/header/eth_test.go | 34 ++++++++++++++++++++++++++++++++++ pkg/tcpip/network/arp/arp.go | 19 ++----------------- pkg/tcpip/network/ipv6/icmp.go | 20 ++------------------ 4 files changed, 79 insertions(+), 35 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index f5d2c127f..b1e92d2d7 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -134,3 +134,44 @@ func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool { // addr is a valid unicast ethernet address. return true } + +// EthernetAddressFromMulticastIPv4Address returns a multicast Ethernet address +// for a multicast IPv4 address. +// +// addr MUST be a multicast IPv4 address. +func EthernetAddressFromMulticastIPv4Address(addr tcpip.Address) tcpip.LinkAddress { + var linkAddrBytes [EthernetAddressSize]byte + // RFC 1112 Host Extensions for IP Multicasting + // + // 6.4. Extensions to an Ethernet Local Network Module: + // + // An IP host group address is mapped to an Ethernet multicast + // address by placing the low-order 23-bits of the IP address + // into the low-order 23 bits of the Ethernet multicast address + // 01-00-5E-00-00-00 (hex). + linkAddrBytes[0] = 0x1 + linkAddrBytes[2] = 0x5e + linkAddrBytes[3] = addr[1] & 0x7F + copy(linkAddrBytes[4:], addr[IPv4AddressSize-2:]) + return tcpip.LinkAddress(linkAddrBytes[:]) +} + +// EthernetAddressFromMulticastIPv6Address returns a multicast Ethernet address +// for a multicast IPv6 address. +// +// addr MUST be a multicast IPv6 address. +func EthernetAddressFromMulticastIPv6Address(addr tcpip.Address) tcpip.LinkAddress { + // RFC 2464 Transmission of IPv6 Packets over Ethernet Networks + // + // 7. Address Mapping -- Multicast + // + // An IPv6 packet with a multicast destination address DST, + // consisting of the sixteen octets DST[1] through DST[16], is + // transmitted to the Ethernet multicast address whose first + // two octets are the value 3333 hexadecimal and whose last + // four octets are the last four octets of DST. + linkAddrBytes := []byte(addr[IPv6AddressSize-EthernetAddressSize:]) + linkAddrBytes[0] = 0x33 + linkAddrBytes[1] = 0x33 + return tcpip.LinkAddress(linkAddrBytes[:]) +} diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go index 6634c90f5..7a0014ad9 100644 --- a/pkg/tcpip/header/eth_test.go +++ b/pkg/tcpip/header/eth_test.go @@ -66,3 +66,37 @@ func TestIsValidUnicastEthernetAddress(t *testing.T) { }) } } + +func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) { + tests := []struct { + name string + addr tcpip.Address + expectedLinkAddr tcpip.LinkAddress + }{ + { + name: "IPv4 Multicast without 24th bit set", + addr: "\xe0\x7e\xdc\xba", + expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba", + }, + { + name: "IPv4 Multicast with 24th bit set", + addr: "\xe0\xfe\xdc\xba", + expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr { + t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", got, test.expectedLinkAddr) + } + }) + } +} + +func TestEthernetAddressFromMulticastIPv6Address(t *testing.T) { + addr := tcpip.Address("\xff\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x1a") + if got, want := EthernetAddressFromMulticastIPv6Address(addr), tcpip.LinkAddress("\x33\x33\x0d\x0e\x0f\x1a"); got != want { + t.Fatalf("got EthernetAddressFromMulticastIPv6Address(%s) = %s, want = %s", addr, got, want) + } +} diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 1ceaebfbd..4da13c5df 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -178,24 +178,9 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo return broadcastMAC, true } if header.IsV4MulticastAddress(addr) { - // RFC 1112 Host Extensions for IP Multicasting - // - // 6.4. Extensions to an Ethernet Local Network Module: - // - // An IP host group address is mapped to an Ethernet multicast - // address by placing the low-order 23-bits of the IP address - // into the low-order 23 bits of the Ethernet multicast address - // 01-00-5E-00-00-00 (hex). - return tcpip.LinkAddress([]byte{ - 0x01, - 0x00, - 0x5e, - addr[header.IPv4AddressSize-3] & 0x7f, - addr[header.IPv4AddressSize-2], - addr[header.IPv4AddressSize-1], - }), true + return header.EthernetAddressFromMulticastIPv4Address(addr), true } - return "", false + return tcpip.LinkAddress([]byte(nil)), false } // SetOption implements NetworkProtocol. diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index dc20c0fd7..7491cfc41 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -441,23 +441,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack. // ResolveStaticAddress implements stack.LinkAddressResolver. func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { if header.IsV6MulticastAddress(addr) { - // RFC 2464 Transmission of IPv6 Packets over Ethernet Networks - // - // 7. Address Mapping -- Multicast - // - // An IPv6 packet with a multicast destination address DST, - // consisting of the sixteen octets DST[1] through DST[16], is - // transmitted to the Ethernet multicast address whose first - // two octets are the value 3333 hexadecimal and whose last - // four octets are the last four octets of DST. - return tcpip.LinkAddress([]byte{ - 0x33, - 0x33, - addr[header.IPv6AddressSize-4], - addr[header.IPv6AddressSize-3], - addr[header.IPv6AddressSize-2], - addr[header.IPv6AddressSize-1], - }), true + return header.EthernetAddressFromMulticastIPv6Address(addr), true } - return "", false + return tcpip.LinkAddress([]byte(nil)), false } -- cgit v1.2.3 From eba7bdc24d31388ca81eeab251ed2db108f785dc Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 31 Jan 2020 13:46:13 -0800 Subject: iptables: enable TCP matching with "-m tcp". A couple other things that changed: - There's a proper extension registration system for matchers. Anyone adding another matcher can use tcp_matcher.go or udp_matcher.go as a template. - All logging and use of syserr.Error in the netfilter package happens at the highest possible level (public functions). Lower-level functions just return normal, descriptive golang errors. --- pkg/abi/linux/netfilter.go | 52 ++++++++ pkg/sentry/socket/netfilter/BUILD | 4 + pkg/sentry/socket/netfilter/extensions.go | 98 +++++++++++++++ pkg/sentry/socket/netfilter/netfilter.go | 187 ++++++++--------------------- pkg/sentry/socket/netfilter/tcp_matcher.go | 143 ++++++++++++++++++++++ pkg/sentry/socket/netfilter/udp_matcher.go | 142 ++++++++++++++++++++++ pkg/tcpip/iptables/BUILD | 1 - pkg/tcpip/iptables/types.go | 3 + pkg/tcpip/iptables/udp_matcher.go | 113 ----------------- 9 files changed, 495 insertions(+), 248 deletions(-) create mode 100644 pkg/sentry/socket/netfilter/extensions.go create mode 100644 pkg/sentry/socket/netfilter/tcp_matcher.go create mode 100644 pkg/sentry/socket/netfilter/udp_matcher.go delete mode 100644 pkg/tcpip/iptables/udp_matcher.go (limited to 'pkg') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index 8e40bcc62..e4aabb6bb 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -348,6 +348,58 @@ func goString(cstring []byte) string { return string(cstring) } +// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp +// in include/uapi/linux/netfilter/xt_tcpudp.h. +type XTTCP struct { + // SourcePortStart specifies the inclusive start of the range of source + // ports to which the matcher applies. + SourcePortStart uint16 + + // SourcePortEnd specifies the inclusive end of the range of source ports + // to which the matcher applies. + SourcePortEnd uint16 + + // DestinationPortStart specifies the start of the destination port + // range to which the matcher applies. + DestinationPortStart uint16 + + // DestinationPortEnd specifies the start of the destination port + // range to which the matcher applies. + DestinationPortEnd uint16 + + // Option specifies that a particular TCP option must be set. + Option uint8 + + // FlagMask masks the FlagCompare byte when comparing to the TCP flag + // fields. + FlagMask uint8 + + // FlagCompare is binary and-ed with the TCP flag fields. + FlagCompare uint8 + + // InverseFlags flips the meaning of certain fields. See the + // TX_TCP_INV_* flags. + InverseFlags uint8 +} + +// SizeOfXTTCP is the size of an XTTCP. +const SizeOfXTTCP = 12 + +// Flags in XTTCP.InverseFlags. Corresponding constants are in +// include/uapi/linux/netfilter/xt_tcpudp.h. +const ( + // Invert the meaning of SourcePortStart/End. + XT_TCP_INV_SRCPT = 0x01 + // Invert the meaning of DestinationPortStart/End. + XT_TCP_INV_DSTPT = 0x02 + // Invert the meaning of FlagCompare. + XT_TCP_INV_FLAGS = 0x04 + // Invert the meaning of Option. + XT_TCP_INV_OPTION = 0x08 + // Enable all flags. + XT_TCP_INV_MASK = 0x0F +) + // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp // in include/uapi/linux/netfilter/xt_tcpudp.h. type XTUDP struct { diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index fa2a2cb66..c91ec7494 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -5,7 +5,10 @@ package(licenses = ["notice"]) go_library( name = "netfilter", srcs = [ + "extensions.go", "netfilter.go", + "tcp_matcher.go", + "udp_matcher.go", ], # This target depends on netstack and should only be used by epsocket, # which is allowed to depend on netstack. @@ -17,6 +20,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/syserr", "//pkg/tcpip", + "//pkg/tcpip/header", "//pkg/tcpip/iptables", "//pkg/tcpip/stack", "//pkg/usermem", diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go new file mode 100644 index 000000000..5a4cac84c --- /dev/null +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -0,0 +1,98 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/usermem" +) + +// TODO(gvisor.dev/issue/170): The following per-matcher params should be +// supported: +// - Table name +// - Match size +// - User size +// - Hooks +// - Proto +// - Family + +// matchMarshaler knows how to (un)marshal the matcher named name(). +type matchMarshaler interface { + // name is the matcher name as stored in the xt_entry_match struct. + name() string + + // marshal converts from an iptables.Matcher to an ABI struct. + marshal(matcher iptables.Matcher) []byte + + // unmarshal converts from the ABI matcher struct to an + // iptables.Matcher. + unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) +} + +var matchMarshalers = map[string]matchMarshaler{} + +// registerMatchMarshaler should be called by match extensions to register them +// with the netfilter package. +func registerMatchMarshaler(mm matchMarshaler) { + if _, ok := matchMarshalers[mm.name()]; ok { + panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name())) + } + matchMarshalers[mm.name()] = mm +} + +func marshalMatcher(matcher iptables.Matcher) []byte { + matchMaker, ok := matchMarshalers[matcher.Name()] + if !ok { + panic(fmt.Errorf("Unknown matcher of type %T.", matcher)) + } + return matchMaker.marshal(matcher) +} + +// marshalEntryMatch creates a marshalled XTEntryMatch with the given name and +// data appended at the end. +func marshalEntryMatch(name string, data []byte) []byte { + nflog("marshaling matcher %q", name) + + // We have to pad this struct size to a multiple of 8 bytes. + size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8) + matcher := linux.KernelXTEntryMatch{ + XTEntryMatch: linux.XTEntryMatch{ + MatchSize: uint16(size), + }, + Data: data, + } + copy(matcher.Name[:], name) + + buf := make([]byte, 0, size) + buf = binary.Marshal(buf, usermem.ByteOrder, matcher) + return append(buf, make([]byte, size-len(buf))...) +} + +func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) { + matchMaker, ok := matchMarshalers[match.Name.String()] + if !ok { + return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) + } + return matchMaker.unmarshal(buf, filter) +} + +// alignUp rounds a length up to an alignment. align must be a power of 2. +func alignUp(length int, align uint) int { + return (length + int(align) - 1) & ^(int(align) - 1) +} diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 3dda6c7a1..8f14643b0 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,6 +17,7 @@ package netfilter import ( + "errors" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -34,10 +35,6 @@ import ( // shouldn't be reached - an error has occurred if we fall through to one. const errorTargetName = "ERROR" -const ( - matcherNameUDP = "udp" -) - // Metadata is used to verify that we are correctly serializing and // deserializing iptables into structs consumable by the iptables tool. We save // a metadata struct when the tables are written, and when they are read out we @@ -68,7 +65,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT // Find the appropriate table. table, err := findTable(stack, info.Name) if err != nil { - return linux.IPTGetinfo{}, err + nflog("%v", err) + return linux.IPTGetinfo{}, syserr.ErrInvalidArgument } // Get the hooks that apply to this table. @@ -95,39 +93,40 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen // Read in the struct and table name. var userEntries linux.IPTGetEntries if _, err := t.CopyIn(outPtr, &userEntries); err != nil { - log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name) + nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } // Find the appropriate table. table, err := findTable(stack, userEntries.Name) if err != nil { - log.Warningf("netfilter: couldn't find table %q", userEntries.Name) - return linux.KernelIPTGetEntries{}, err + nflog("%v", err) + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } // Convert netstack's iptables rules to something that the iptables // tool can understand. entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table) if err != nil { - return linux.KernelIPTGetEntries{}, err + nflog("couldn't read entries: %v", err) + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } if meta != table.Metadata().(metadata) { panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta)) } if binary.Size(entries) > uintptr(outLen) { - log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen)) + nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } return entries, nil } -func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, *syserr.Error) { +func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) { ipt := stack.IPTables() table, ok := ipt.Tables[tablename.String()] if !ok { - return iptables.Table{}, syserr.ErrInvalidArgument + return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename) } return table, nil } @@ -151,19 +150,19 @@ func FillDefaultIPTables(stack *stack.Stack) { stack.SetIPTables(ipt) } +// TODO: Return proto. // convertNetstackToBinary converts the iptables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a bit, reading some offsets, // jumping to those offsets, parsing again, etc. -func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) { +func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) { // Return values. var entries linux.KernelIPTGetEntries var meta metadata // The table name has to fit in the struct. if linux.XT_TABLE_MAXNAMELEN < len(tablename) { - log.Warningf("Table name %q too long.", tablename) - return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument + return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("Table name %q too long.", tablename) } copy(entries.Name[:], tablename) @@ -229,46 +228,6 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern return entries, meta, nil } -func marshalMatcher(matcher iptables.Matcher) []byte { - switch m := matcher.(type) { - case *iptables.UDPMatcher: - return marshalUDPMatcher(m) - default: - // TODO(gvisor.dev/issue/170): Support other matchers. - panic(fmt.Errorf("unknown matcher of type %T", matcher)) - } -} - -func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte { - nflog("convert to binary: marshalling UDP matcher: %+v", matcher) - - // We have to pad this struct size to a multiple of 8 bytes. - size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8) - - linuxMatcher := linux.KernelXTEntryMatch{ - XTEntryMatch: linux.XTEntryMatch{ - MatchSize: uint16(size), - }, - Data: make([]byte, 0, linux.SizeOfXTUDP), - } - copy(linuxMatcher.Name[:], matcherNameUDP) - - xtudp := linux.XTUDP{ - SourcePortStart: matcher.Data.SourcePortStart, - SourcePortEnd: matcher.Data.SourcePortEnd, - DestinationPortStart: matcher.Data.DestinationPortStart, - DestinationPortEnd: matcher.Data.DestinationPortEnd, - InverseFlags: matcher.Data.InverseFlags, - } - linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp) - - buf := make([]byte, 0, size) - buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher) - buf = append(buf, make([]byte, size-len(buf))...) - nflog("convert to binary: marshalled UDP matcher into %v", buf) - return buf[:] -} - func marshalTarget(target iptables.Target) []byte { switch target.(type) { case iptables.UnconditionalAcceptTarget: @@ -332,7 +291,7 @@ func translateFromStandardVerdict(verdict iptables.Verdict) int32 { // translateToStandardVerdict translates from the value in a // linux.XTStandardTarget to an iptables.Verdict. -func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) { +func translateToStandardVerdict(val int32) (iptables.Verdict, error) { // TODO(gvisor.dev/issue/170): Support other verdicts. switch val { case -linux.NF_ACCEPT - 1: @@ -340,13 +299,12 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) { case -linux.NF_DROP - 1: return iptables.Drop, nil case -linux.NF_QUEUE - 1: - log.Warningf("Unsupported iptables verdict QUEUE.") + return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE") case linux.NF_RETURN: - log.Warningf("Unsupported iptables verdict RETURN.") + return iptables.Invalid, errors.New("unsupported iptables verdict RETURN") default: - log.Warningf("Unknown iptables verdict %d.", val) + return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d.", val) } - return iptables.Invalid, syserr.ErrInvalidArgument } // SetEntries sets iptables rules for a single table. See @@ -354,7 +312,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) { func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // Get the basic rules data (struct ipt_replace). if len(optVal) < linux.SizeOfIPTReplace { - log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal)) + nflog("optVal has insufficient size for replace %d", len(optVal)) return syserr.ErrInvalidArgument } var replace linux.IPTReplace @@ -368,7 +326,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { case iptables.TablenameFilter: table = iptables.EmptyFilterTable() default: - log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) + nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) return syserr.ErrInvalidArgument } @@ -382,7 +340,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // Get the struct ipt_entry. if len(optVal) < linux.SizeOfIPTEntry { - log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal)) + nflog("optVal has insufficient size for entry %d", len(optVal)) return syserr.ErrInvalidArgument } var entry linux.IPTEntry @@ -392,7 +350,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { optVal = optVal[linux.SizeOfIPTEntry:] if entry.TargetOffset < linux.SizeOfIPTEntry { - log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset) + nflog("entry has too-small target offset %d", entry.TargetOffset) return syserr.ErrInvalidArgument } @@ -400,7 +358,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // filtering fields. filter, err := filterFromIPTIP(entry.IP) if err != nil { - return err + nflog("bad iptip: %v", err) + return syserr.ErrInvalidArgument } // TODO(gvisor.dev/issue/170): Matchers and targets can specify @@ -408,25 +367,26 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry if len(optVal) < int(matchersSize) { - log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return syserr.ErrInvalidArgument } matchers, err := parseMatchers(filter, optVal[:matchersSize]) if err != nil { - log.Warningf("netfilter: failed to parse matchers: %v", err) - return err + nflog("failed to parse matchers: %v", err) + return syserr.ErrInvalidArgument } optVal = optVal[matchersSize:] // Get the target of the rule. targetSize := entry.NextOffset - entry.TargetOffset if len(optVal) < int(targetSize) { - log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) return syserr.ErrInvalidArgument } target, err := parseTarget(optVal[:targetSize]) if err != nil { - return err + nflog("failed to parse target: %v", err) + return syserr.ErrInvalidArgument } optVal = optVal[targetSize:] @@ -439,7 +399,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { offset += uint32(entry.NextOffset) if initialOptValLen-len(optVal) != int(entry.NextOffset) { - log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) } } @@ -457,11 +417,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { } } if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset { - log.Warningf("Hook %v is unset.", hk) + nflog("hook %v is unset.", hk) return syserr.ErrInvalidArgument } if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset { - log.Warningf("Underflow %v is unset.", hk) + nflog("underflow %v is unset.", hk) return syserr.ErrInvalidArgument } } @@ -473,7 +433,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { for hook, ruleIdx := range table.BuiltinChains { if hook != iptables.Input { if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok { - log.Warningf("Hook %d is unsupported.", hook) + nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } } @@ -499,7 +459,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. -func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) { +func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) { nflog("set entries: parsing matchers of size %d", len(optVal)) var matchers []iptables.Matcher for len(optVal) > 0 { @@ -507,8 +467,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma // Get the XTEntryMatch. if len(optVal) < linux.SizeOfXTEntryMatch { - log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal)) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal)) } var match linux.XTEntryMatch buf := optVal[:linux.SizeOfXTEntryMatch] @@ -517,45 +476,18 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { - log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) - return nil, syserr.ErrInvalidArgument + + return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) } if len(optVal) < int(match.MatchSize) { - log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal)) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal)) } - buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize] - var matcher iptables.Matcher - var err error - switch match.Name.String() { - case matcherNameUDP: - if len(buf) < linux.SizeOfXTUDP { - log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal)) - return nil, syserr.ErrInvalidArgument - } - // For alignment reasons, the match's total size may - // exceed what's strictly necessary to hold matchData. - var matchData linux.XTUDP - binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) - log.Infof("parseMatchers: parsed XTUDP: %+v", matchData) - matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{ - SourcePortStart: matchData.SourcePortStart, - SourcePortEnd: matchData.SourcePortEnd, - DestinationPortStart: matchData.DestinationPortStart, - DestinationPortEnd: matchData.DestinationPortEnd, - InverseFlags: matchData.InverseFlags, - }) - if err != nil { - log.Warningf("netfilter: failed to create UDP matcher: %v", err) - return nil, syserr.ErrInvalidArgument - } - - default: - log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String()) - return nil, syserr.ErrInvalidArgument + // Parse the specific matcher. + matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) + if err != nil { + return nil, fmt.Errorf("failed to create matcher: %v", err) } - matchers = append(matchers, matcher) // TODO(gvisor.dev/issue/170): Check the revision field. @@ -563,8 +495,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma } if len(optVal) != 0 { - log.Warningf("netfilter: optVal should be exhausted after parsing matchers") - return nil, syserr.ErrInvalidArgument + return nil, errors.New("optVal should be exhausted after parsing matchers") } return matchers, nil @@ -572,11 +503,10 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma // parseTarget parses a target from optVal. optVal should contain only the // target. -func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { +func parseTarget(optVal []byte) (iptables.Target, error) { nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { - log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal)) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal)) } var target linux.XTEntryTarget buf := optVal[:linux.SizeOfXTEntryTarget] @@ -585,8 +515,7 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { case "": // Standard target. if len(optVal) != linux.SizeOfXTStandardTarget { - log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal)) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal)) } var standardTarget linux.XTStandardTarget buf = optVal[:linux.SizeOfXTStandardTarget] @@ -602,15 +531,13 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { case iptables.Drop: return iptables.UnconditionalDropTarget{}, nil default: - log.Warningf("Unknown verdict: %v", verdict) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("Unknown verdict: %v", verdict) } case errorTargetName: // Error target. if len(optVal) != linux.SizeOfXTErrorTarget { - log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal)) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal)) } var errorTarget linux.XTErrorTarget buf = optVal[:linux.SizeOfXTErrorTarget] @@ -627,20 +554,17 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) { case errorTargetName: return iptables.ErrorTarget{}, nil default: - log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String()) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String()) } } // Unknown target. - log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) - return nil, syserr.ErrInvalidArgument + return nil, fmt.Errorf("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) } -func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) { +func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) { if containsUnsupportedFields(iptip) { - log.Warningf("netfilter: unsupported fields in struct iptip: %+v", iptip) - return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument + return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) } return iptables.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), @@ -678,8 +602,3 @@ func hookFromLinux(hook int) iptables.Hook { } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } - -// alignUp rounds a length up to an alignment. align must be a power of 2. -func alignUp(length int, align uint) int { - return (length + int(align) - 1) & ^(int(align) - 1) -} diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go new file mode 100644 index 000000000..1646d22f7 --- /dev/null +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameTCP = "tcp" + +func init() { + registerMatchMarshaler(tcpMarshaler{}) +} + +// tcpMarshaler implements matchMarshaler for TCP matching. +type tcpMarshaler struct{} + +// name implements matchMarshaler.name. +func (tcpMarshaler) name() string { + return matcherNameTCP +} + +// marshal implements matchMarshaler.marshal. +func (tcpMarshaler) marshal(mr iptables.Matcher) []byte { + matcher := mr.(*TCPMatcher) + xttcp := linux.XTTCP{ + SourcePortStart: matcher.sourcePortStart, + SourcePortEnd: matcher.sourcePortEnd, + DestinationPortStart: matcher.destinationPortStart, + DestinationPortEnd: matcher.destinationPortEnd, + } + buf := make([]byte, 0, linux.SizeOfXTUDP) + return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp)) +} + +// unmarshal implements matchMarshaler.unmarshal. +func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { + if len(buf) < linux.SizeOfXTTCP { + return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. + var matchData linux.XTTCP + binary.Unmarshal(buf[:linux.SizeOfXTTCP], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed XTTCP: %+v", matchData) + + if matchData.Option != 0 || + matchData.FlagMask != 0 || + matchData.FlagCompare != 0 || + matchData.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported TCP matcher flags set") + } + + if filter.Protocol != header.TCPProtocolNumber { + return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber) + } + + return &TCPMatcher{ + sourcePortStart: matchData.SourcePortStart, + sourcePortEnd: matchData.SourcePortEnd, + destinationPortStart: matchData.DestinationPortStart, + destinationPortEnd: matchData.DestinationPortEnd, + }, nil +} + +// TCPMatcher matches TCP packets and their headers. It implements Matcher. +type TCPMatcher struct { + sourcePortStart uint16 + sourcePortEnd uint16 + destinationPortStart uint16 + destinationPortEnd uint16 +} + +// Name implements Matcher.Name. +func (*TCPMatcher) Name() string { + return matcherNameTCP +} + +// Match implements Matcher.Match. +func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } + + // Now we need the transport header. However, this may not have been set + // yet. + // TODO(gvisor.dev/issue/170): Parsing the transport header should + // ultimately be moved into the iptables.Check codepath as matchers are + // added. + var tcpHeader header.TCP + if pkt.TransportHeader != nil { + tcpHeader = header.TCP(pkt.TransportHeader) + } else { + // The TCP header hasn't been parsed yet. We have to do it here. + if len(pkt.Data.First()) < header.TCPMinimumSize { + // There's no valid TCP header here, so we hotdrop the + // packet. + return false, true + } + tcpHeader = header.TCP(pkt.Data.First()) + } + + // Check whether the source and destination ports are within the + // matching range. + if sourcePort := tcpHeader.SourcePort(); sourcePort < tm.sourcePortStart || tm.sourcePortEnd < sourcePort { + return false, false + } + if destinationPort := tcpHeader.DestinationPort(); destinationPort < tm.destinationPortStart || tm.destinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go new file mode 100644 index 000000000..b6e95bbc5 --- /dev/null +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -0,0 +1,142 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameUDP = "udp" + +func init() { + registerMatchMarshaler(udpMarshaler{}) +} + +// udpMarshaler implements matchMarshaler for UDP matching. +type udpMarshaler struct{} + +// name implements matchMarshaler.name. +func (udpMarshaler) name() string { + return matcherNameUDP +} + +// marshal implements matchMarshaler.marshal. +func (udpMarshaler) marshal(mr iptables.Matcher) []byte { + matcher := mr.(*UDPMatcher) + xtudp := linux.XTUDP{ + SourcePortStart: matcher.sourcePortStart, + SourcePortEnd: matcher.sourcePortEnd, + DestinationPortStart: matcher.destinationPortStart, + DestinationPortEnd: matcher.destinationPortEnd, + } + buf := make([]byte, 0, linux.SizeOfXTUDP) + return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp)) +} + +// unmarshal implements matchMarshaler.unmarshal. +func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { + if len(buf) < linux.SizeOfXTUDP { + return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may exceed what's + // strictly necessary to hold matchData. + var matchData linux.XTUDP + binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed XTUDP: %+v", matchData) + + if matchData.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported UDP matcher inverse flags set") + } + + if filter.Protocol != header.UDPProtocolNumber { + return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) + } + + return &UDPMatcher{ + sourcePortStart: matchData.SourcePortStart, + sourcePortEnd: matchData.SourcePortEnd, + destinationPortStart: matchData.DestinationPortStart, + destinationPortEnd: matchData.DestinationPortEnd, + }, nil +} + +// UDPMatcher matches UDP packets and their headers. It implements Matcher. +type UDPMatcher struct { + sourcePortStart uint16 + sourcePortEnd uint16 + destinationPortStart uint16 + destinationPortEnd uint16 +} + +// Name implements Matcher.Name. +func (*UDPMatcher) Name() string { + return matcherNameUDP +} + +// Match implements Matcher.Match. +func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the iptables.Check codepath as matchers are added. + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } + + // Now we need the transport header. However, this may not have been set + // yet. + // TODO(gvisor.dev/issue/170): Parsing the transport header should + // ultimately be moved into the iptables.Check codepath as matchers are + // added. + var udpHeader header.UDP + if pkt.TransportHeader != nil { + udpHeader = header.UDP(pkt.TransportHeader) + } else { + // The UDP header hasn't been parsed yet. We have to do it here. + if len(pkt.Data.First()) < header.UDPMinimumSize { + // There's no valid UDP header here, so we hotdrop the + // packet. + return false, true + } + udpHeader = header.UDP(pkt.Data.First()) + } + + // Check whether the source and destination ports are within the + // matching range. + if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort { + return false, false + } + if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD index bab26580b..d1b73cfdf 100644 --- a/pkg/tcpip/iptables/BUILD +++ b/pkg/tcpip/iptables/BUILD @@ -8,7 +8,6 @@ go_library( "iptables.go", "targets.go", "types.go", - "udp_matcher.go", ], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go index 7f77802a0..d660aab04 100644 --- a/pkg/tcpip/iptables/types.go +++ b/pkg/tcpip/iptables/types.go @@ -171,6 +171,9 @@ type IPHeaderFilter struct { // A Matcher is the interface for matching packets. type Matcher interface { + // Name returns the name of the Matcher. + Name() string + // Match returns whether the packet matches and whether the packet // should be "hotdropped", i.e. dropped immediately. This is usually // used for suspicious packets. diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go deleted file mode 100644 index 3bb076f9c..000000000 --- a/pkg/tcpip/iptables/udp_matcher.go +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package iptables - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/header" -) - -// TODO(gvisor.dev/issue/170): The following per-matcher params should be -// supported: -// - Table name -// - Match size -// - User size -// - Hooks -// - Proto -// - Family - -// UDPMatcher matches UDP packets and their headers. It implements Matcher. -type UDPMatcher struct { - Data UDPMatcherParams -} - -// UDPMatcherParams are the parameters used to create a UDPMatcher. -type UDPMatcherParams struct { - SourcePortStart uint16 - SourcePortEnd uint16 - DestinationPortStart uint16 - DestinationPortEnd uint16 - InverseFlags uint8 -} - -// NewUDPMatcher returns a new instance of UDPMatcher. -func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) { - log.Infof("Adding rule with UDPMatcherParams: %+v", data) - - if data.InverseFlags != 0 { - return nil, fmt.Errorf("unsupported UDP matcher inverse flags set") - } - - if filter.Protocol != header.UDPProtocolNumber { - return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) - } - - return &UDPMatcher{Data: data}, nil -} - -// Match implements Matcher.Match. -func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader) - - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved - // into the iptables.Check codepath as matchers are added. - if netHeader.TransportProtocol() != header.UDPProtocolNumber { - return false, false - } - - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - log.Warningf("Dropping UDP packet: malicious fragmented packet.") - return false, true - } - return false, false - } - - // Now we need the transport header. However, this may not have been set - // yet. - // TODO(gvisor.dev/issue/170): Parsing the transport header should - // ultimately be moved into the iptables.Check codepath as matchers are - // added. - var udpHeader header.UDP - if pkt.TransportHeader != nil { - udpHeader = header.UDP(pkt.TransportHeader) - } else { - // The UDP header hasn't been parsed yet. We have to do it here. - if len(pkt.Data.First()) < header.UDPMinimumSize { - // There's no valid UDP header here, so we hotdrop the - // packet. - log.Warningf("Dropping UDP packet: size too small.") - return false, true - } - udpHeader = header.UDP(pkt.Data.First()) - } - - // Check whether the source and destination ports are within the - // matching range. - sourcePort := udpHeader.SourcePort() - destinationPort := udpHeader.DestinationPort() - if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort { - return false, false - } - if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort { - return false, false - } - - return true, false -} -- cgit v1.2.3 From 29ad5762e4549d961f48c65292cfdeb7256524f6 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 31 Jan 2020 13:53:58 -0800 Subject: Spelling --- pkg/sentry/socket/netfilter/extensions.go | 18 +++++++++--------- pkg/sentry/socket/netfilter/tcp_matcher.go | 10 +++++----- pkg/sentry/socket/netfilter/udp_matcher.go | 10 +++++----- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index 5a4cac84c..b5fbb52e4 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -32,8 +32,8 @@ import ( // - Proto // - Family -// matchMarshaler knows how to (un)marshal the matcher named name(). -type matchMarshaler interface { +// matchMaker knows how to (un)marshal the matcher named name(). +type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. name() string @@ -45,19 +45,19 @@ type matchMarshaler interface { unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) } -var matchMarshalers = map[string]matchMarshaler{} +var matchMakers = map[string]matchMaker{} -// registerMatchMarshaler should be called by match extensions to register them +// registermatchMaker should be called by match extensions to register them // with the netfilter package. -func registerMatchMarshaler(mm matchMarshaler) { - if _, ok := matchMarshalers[mm.name()]; ok { +func registerMatchMaker(mm matchMaker) { + if _, ok := matchMakers[mm.name()]; ok { panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name())) } - matchMarshalers[mm.name()] = mm + matchMakers[mm.name()] = mm } func marshalMatcher(matcher iptables.Matcher) []byte { - matchMaker, ok := matchMarshalers[matcher.Name()] + matchMaker, ok := matchMakers[matcher.Name()] if !ok { panic(fmt.Errorf("Unknown matcher of type %T.", matcher)) } @@ -85,7 +85,7 @@ func marshalEntryMatch(name string, data []byte) []byte { } func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) { - matchMaker, ok := matchMarshalers[match.Name.String()] + matchMaker, ok := matchMakers[match.Name.String()] if !ok { return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) } diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 1646d22f7..6b2f4c31a 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -28,18 +28,18 @@ import ( const matcherNameTCP = "tcp" func init() { - registerMatchMarshaler(tcpMarshaler{}) + registerMatchMaker(tcpMarshaler{}) } -// tcpMarshaler implements matchMarshaler for TCP matching. +// tcpMarshaler implements matchMaker for TCP matching. type tcpMarshaler struct{} -// name implements matchMarshaler.name. +// name implements matchMaker.name. func (tcpMarshaler) name() string { return matcherNameTCP } -// marshal implements matchMarshaler.marshal. +// marshal implements matchMaker.marshal. func (tcpMarshaler) marshal(mr iptables.Matcher) []byte { matcher := mr.(*TCPMatcher) xttcp := linux.XTTCP{ @@ -52,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte { return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp)) } -// unmarshal implements matchMarshaler.unmarshal. +// unmarshal implements matchMaker.unmarshal. func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { if len(buf) < linux.SizeOfXTTCP { return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index b6e95bbc5..86aa11696 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -28,18 +28,18 @@ import ( const matcherNameUDP = "udp" func init() { - registerMatchMarshaler(udpMarshaler{}) + registerMatchMaker(udpMarshaler{}) } -// udpMarshaler implements matchMarshaler for UDP matching. +// udpMarshaler implements matchMaker for UDP matching. type udpMarshaler struct{} -// name implements matchMarshaler.name. +// name implements matchMaker.name. func (udpMarshaler) name() string { return matcherNameUDP } -// marshal implements matchMarshaler.marshal. +// marshal implements matchMaker.marshal. func (udpMarshaler) marshal(mr iptables.Matcher) []byte { matcher := mr.(*UDPMatcher) xtudp := linux.XTUDP{ @@ -52,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte { return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp)) } -// unmarshal implements matchMarshaler.unmarshal. +// unmarshal implements matchMaker.unmarshal. func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { if len(buf) < linux.SizeOfXTUDP { return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) -- cgit v1.2.3 From 77bf586db75b3dbd9dcb14c349bde8372d26425c Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Fri, 31 Jan 2020 13:54:57 -0800 Subject: Use multicast Ethernet address for multicast NDP As per RFC 2464 section 7, an IPv6 packet with a multicast destination address is transmitted to the mapped Ethernet multicast address. Test: - ipv6.TestLinkResolution - stack_test.TestDADResolve - stack_test.TestRouterSolicitation PiperOrigin-RevId: 292610529 --- pkg/tcpip/header/ipv6_test.go | 29 ++++++++++++++++++++++ pkg/tcpip/link/channel/channel.go | 29 ++++++++++++++-------- pkg/tcpip/network/ipv6/icmp.go | 6 ++++- pkg/tcpip/network/ipv6/icmp_test.go | 12 ++++++--- pkg/tcpip/stack/ndp.go | 17 +++++++++++++ pkg/tcpip/stack/ndp_test.go | 16 +++++++++++- pkg/tcpip/stack/route.go | 4 ++- pkg/tcpip/transport/tcp/testing/context/context.go | 6 ++++- 8 files changed, 101 insertions(+), 18 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go index 29f54bc57..c3ad503aa 100644 --- a/pkg/tcpip/header/ipv6_test.go +++ b/pkg/tcpip/header/ipv6_test.go @@ -17,6 +17,7 @@ package header_test import ( "bytes" "crypto/sha256" + "fmt" "testing" "github.com/google/go-cmp/cmp" @@ -300,3 +301,31 @@ func TestScopeForIPv6Address(t *testing.T) { }) } } + +func TestSolicitedNodeAddr(t *testing.T) { + tests := []struct { + addr tcpip.Address + want tcpip.Address + }{ + { + addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\xa0", + want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0", + }, + { + addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x0e\x0f\xa0", + want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0", + }, + { + addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x01\x02\x03", + want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x01\x02\x03", + }, + } + + for _, test := range tests { + t.Run(fmt.Sprintf("%s", test.addr), func(t *testing.T) { + if got := header.SolicitedNodeAddr(test.addr); got != test.want { + t.Fatalf("got header.SolicitedNodeAddr(%s) = %s, want = %s", test.addr, got, test.want) + } + }) + } +} diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index 71b9da797..78d447acd 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -30,15 +30,16 @@ type PacketInfo struct { Pkt tcpip.PacketBuffer Proto tcpip.NetworkProtocolNumber GSO *stack.GSO + Route stack.Route } // Endpoint is link layer endpoint that stores outbound packets in a channel // and allows injection of inbound packets. type Endpoint struct { - dispatcher stack.NetworkDispatcher - mtu uint32 - linkAddr tcpip.LinkAddress - GSO bool + dispatcher stack.NetworkDispatcher + mtu uint32 + linkAddr tcpip.LinkAddress + LinkEPCapabilities stack.LinkEndpointCapabilities // c is where outbound packets are queued. c chan PacketInfo @@ -122,11 +123,7 @@ func (e *Endpoint) MTU() uint32 { // Capabilities implements stack.LinkEndpoint.Capabilities. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { - caps := stack.LinkEndpointCapabilities(0) - if e.GSO { - caps |= stack.CapabilityHardwareGSO - } - return caps + return e.LinkEPCapabilities } // GSOMaxSize returns the maximum GSO packet size. @@ -146,11 +143,16 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress { } // WritePacket stores outbound packets into the channel. -func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error { +func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error { + // Clone r then release its resource so we only get the relevant fields from + // stack.Route without holding a reference to a NIC's endpoint. + route := r.Clone() + route.Release() p := PacketInfo{ Pkt: pkt, Proto: protocol, GSO: gso, + Route: route, } select { @@ -162,7 +164,11 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.Ne } // WritePackets stores outbound packets into the channel. -func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + // Clone r then release its resource so we only get the relevant fields from + // stack.Route without holding a reference to a NIC's endpoint. + route := r.Clone() + route.Release() payloadView := pkts[0].Data.ToView() n := 0 packetLoop: @@ -176,6 +182,7 @@ packetLoop: }, Proto: protocol, GSO: gso, + Route: route, } select { diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 7491cfc41..60817d36d 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -408,10 +408,14 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { // LinkAddressRequest implements stack.LinkAddressResolver. func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { snaddr := header.SolicitedNodeAddr(addr) + + // TODO(b/148672031): Use stack.FindRoute instead of manually creating the + // route here. Note, we would need the nicID to do this properly so the right + // NIC (associated to linkEP) is used to send the NDP NS message. r := &stack.Route{ LocalAddress: localAddr, RemoteAddress: snaddr, - RemoteLinkAddress: broadcastMAC, + RemoteLinkAddress: header.EthernetAddressFromMulticastIPv6Address(snaddr), } hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 7a6820643..d0e930e20 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -270,8 +270,9 @@ func (c *testContext) cleanup() { } type routeArgs struct { - src, dst *channel.Endpoint - typ header.ICMPv6Type + src, dst *channel.Endpoint + typ header.ICMPv6Type + remoteLinkAddr tcpip.LinkAddress } func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.ICMPv6)) { @@ -292,6 +293,11 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header. t.Errorf("unexpected protocol number %d", pi.Proto) return } + + if len(args.remoteLinkAddr) != 0 && args.remoteLinkAddr != pi.Route.RemoteLinkAddress { + t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr) + } + ipv6 := header.IPv6(pi.Pkt.Header.View()) transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader()) if transProto != header.ICMPv6ProtocolNumber { @@ -339,7 +345,7 @@ func TestLinkResolution(t *testing.T) { t.Fatalf("ep.Write(_) = _, , %s, want = _, , tcpip.ErrNoLinkAddress", err) } for _, args := range []routeArgs{ - {src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit}, + {src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))}, {src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6NeighborAdvert}, } { routeICMPv6Packet(t, args, func(t *testing.T, icmpv6 header.ICMPv6) { diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 31294345d..6123fda33 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -538,6 +538,14 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error { r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false) defer r.Release() + // Route should resolve immediately since snmc is a multicast address so a + // remote link address can be calculated without a resolution process. + if c, err := r.Resolve(nil); err != nil { + log.Fatalf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err) + } else if c != nil { + log.Fatalf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID()) + } + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize) pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize)) pkt.SetType(header.ICMPv6NeighborSolicit) @@ -1197,6 +1205,15 @@ func (ndp *ndpState) startSolicitingRouters() { r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false) defer r.Release() + // Route should resolve immediately since + // header.IPv6AllRoutersMulticastAddress is a multicast address so a + // remote link address can be calculated without a resolution process. + if c, err := r.Resolve(nil); err != nil { + log.Fatalf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err) + } else if c != nil { + log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()) + } + payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize) pkt := header.ICMPv6(hdr.Prepend(payloadSize)) diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index bc7cfbcb4..8af8565f7 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -336,6 +336,7 @@ func TestDADResolve(t *testing.T) { opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired s := stack.New(opts) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -413,6 +414,12 @@ func TestDADResolve(t *testing.T) { t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber) } + // Make sure the right remote link address is used. + snmc := header.SolicitedNodeAddr(addr1) + if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want { + t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want) + } + // Check NDP NS packet. // // As per RFC 4861 section 4.3, a possible option is the Source Link @@ -420,7 +427,7 @@ func TestDADResolve(t *testing.T) { // address of the packet is the unspecified address. checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(), checker.SrcAddr(header.IPv6Any), - checker.DstAddr(header.SolicitedNodeAddr(addr1)), + checker.DstAddr(snmc), checker.TTL(header.NDPHopLimit), checker.NDPNS( checker.NDPNSTargetAddress(addr1), @@ -3292,6 +3299,7 @@ func TestRouterSolicitation(t *testing.T) { t.Run(test.name, func(t *testing.T) { t.Parallel() e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired waitForPkt := func(timeout time.Duration) { t.Helper() ctx, _ := context.WithTimeout(context.Background(), timeout) @@ -3304,6 +3312,12 @@ func TestRouterSolicitation(t *testing.T) { if p.Proto != header.IPv6ProtocolNumber { t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber) } + + // Make sure the right remote link address is used. + if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want { + t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want) + } + checker.IPv6(t, p.Pkt.Header.View(), checker.SrcAddr(header.IPv6Any), diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 517f4b941..f565aafb2 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -225,7 +225,9 @@ func (r *Route) Release() { // Clone Clone a route such that the original one can be released and the new // one will remain valid. func (r *Route) Clone() Route { - r.ref.incRef() + if r.ref != nil { + r.ref.incRef() + } return *r } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 730ac4292..1e9a0dea3 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -1082,7 +1082,11 @@ func (c *Context) SACKEnabled() bool { // SetGSOEnabled enables or disables generic segmentation offload. func (c *Context) SetGSOEnabled(enable bool) { - c.linkEP.GSO = enable + if enable { + c.linkEP.LinkEPCapabilities |= stack.CapabilityHardwareGSO + } else { + c.linkEP.LinkEPCapabilities &^= stack.CapabilityHardwareGSO + } } // MSSWithoutOptions returns the value for the MSS used by the stack when no -- cgit v1.2.3 From 6c3072243dfbf70062de5f610e14fd6ed2ce5f32 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 31 Jan 2020 14:14:52 -0800 Subject: Implement file locks for regular tmpfs files in VFSv2. Add a file lock implementation that can be embedded into various filesystem implementations. Updates #1480 PiperOrigin-RevId: 292614758 --- pkg/sentry/fsimpl/tmpfs/BUILD | 3 ++ pkg/sentry/fsimpl/tmpfs/regular_file.go | 23 +++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 56 ++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 43 +++++++++++++++++ pkg/sentry/vfs/lock/BUILD | 13 +++++ pkg/sentry/vfs/lock/lock.go | 72 ++++++++++++++++++++++++++++ 6 files changed, 210 insertions(+) create mode 100644 pkg/sentry/vfs/lock/BUILD create mode 100644 pkg/sentry/vfs/lock/lock.go (limited to 'pkg') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index c61366224..57abd5583 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", @@ -47,6 +48,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -86,6 +88,7 @@ go_test( "//pkg/context", "//pkg/fspath", "//pkg/sentry/contexttest", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index e9e6faf67..dab346a41 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -192,6 +193,28 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { return nil } +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { + return fd.inode().lockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { + fd.inode().unlockBSD(uid) + return nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { + return fd.inode().lockPOSIX(uid, t, rng, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { + fd.inode().unlockPOSIX(uid, rng) + return nil +} + // regularFileReadWriter implements safemem.Reader and Safemem.Writer. type regularFileReadWriter struct { file *regularFile diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 32552e261..2b52992ea 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -24,9 +24,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -260,6 +262,60 @@ func TestPWrite(t *testing.T) { } } +func TestLocks(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + var ( + uid1 lock.UniqueID + uid2 lock.UniqueID + // Non-blocking. + block lock.Blocker + ) + + uid1 = 123 + uid2 = 456 + + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { + t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + + rng1 := lock.LockRange{0, 1} + rng2 := lock.LockRange{1, 2} + + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil { + t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) + } +} + func TestPRead(t *testing.T) { ctx := contexttest.Context(t) fd, cleanup, err := newFileFD(ctx, 0644) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 88dbd6e35..2108d0f4d 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -30,10 +30,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -153,6 +155,9 @@ type inode struct { rdevMajor uint32 rdevMinor uint32 + // Advisory file locks, which lock at the inode level. + locks lock.FileLocks + impl interface{} // immutable } @@ -352,6 +357,44 @@ func (i *inode) setStat(stat linux.Statx) error { return nil } +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + switch i.impl.(type) { + case *regularFile: + return i.locks.LockBSD(uid, t, block) + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) unlockBSD(uid fslock.UniqueID) error { + switch i.impl.(type) { + case *regularFile: + i.locks.UnlockBSD(uid) + return nil + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + switch i.impl.(type) { + case *regularFile: + return i.locks.LockPOSIX(uid, t, rng, block) + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { + switch i.impl.(type) { + case *regularFile: + i.locks.UnlockPOSIX(uid, rng) + return nil + } + return syserror.EBADF +} + // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD new file mode 100644 index 000000000..d9ab063b7 --- /dev/null +++ b/pkg/sentry/vfs/lock/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "lock", + srcs = ["lock.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/fs/lock", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go new file mode 100644 index 000000000..724dfe743 --- /dev/null +++ b/pkg/sentry/vfs/lock/lock.go @@ -0,0 +1,72 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock provides POSIX and BSD style file locking for VFS2 file +// implementations. +// +// The actual implementations can be found in the lock package under +// sentry/fs/lock. +package lock + +import ( + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) +// and flock(2) respectively in Linux. It can be embedded into various file +// implementations for VFS2 that support locking. +// +// Note that in Linux these two types of locks are _not_ cooperative, because +// race and deadlock conditions make merging them prohibitive. We do the same +// and keep them oblivious to each other. +type FileLocks struct { + // bsd is a set of BSD-style advisory file wide locks, see flock(2). + bsd fslock.Locks + + // posix is a set of POSIX-style regional advisory locks, see fcntl(2). + posix fslock.Locks +} + +// LockBSD tries to acquire a BSD-style lock on the entire file. +func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockBSD releases a BSD-style lock on the entire file. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { + fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) +} + +// LockPOSIX tries to acquire a POSIX-style lock on a file region. +func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + if fl.posix.LockRegion(uid, t, rng, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockPOSIX releases a POSIX-style lock on a file region. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { + fl.posix.UnlockRegion(uid, rng) +} -- cgit v1.2.3 From 02997af5abd62d778fca4d01b047a6bdebab2090 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 31 Jan 2020 15:08:17 -0800 Subject: Fix method comment to match method name. PiperOrigin-RevId: 292624867 --- pkg/tcpip/transport/tcp/accept.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 6101f2945..08afb7c17 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -271,8 +271,8 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i return n, nil } -// createEndpoint creates a new endpoint in connected state and then performs -// the TCP 3-way handshake. +// createEndpointAndPerformHandshake creates a new endpoint in connected state +// and then performs the TCP 3-way handshake. func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber -- cgit v1.2.3 From f37e913a358820ea98013772dd2880cc8a3c9218 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 3 Feb 2020 16:15:16 -0800 Subject: seccomp: allow to filter syscalls by instruction pointer PiperOrigin-RevId: 293029446 --- pkg/seccomp/seccomp.go | 20 ++++++++++++++++---- pkg/seccomp/seccomp_rules.go | 6 +++++- pkg/seccomp/seccomp_test.go | 27 +++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 5 deletions(-) (limited to 'pkg') diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index fc36efa23..55fd6967e 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -219,24 +219,36 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc switch a := arg.(type) { case AllowAny: case AllowValue: + dataOffsetLow := seccompDataOffsetArgLow(i) + dataOffsetHigh := seccompDataOffsetArgHigh(i) + if i == RuleIP { + dataOffsetLow = seccompDataOffsetIPLow + dataOffsetHigh = seccompDataOffsetIPHigh + } high, low := uint32(a>>32), uint32(a) // assert arg_low == low - p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i)) + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow) p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) // assert arg_high == high - p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i)) + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh) p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) labelled = true case GreaterThan: + dataOffsetLow := seccompDataOffsetArgLow(i) + dataOffsetHigh := seccompDataOffsetArgHigh(i) + if i == RuleIP { + dataOffsetLow = seccompDataOffsetIPLow + dataOffsetHigh = seccompDataOffsetIPHigh + } labelGood := fmt.Sprintf("gt%v", i) high, low := uint32(a>>32), uint32(a) // assert arg_high < high - p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i)) + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh) p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) // arg_high > high p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood)) // arg_low < low - p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i)) + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow) p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood)) labelled = true diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 84c841d7f..06308cd29 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -62,7 +62,11 @@ func (a AllowValue) String() (s string) { // rule := Rule { // AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0 // } -type Rule [6]interface{} +type Rule [7]interface{} // 6 arguments + RIP + +// RuleIP indicates what rules in the Rule array have to be applied to +// instruction pointer. +const RuleIP = 6 func (r Rule) String() (s string) { if len(r) == 0 { diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index abbee7051..da5a5e4b2 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -388,6 +388,33 @@ func TestBasic(t *testing.T) { }, }, }, + { + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + RuleIP: AllowValue(0x7aabbccdd), + }, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, + specs: []spec{ + { + desc: "IP: Syscall instruction pointer allowed", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{}, instructionPointer: 0x7aabbccdd}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "IP: Syscall instruction pointer disallowed", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{}, instructionPointer: 0x711223344}, + want: linux.SECCOMP_RET_TRAP, + }, + }, + }, } { instrs, err := BuildProgram(test.ruleSets, test.defaultAction) if err != nil { -- cgit v1.2.3 From d7cd484091543827678f1548b8e5668a7a86e13f Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 4 Feb 2020 08:20:10 -0800 Subject: Add support for sentry internal pipe for gofer mounts Internal pipes are supported similarly to how internal UDS is done. It is also controlled by the same flag. Fixes #1102 PiperOrigin-RevId: 293150045 --- pkg/sentry/fs/gofer/BUILD | 2 + pkg/sentry/fs/gofer/cache_policy.go | 3 + pkg/sentry/fs/gofer/fifo.go | 40 ++++++++ pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/path.go | 183 +++++++++++++++++++++++------------ pkg/sentry/fs/gofer/session.go | 183 ++++++++++++++++++++++------------- pkg/sentry/fs/gofer/session_state.go | 12 +-- pkg/sentry/fs/gofer/socket.go | 8 +- test/syscalls/BUILD | 1 - 9 files changed, 293 insertions(+), 141 deletions(-) create mode 100644 pkg/sentry/fs/gofer/fifo.go (limited to 'pkg') diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 971d3718e..fea135eea 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -9,6 +9,7 @@ go_library( "cache_policy.go", "context_file.go", "device.go", + "fifo.go", "file.go", "file_state.go", "fs.go", @@ -38,6 +39,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/host", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index ebea03c42..07a564e92 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -127,6 +127,9 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child childIops, ok := child.InodeOperations.(*inodeOperations) if !ok { + if _, ok := child.InodeOperations.(*fifo); ok { + return false + } panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations)) } parentIops, ok := parent.InodeOperations.(*inodeOperations) diff --git a/pkg/sentry/fs/gofer/fifo.go b/pkg/sentry/fs/gofer/fifo.go new file mode 100644 index 000000000..456557058 --- /dev/null +++ b/pkg/sentry/fs/gofer/fifo.go @@ -0,0 +1,40 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// +stateify savable +type fifo struct { + fs.InodeOperations + fileIops *inodeOperations +} + +var _ fs.InodeOperations = (*fifo)(nil) + +// Rename implements fs.InodeOperations. It forwards the call to the underlying +// file inode to handle the file rename. Note that file key remains the same +// after the rename to keep the endpoint mapping. +func (i *fifo) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return i.fileIops.Rename(ctx, inode, oldParent, oldName, newParent, newName, replacement) +} + +// StatFS implements fs.InodeOperations. +func (i *fifo) StatFS(ctx context.Context) (fs.Info, error) { + return i.fileIops.StatFS(ctx) +} diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 0c2f89ae8..2df2fe889 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -61,7 +61,7 @@ func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context ctx := contexttest.Context(t) sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{ file: rootFile, - }, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */) + }, root.QID, p9.AttrMaskAll(), root.Attr) m := fs.NewMountSource(ctx, s, &filesystem{}, fs.MountSourceFlags{}) rootInode := fs.NewInode(ctx, rootInodeOperations, m, sattr) diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 0c1be05ef..a35c3a23d 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -23,14 +23,24 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's // encoding of strings, which uses 2 bytes for the length prefix. const maxFilenameLen = (1 << 16) - 1 +func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode { + if newType&^p9.FileModeMask != 0 { + panic(fmt.Sprintf("newType contained more bits than just file mode: %x", newType)) + } + clear := mode &^ p9.FileModeMask + return clear | newType +} + // Lookup loads an Inode at name into a Dirent based on the session's cache // policy. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { @@ -69,8 +79,25 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, err } + if i.session().overrides != nil { + // Check if file belongs to a internal named pipe. Note that it doesn't need + // to check for sockets because it's done in newInodeOperations below. + deviceKey := device.MultiDeviceKey{ + Device: p9attr.RDev, + SecondaryDevice: i.session().connID, + Inode: qids[0].Path, + } + unlock := i.session().overrides.lock() + if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil { + unlock() + pipeInode.IncRef() + return fs.NewDirent(ctx, pipeInode, name), nil + } + unlock() + } + // Construct the Inode operations. - sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false) + sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr) // Construct a positive Dirent. return fs.NewDirent(ctx, fs.NewInode(ctx, node, dir.MountSource, sattr), name), nil @@ -138,7 +165,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string qid := qids[0] // Construct the InodeOperations. - sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false) + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr) // Construct the positive Dirent. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) @@ -223,82 +250,115 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, return nil, syserror.ENAMETOOLONG } - if i.session().endpoints == nil { + if i.session().overrides == nil { return nil, syscall.EOPNOTSUPP } - // Create replaces the directory fid with the newly created/opened - // file, so clone this directory so it doesn't change out from under - // this node. - _, newFile, err := i.fileState.file.walk(ctx, nil) + // Stabilize the override map while creation is in progress. + unlock := i.session().overrides.lock() + defer unlock() + + sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket) if err != nil { return nil, err } - // We're not going to use newFile after return. - defer newFile.close(ctx) - // Stabilize the endpoint map while creation is in progress. - unlock := i.session().endpoints.lock() - defer unlock() + // Construct the positive Dirent. + childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) + i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) + return childDir, nil +} - // Create a regular file in the gofer and then mark it as a socket by - // adding this inode key in the 'endpoints' map. - owner := fs.FileOwnerFromContext(ctx) - hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) - if err != nil { - return nil, err +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG } - // We're not going to use this file. - hostFile.Close() - i.touchModificationAndStatusChangeTime(ctx, dir) + owner := fs.FileOwnerFromContext(ctx) + mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe - // Get the attributes of the file to create inode key. - qid, mask, attr, err := getattr(ctx, newFile) - if err != nil { - return nil, err + // N.B. FIFOs use major/minor numbers 0. + if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + if i.session().overrides == nil || err != syscall.EPERM { + return err + } + // If gofer doesn't support mknod, check if we can create an internal fifo. + return i.createInternalFifo(ctx, dir, name, owner, perm) } - key := device.MultiDeviceKey{ - Device: attr.RDev, - SecondaryDevice: i.session().connID, - Inode: qid.Path, + i.touchModificationAndStatusChangeTime(ctx, dir) + return nil +} + +func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error { + if i.session().overrides == nil { + return syserror.EPERM } - // Create child dirent. + // Stabilize the override map while creation is in progress. + unlock := i.session().overrides.lock() + defer unlock() - // Get an unopened p9.File for the file we created so that it can be - // cloned and re-opened multiple times after creation. - _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe) if err != nil { - return nil, err + return err } - // Construct the InodeOperations. - sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true) + // First create a pipe. + p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) + + // Wrap the fileOps with our Fifo. + iops := &fifo{ + InodeOperations: pipe.NewInodeOperations(ctx, perm, p), + fileIops: fileOps, + } + inode := fs.NewInode(ctx, iops, dir.MountSource, sattr) // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().endpoints.add(key, childDir, ep) - return childDir, nil + i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode) + return nil } -// CreateFifo implements fs.InodeOperations.CreateFifo. -func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG +// Caller must hold Session.endpoint lock. +func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions, fileType p9.FileMode) (fs.StableAttr, *inodeOperations, error) { + _, dirClone, err := i.fileState.file.walk(ctx, nil) + if err != nil { + return fs.StableAttr{}, nil, err } + // We're not going to use dirClone after return. + defer dirClone.close(ctx) + // Create a regular file in the gofer and then mark it as a socket by + // adding this inode key in the 'overrides' map. owner := fs.FileOwnerFromContext(ctx) - mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe - - // N.B. FIFOs use major/minor numbers 0. - if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - return err + hostFile, err := dirClone.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) + if err != nil { + return fs.StableAttr{}, nil, err } + // We're not going to use this file. + hostFile.Close() i.touchModificationAndStatusChangeTime(ctx, dir) - return nil + + // Get the attributes of the file to create inode key. + qid, mask, attr, err := getattr(ctx, dirClone) + if err != nil { + return fs.StableAttr{}, nil, err + } + + // Get an unopened p9.File for the file we created so that it can be + // cloned and re-opened multiple times after creation. + _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + if err != nil { + return fs.StableAttr{}, nil, err + } + + // Construct new inode with file type overridden. + attr.Mode = changeType(attr.Mode, fileType) + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr) + return sattr, iops, nil } // Remove implements InodeOperations.Remove. @@ -307,20 +367,23 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return syserror.ENAMETOOLONG } - var key device.MultiDeviceKey - removeSocket := false - if i.session().endpoints != nil { - // Find out if file being deleted is a socket that needs to be + var key *device.MultiDeviceKey + if i.session().overrides != nil { + // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { defer d.DecRef() - if fs.IsSocket(d.Inode.StableAttr) { - child := d.Inode.InodeOperations.(*inodeOperations) - key = child.fileState.key - removeSocket = true - // Stabilize the endpoint map while deletion is in progress. - unlock := i.session().endpoints.lock() + if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) { + switch iops := d.Inode.InodeOperations.(type) { + case *inodeOperations: + key = &iops.fileState.key + case *fifo: + key = &iops.fileIops.fileState.key + } + + // Stabilize the override map while deletion is in progress. + unlock := i.session().overrides.lock() defer unlock() } } @@ -329,8 +392,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil { return err } - if removeSocket { - i.session().endpoints.remove(key) + if key != nil { + i.session().overrides.remove(*key) } i.touchModificationAndStatusChangeTime(ctx, dir) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 498c4645a..f6b3ef178 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -33,60 +33,107 @@ import ( var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize // +stateify savable -type endpointMaps struct { - // mu protexts the direntMap, the keyMap, and the pathMap below. - mu sync.RWMutex `state:"nosave"` +type overrideInfo struct { + dirent *fs.Dirent + + // endpoint is set when dirent points to a socket. inode must not be set. + endpoint transport.BoundEndpoint + + // inode is set when dirent points to a pipe. endpoint must not be set. + inode *fs.Inode +} - // direntMap links sockets to their dirents. - // It is filled concurrently with the keyMap and is stored upon save. - // Before saving, this map is used to populate the pathMap. - direntMap map[transport.BoundEndpoint]*fs.Dirent +func (l *overrideInfo) inodeType() fs.InodeType { + switch { + case l.endpoint != nil: + return fs.Socket + case l.inode != nil: + return fs.Pipe + } + panic("endpoint or node must be set") +} - // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets. +// +stateify savable +type overrideMaps struct { + // mu protexts the keyMap, and the pathMap below. + mu sync.RWMutex `state:"nosave"` + + // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets/pipes. // It is not stored during save because the inode ID may change upon restore. - keyMap map[device.MultiDeviceKey]transport.BoundEndpoint `state:"nosave"` + keyMap map[device.MultiDeviceKey]*overrideInfo `state:"nosave"` - // pathMap links the sockets to their paths. + // pathMap links the sockets/pipes to their paths. // It is filled before saving from the direntMap and is stored upon save. // Upon restore, this map is used to re-populate the keyMap. - pathMap map[transport.BoundEndpoint]string + pathMap map[*overrideInfo]string +} + +// addBoundEndpoint adds the bound endpoint to the map. +// A reference is taken on the dirent argument. +// +// Precondition: maps must have been locked with 'lock'. +func (e *overrideMaps) addBoundEndpoint(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) { + d.IncRef() + e.keyMap[key] = &overrideInfo{dirent: d, endpoint: ep} } -// add adds the endpoint to the maps. +// addPipe adds the pipe inode to the map. // A reference is taken on the dirent argument. // // Precondition: maps must have been locked with 'lock'. -func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) { - e.keyMap[key] = ep +func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *fs.Inode) { d.IncRef() - e.direntMap[ep] = d + e.keyMap[key] = &overrideInfo{dirent: d, inode: inode} } // remove deletes the key from the maps. // // Precondition: maps must have been locked with 'lock'. -func (e *endpointMaps) remove(key device.MultiDeviceKey) { - endpoint := e.get(key) +func (e *overrideMaps) remove(key device.MultiDeviceKey) { + endpoint := e.keyMap[key] delete(e.keyMap, key) - - d := e.direntMap[endpoint] - d.DecRef() - delete(e.direntMap, endpoint) + endpoint.dirent.DecRef() } // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *endpointMaps) lock() func() { +func (e *overrideMaps) lock() func() { e.mu.Lock() return func() { e.mu.Unlock() } } -// get returns the endpoint mapped to the given key. +// getBoundEndpoint returns the bound endpoint mapped to the given key. // -// Precondition: maps must have been locked for reading. -func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint { - return e.keyMap[key] +// Precondition: maps must have been locked. +func (e *overrideMaps) getBoundEndpoint(key device.MultiDeviceKey) transport.BoundEndpoint { + if v := e.keyMap[key]; v != nil { + return v.endpoint + } + return nil +} + +// getPipe returns the pipe inode mapped to the given key. +// +// Precondition: maps must have been locked. +func (e *overrideMaps) getPipe(key device.MultiDeviceKey) *fs.Inode { + if v := e.keyMap[key]; v != nil { + return v.inode + } + return nil +} + +// getType returns the inode type if there is a corresponding endpoint for the +// given key. Returns false otherwise. +func (e *overrideMaps) getType(key device.MultiDeviceKey) (fs.InodeType, bool) { + e.mu.Lock() + v := e.keyMap[key] + e.mu.Unlock() + + if v != nil { + return v.inodeType(), true + } + return 0, false } // session holds state for each 9p session established during sys_mount. @@ -137,16 +184,16 @@ type session struct { // mounter is the EUID/EGID that mounted this file system. mounter fs.FileOwner `state:"wait"` - // endpoints is used to map inodes that represent socket files to their - // corresponding endpoint. Socket files are created as regular files in the - // gofer and their presence in this map indicate that they should indeed be - // socket files. This allows unix domain sockets to be used with paths that - // belong to a gofer. + // overrides is used to map inodes that represent socket/pipes files to their + // corresponding endpoint/iops. These files are created as regular files in + // the gofer and their presence in this map indicate that they should indeed + // be socket/pipe files. This allows unix domain sockets and named pipes to + // be used with paths that belong to a gofer. // // TODO(gvisor.dev/issue/1200): there are few possible races with someone // stat'ing the file and another deleting it concurrently, where the file // will not be reported as socket file. - endpoints *endpointMaps `state:"wait"` + overrides *overrideMaps `state:"wait"` } // Destroy tears down the session. @@ -179,15 +226,21 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) { // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, // because overlay copyUp may have changed them out from under us. // So much for "immutable". - sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr - s.inodeMappings[sattr.InodeID] = path + switch iops := inode.InodeOperations.(type) { + case *inodeOperations: + s.inodeMappings[iops.fileState.sattr.InodeID] = path + case *fifo: + s.inodeMappings[iops.fileIops.fileState.sattr.InodeID] = path + default: + panic(fmt.Sprintf("Invalid type: %T", iops)) + } } -// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes -// (p9.QID, p9.AttrMask, p9.Attr). +// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File +// and attributes (p9.QID, p9.AttrMask, p9.Attr). // // Endpoints lock must not be held if socket == false. -func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) { +func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) { deviceKey := device.MultiDeviceKey{ Device: attr.RDev, SecondaryDevice: s.connID, @@ -201,17 +254,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p BlockSize: bsize(attr), } - if s.endpoints != nil { - if socket { - sattr.Type = fs.Socket - } else { - // If unix sockets are allowed on this filesystem, check if this file is - // supposed to be a socket file. - unlock := s.endpoints.lock() - if s.endpoints.get(deviceKey) != nil { - sattr.Type = fs.Socket - } - unlock() + if s.overrides != nil && sattr.Type == fs.RegularFile { + // If overrides are allowed on this filesystem, check if this file is + // supposed to be of a different type, e.g. socket. + if t, ok := s.overrides.getType(deviceKey); ok { + sattr.Type = t } } @@ -267,7 +314,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF s.EnableLeakCheck("gofer.session") if o.privateunixsocket { - s.endpoints = newEndpointMaps() + s.overrides = newOverrideMaps() } // Construct the MountSource with the session and superBlockFlags. @@ -305,26 +352,24 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false) + sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr) return fs.NewInode(ctx, iops, m, sattr), nil } -// newEndpointMaps creates a new endpointMaps. -func newEndpointMaps() *endpointMaps { - return &endpointMaps{ - direntMap: make(map[transport.BoundEndpoint]*fs.Dirent), - keyMap: make(map[device.MultiDeviceKey]transport.BoundEndpoint), - pathMap: make(map[transport.BoundEndpoint]string), +// newOverrideMaps creates a new overrideMaps. +func newOverrideMaps() *overrideMaps { + return &overrideMaps{ + keyMap: make(map[device.MultiDeviceKey]*overrideInfo), + pathMap: make(map[*overrideInfo]string), } } -// fillKeyMap populates key and dirent maps upon restore from saved -// pathmap. +// fillKeyMap populates key and dirent maps upon restore from saved pathmap. func (s *session) fillKeyMap(ctx context.Context) error { - unlock := s.endpoints.lock() + unlock := s.overrides.lock() defer unlock() - for ep, dirPath := range s.endpoints.pathMap { + for ep, dirPath := range s.overrides.pathMap { _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) if err != nil { return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err) @@ -341,25 +386,25 @@ func (s *session) fillKeyMap(ctx context.Context) error { Inode: qid.Path, } - s.endpoints.keyMap[key] = ep + s.overrides.keyMap[key] = ep } return nil } -// fillPathMap populates paths for endpoints from dirents in direntMap +// fillPathMap populates paths for overrides from dirents in direntMap // before save. func (s *session) fillPathMap() error { - unlock := s.endpoints.lock() + unlock := s.overrides.lock() defer unlock() - for ep, dir := range s.endpoints.direntMap { - mountRoot := dir.MountRoot() + for _, endpoint := range s.overrides.keyMap { + mountRoot := endpoint.dirent.MountRoot() defer mountRoot.DecRef() - dirPath, _ := dir.FullName(mountRoot) + dirPath, _ := endpoint.dirent.FullName(mountRoot) if dirPath == "" { return fmt.Errorf("error getting path from dirent") } - s.endpoints.pathMap[ep] = dirPath + s.overrides.pathMap[endpoint] = dirPath } return nil } @@ -368,7 +413,7 @@ func (s *session) fillPathMap() error { func (s *session) restoreEndpointMaps(ctx context.Context) error { // When restoring, only need to create the keyMap because the dirent and path // maps got stored through the save. - s.endpoints.keyMap = make(map[device.MultiDeviceKey]transport.BoundEndpoint) + s.overrides.keyMap = make(map[device.MultiDeviceKey]*overrideInfo) if err := s.fillKeyMap(ctx); err != nil { return fmt.Errorf("failed to insert sockets into endpoint map: %v", err) } @@ -376,6 +421,6 @@ func (s *session) restoreEndpointMaps(ctx context.Context) error { // Re-create pathMap because it can no longer be trusted as socket paths can // change while process continues to run. Empty pathMap will be re-filled upon // next save. - s.endpoints.pathMap = make(map[transport.BoundEndpoint]string) + s.overrides.pathMap = make(map[*overrideInfo]string) return nil } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 0285c5361..111da59f9 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -25,9 +25,9 @@ import ( // beforeSave is invoked by stateify. func (s *session) beforeSave() { - if s.endpoints != nil { + if s.overrides != nil { if err := s.fillPathMap(); err != nil { - panic("failed to save paths to endpoint map before saving" + err.Error()) + panic("failed to save paths to override map before saving" + err.Error()) } } } @@ -74,10 +74,10 @@ func (s *session) afterLoad() { panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname)) } - // Check if endpointMaps exist when uds sockets are enabled - // (only pathmap will actualy have been saved). - if opts.privateunixsocket != (s.endpoints != nil) { - panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil)) + // Check if overrideMaps exist when uds sockets are enabled (only pathmaps + // will actually have been saved). + if opts.privateunixsocket != (s.overrides != nil) { + panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.overrides != nil)) } if args.Flags != s.superBlockFlags { panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags)) diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 376cfce2c..10ba2f5f0 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -32,15 +32,15 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return nil } - if i.session().endpoints != nil { - unlock := i.session().endpoints.lock() + if i.session().overrides != nil { + unlock := i.session().overrides.lock() defer unlock() - ep := i.session().endpoints.get(i.fileState.key) + ep := i.session().overrides.getBoundEndpoint(i.fileState.key) if ep != nil { return ep } - // Not found in endpoints map, it may be a gofer backed unix socket... + // Not found in overrides map, it may be a gofer backed unix socket... } inode.IncRef() diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 40e974314..8f2b75a1c 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -225,7 +225,6 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:mknod_test", - use_tmpfs = True, # mknod is not supported over gofer. ) syscall_test( -- cgit v1.2.3 From 492229d0176c1af2ab4ea4cf91bf211e940b5b12 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 4 Feb 2020 11:28:36 -0800 Subject: VFS2 gofer client Updates #1198 Opening host pipes (by spinning in fdpipe) and host sockets is not yet complete, and will be done in a future CL. Major differences from VFS1 gofer client (sentry/fs/gofer), with varying levels of backportability: - "Cache policies" are replaced by InteropMode, which control the behavior of timestamps in addition to caching. Under InteropModeExclusive (analogous to cacheAll) and InteropModeWritethrough (analogous to cacheAllWritethrough), client timestamps are *not* written back to the server (it is not possible in 9P or Linux for clients to set ctime, so writing back client-authoritative timestamps results in incoherence between atime/mtime and ctime). Under InteropModeShared (analogous to cacheRemoteRevalidating), client timestamps are not used at all (remote filesystem clocks are authoritative). cacheNone is translated to InteropModeShared + new option filesystemOptions.specialRegularFiles. - Under InteropModeShared, "unstable attribute" reloading for permission checks, lookup, and revalidation are fused, which is feasible in VFS2 since gofer.filesystem controls path resolution. This results in a ~33% reduction in RPCs for filesystem operations compared to cacheRemoteRevalidating. For example, consider stat("/foo/bar/baz") where "/foo/bar/baz" fails revalidation, resulting in the instantiation of a new dentry: VFS1 RPCs: getattr("/") // fs.MountNamespace.FindLink() => fs.Inode.CheckPermission() => gofer.inodeOperations.check() => gofer.inodeOperations.UnstableAttr() walkgetattr("/", "foo") = fid1 // fs.Dirent.walk() => gofer.session.Revalidate() => gofer.cachePolicy.Revalidate() clunk(fid1) getattr("/foo") // CheckPermission walkgetattr("/foo", "bar") = fid2 // Revalidate clunk(fid2) getattr("/foo/bar") // CheckPermission walkgetattr("/foo/bar", "baz") = fid3 // Revalidate clunk(fid3) walkgetattr("/foo/bar", "baz") = fid4 // fs.Dirent.walk() => gofer.inodeOperations.Lookup getattr("/foo/bar/baz") // linux.stat() => gofer.inodeOperations.UnstableAttr() VFS2 RPCs: getattr("/") // gofer.filesystem.walkExistingLocked() walkgetattr("/", "foo") = fid1 // gofer.filesystem.stepExistingLocked() clunk(fid1) // No getattr: walkgetattr already updated metadata for permission check walkgetattr("/foo", "bar") = fid2 clunk(fid2) walkgetattr("/foo/bar", "baz") = fid3 // No clunk: fid3 used for new gofer.dentry // No getattr: walkgetattr already updated metadata for stat() - gofer.filesystem.unlinkAt() does not require instantiation of a dentry that represents the file to be deleted. Updates #898. - gofer.regularFileFD.OnClose() skips Tflushf for regular files under InteropModeExclusive, as it's nonsensical to request a remote file flush without flushing locally-buffered writes to that remote file first. - Symlink targets are cached when InteropModeShared is not in effect. - p9.QID.Path (which is already required to be unique for each file within a server, and is accordingly already synthesized from device/inode numbers in all known gofers) is used as-is for inode numbers, rather than being mapped along with attr.RDev in the client to yet another synthetic inode number. - Relevant parts of fsutil.CachingInodeOperations are inlined directly into gofer package code. This avoids having to duplicate part of its functionality in fsutil.HostMappable. PiperOrigin-RevId: 293190213 --- pkg/safemem/seq_unsafe.go | 17 + pkg/sentry/fs/fsutil/BUILD | 4 +- pkg/sentry/fs/fsutil/frame_ref_set.go | 13 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fsimpl/gofer/BUILD | 55 ++ pkg/sentry/fsimpl/gofer/directory.go | 190 +++++ pkg/sentry/fsimpl/gofer/filesystem.go | 1087 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/gofer.go | 1147 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/handle.go | 135 ++++ pkg/sentry/fsimpl/gofer/handle_unsafe.go | 66 ++ pkg/sentry/fsimpl/gofer/p9file.go | 219 ++++++ pkg/sentry/fsimpl/gofer/pagemath.go | 31 + pkg/sentry/fsimpl/gofer/regular_file.go | 860 ++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/special_file.go | 159 +++++ pkg/sentry/fsimpl/gofer/symlink.go | 47 ++ pkg/sentry/fsimpl/gofer/time.go | 75 ++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 23 +- 18 files changed, 4103 insertions(+), 29 deletions(-) create mode 100644 pkg/sentry/fsimpl/gofer/BUILD create mode 100644 pkg/sentry/fsimpl/gofer/directory.go create mode 100644 pkg/sentry/fsimpl/gofer/filesystem.go create mode 100644 pkg/sentry/fsimpl/gofer/gofer.go create mode 100644 pkg/sentry/fsimpl/gofer/handle.go create mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go create mode 100644 pkg/sentry/fsimpl/gofer/p9file.go create mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go create mode 100644 pkg/sentry/fsimpl/gofer/regular_file.go create mode 100644 pkg/sentry/fsimpl/gofer/special_file.go create mode 100644 pkg/sentry/fsimpl/gofer/symlink.go create mode 100644 pkg/sentry/fsimpl/gofer/time.go (limited to 'pkg') diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go index 354a95dde..dcdfc9600 100644 --- a/pkg/safemem/seq_unsafe.go +++ b/pkg/safemem/seq_unsafe.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "reflect" + "syscall" "unsafe" ) @@ -297,3 +298,19 @@ func ZeroSeq(dsts BlockSeq) (uint64, error) { } return done, nil } + +// IovecsFromBlockSeq returns a []syscall.Iovec representing seq. +func IovecsFromBlockSeq(bs BlockSeq) []syscall.Iovec { + iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) + for ; !bs.IsEmpty(); bs = bs.Tail() { + b := bs.Head() + iovs = append(iovs, syscall.Iovec{ + Base: &b.ToSlice()[0], + Len: uint64(b.Len()), + }) + // We don't need to care about b.NeedSafecopy(), because the host + // kernel will handle such address ranges just fine (by returning + // EFAULT). + } + return iovs +} diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 4ab2a384f..789369220 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -28,13 +28,13 @@ go_template_instance( "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", - prefix = "frameRef", + prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", "Range": "platform.FileRange", "Value": "uint64", - "Functions": "frameRefSetFunctions", + "Functions": "FrameRefSetFunctions", }, ) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd63db32b..6564fd0c6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -20,24 +20,25 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" ) -type frameRefSetFunctions struct{} +// FrameRefSetFunctions implements segment.Functions for FrameRefSet. +type FrameRefSetFunctions struct{} // MinKey implements segment.Functions.MinKey. -func (frameRefSetFunctions) MinKey() uint64 { +func (FrameRefSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. -func (frameRefSetFunctions) MaxKey() uint64 { +func (FrameRefSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. -func (frameRefSetFunctions) ClearValue(val *uint64) { +func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -45,6 +46,6 @@ func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 573b8586e..800c8b4e1 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -111,7 +111,7 @@ type CachingInodeOperations struct { // refs tracks active references to data in the cache. // // refs is protected by dataMu. - refs frameRefSet + refs FrameRefSet } // CachingInodeOperationsOptions configures a CachingInodeOperations. diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD new file mode 100644 index 000000000..4ba76a1e8 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -0,0 +1,55 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "gofer", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + +go_library( + name = "gofer", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "gofer.go", + "handle.go", + "handle_unsafe.go", + "p9file.go", + "pagemath.go", + "regular_file.go", + "special_file.go", + "symlink.go", + "time.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fd", + "//pkg/fspath", + "//pkg/log", + "//pkg/p9", + "//pkg/safemem", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/unet", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go new file mode 100644 index 000000000..baa2cdd8e --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -0,0 +1,190 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +func (d *dentry) isDir() bool { + return d.fileType() == linux.S_IFDIR +} + +// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != +// InteropModeShared. +func (d *dentry) cacheNegativeChildLocked(name string) { + if d.negativeChildren == nil { + d.negativeChildren = make(map[string]struct{}) + } + d.negativeChildren[name] = struct{}{} +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + mu sync.Mutex + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + if fd.dirents == nil { + ds, err := fd.dentry().getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + for fd.off < int64(len(fd.dirents)) { + if !cb.Handle(fd.dirents[fd.off]) { + return nil + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). There exists at least one directoryFD representing d. +func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { + // 9P2000.L's readdir does not specify behavior in the presence of + // concurrent mutation of an iterated directory, so implementations may + // duplicate or omit entries in this case, which violates POSIX semantics. + // Thus we read all directory entries while holding d.dirMu to exclude + // directory mutations. (Note that it is impossible for the client to + // exclude concurrent mutation from other remote filesystem users. Since + // there is no way to detect if the server has incorrectly omitted + // directory entries, we simply assume that the server is well-behaved + // under InteropModeShared.) This is inconsistent with Linux (which appears + // to assume that directory fids have the correct semantics, and translates + // struct file_operations::readdir calls directly to readdir RPCs), but is + // consistent with VFS1. + + d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() + d.dirMu.Lock() + defer d.dirMu.Unlock() + if d.dirents != nil { + return d.dirents, nil + } + + // It's not clear if 9P2000.L's readdir is expected to return "." and "..", + // so we generate them here. + parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + dirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + Ino: d.ino, + NextOff: 1, + }, + { + Name: "..", + Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), + Ino: parent.ino, + NextOff: 2, + }, + } + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + defer d.handleMu.RUnlock() + if !d.handleReadable { + // This should not be possible because a readable handle should have + // been opened when the calling directoryFD was opened. + panic("gofer.dentry.getDirents called without a readable handle") + } + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + return nil, err + } + if len(p9ds) == 0 { + // Cache dirents for future directoryFDs if permitted. + if d.fs.opts.interop != InteropModeShared { + d.dirents = dirents + } + return dirents, nil + } + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + } + off = p9ds[len(p9ds)-1].Offset + } +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go new file mode 100644 index 000000000..8eb61debf --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -0,0 +1,1087 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // Snapshot current dentries and special files. + fs.syncMu.Lock() + ds := make([]*dentry, 0, len(fs.dentries)) + for d := range fs.dentries { + ds = append(ds, d) + } + sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) + for sffd := range fs.specialFileFDs { + sffds = append(sffds, sffd) + } + fs.syncMu.Unlock() + + // Return the first error we encounter, but sync everything we can + // regardless. + var retErr error + + // Sync regular files. + for _, d := range ds { + if !d.TryIncRef() { + continue + } + err := d.syncSharedHandle(ctx) + d.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + // Sync special files, which may be writable but do not use dentry shared + // handles (so they won't be synced by the above). + for _, sffd := range sffds { + if !sffd.vfsfd.TryIncRef() { + continue + } + err := sffd.Sync(ctx) + sffd.vfsfd.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + return retErr +} + +// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's +// encoding of strings, which uses 2 bytes for the length prefix. +const maxFilenameLen = (1 << 16) - 1 + +// dentrySlicePool is a pool of *[]*dentry used to store dentries for which +// dentry.checkCachingLocked() must be called. The pool holds pointers to +// slices because Go lacks generics, so sync.Pool operates on interface{}, so +// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy +// of the slice header on the heap. +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may become cached as a result of the traversal are appended +// to *ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached +// metadata must be up to date. +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + parentVFSD, err := rp.ResolveParent(&d.vfsd) + if err != nil { + return nil, err + } + parent := parentVFSD.Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // We must assume that parentVFSD is correct, because if d has been + // moved elsewhere in the remote filesystem so that its parent has + // changed, we have no way of determining its new parent's location + // in the filesystem. Get updated metadata for parentVFSD. + _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + if err != nil { + return nil, err + } + parent.updateFromP9Attrs(attrMask, &attr) + } + rp.Advance() + return parent, nil + } + childVFSD, err := rp.ResolveChild(&d.vfsd, name) + if err != nil { + return nil, err + } + // FIXME(jamieliu): Linux performs revalidation before mount lookup + // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), + // __follow_mount_rcu()). + child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + if err != nil { + return nil, err + } + if child == nil { + return nil, syserror.ENOENT + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) +// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be +// nil) to verify that the returned child (or lack thereof) is correct. If no file +// exists at name, revalidateChildLocked returns (nil, nil). +// +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// parent.isDir(). name is not "." or "..". +// +// Postconditions: If revalidateChildLocked returns a non-nil dentry, its +// cached metadata is up to date. +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { + if childVFSD != nil && fs.opts.interop != InteropModeShared { + // We have a cached dentry that is assumed to be correct. + return childVFSD.Impl().(*dentry), nil + } + // We either don't have a cached dentry or need to verify that it's still + // correct, either of which requires a remote lookup. Check if this name is + // valid before performing the lookup. + if len(name) > maxFilenameLen { + return nil, syserror.ENAMETOOLONG + } + // Check if we've already cached this lookup with a negative result. + if _, ok := parent.negativeChildren[name]; ok { + return nil, nil + } + // Perform the remote lookup. + qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) + if err != nil && err != syserror.ENOENT { + return nil, err + } + if childVFSD != nil { + child := childVFSD.Impl().(*dentry) + if !file.isNil() && qid.Path == child.ino { + // The file at this path hasn't changed. Just update cached + // metadata. + file.close(ctx) + child.updateFromP9Attrs(attrMask, &attr) + return child, nil + } + // The file at this path has changed or no longer exists. Remove + // the stale dentry from the tree, and re-evaluate its caching + // status (i.e. if it has 0 references, drop it). + vfsObj.ForceDeleteDentry(childVFSD) + *ds = appendDentry(*ds, child) + childVFSD = nil + } + if file.isNil() { + // No file exists at this path now. Cache the negative lookup if + // allowed. + if fs.opts.interop != InteropModeShared { + parent.cacheNegativeChildLocked(name) + } + return nil, nil + } + // Create a new dentry representing the file. + child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + if err != nil { + file.close(ctx) + return nil, err + } + parent.IncRef() // reference held by child on its parent + parent.vfsd.InsertChild(&child.vfsd, name) + // For now, child has 0 references, so our caller should call + // child.checkCachingLocked(). + *ds = appendDentry(*ds, child) + return child, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == +// InteropModeShared, then d's cached metadata must be up to date. +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for rp.Start() as required by fs.stepLocked(). + if err := d.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return err + } + } + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + if parent.isDeleted() { + return syserror.ENOENT + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + if fs.opts.interop == InteropModeShared { + // The existence of a dentry at name would be inconclusive because the + // file it represents may have been deleted from the remote filesystem, + // so we would need to make an RPC to revalidate the dentry. Just + // attempt the file creation RPC instead. If a file does exist, the RPC + // will fail with EEXIST like we would have. If the RPC succeeds, and a + // stale dentry exists, the dentry will fail revalidation next time + // it's used. + return create(parent, name) + } + if parent.vfsd.Child(name) != nil { + return syserror.EEXIST + } + // No cached dentry exists; however, there might still be an existing file + // at name. As above, we attempt the file creation RPC anyway. + if err := create(parent, name); err != nil { + return err + } + parent.touchCMtime(ctx) + delete(parent.negativeChildren, name) + parent.dirents = nil + return nil +} + +// Preconditions: !rp.Done(). +func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return err + } + } + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + name := rp.Component() + if dir { + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + } else { + if name == "." || name == ".." { + return syserror.EISDIR + } + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + childVFSD := parent.vfsd.Child(name) + var child *dentry + // We only need a dentry representing the file at name if it can be a mount + // point. If childVFSD is nil, then it can't be a mount point. If childVFSD + // is non-nil but stale, the actual file can't be a mount point either; we + // detect this case by just speculatively calling PrepareDeleteDentry and + // only revalidating the dentry if that fails (indicating that the existing + // dentry is a mount point). + if childVFSD != nil { + child = childVFSD.Impl().(*dentry) + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if err != nil { + return err + } + if child != nil { + childVFSD = &child.vfsd + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + return err + } + } else { + childVFSD = nil + } + } + } else if _, ok := parent.negativeChildren[name]; ok { + return syserror.ENOENT + } + flags := uint32(0) + if dir { + if child != nil && !child.isDir() { + return syserror.ENOTDIR + } + flags = linux.AT_REMOVEDIR + } else { + if child != nil && child.isDir() { + return syserror.EISDIR + } + if rp.MustBeDir() { + return syserror.ENOTDIR + } + } + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if childVFSD != nil { + vfsObj.AbortDeleteDentry(childVFSD) + } + return err + } + if fs.opts.interop != InteropModeShared { + parent.touchCMtime(ctx) + parent.cacheNegativeChildLocked(name) + parent.dirents = nil + } + if child != nil { + child.setDeleted() + vfsObj.CommitDeleteDentry(childVFSD) + ds = appendDentry(ds, child) + } + return nil +} + +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkCachingLocked() + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkCachingLocked() + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + // 9P2000.L supports hard links, but we don't. + return syserror.EPERM + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Reject O_TMPFILE, which is not supported; supporting it correctly in the + // presence of other remote filesystem users requires remote filesystem + // support, and it isn't clear that there's any way to implement this in + // 9P. + if opts.Flags&linux.O_TMPFILE != 0 { + return nil, syserror.EOPNOTSUPP + } + mayCreate := opts.Flags&linux.O_CREAT != 0 + mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by fs.stepLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + if rp.Done() { + return start.openLocked(ctx, rp, opts.Flags) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Determine whether or not we need to create a file. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, &ds) + if err == syserror.ENOENT && mayCreate { + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) + parent.dirMu.Unlock() + return fd, err + } + if err != nil { + parent.dirMu.Unlock() + return nil, err + } + // Open existing child or follow symlink. + parent.dirMu.Unlock() + if mustCreate { + return nil, syserror.EEXIST + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, opts.Flags) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + return nil, err + } + mnt := rp.Mount() + filetype := d.fileType() + switch { + case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil { + return nil, err + } + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case filetype == linux.S_IFDIR: + // Can't open directories with O_CREAT. + if flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + if flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } + fd := &directoryFD{} + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case filetype == linux.S_IFLNK: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + default: + if flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0) + if err != nil { + return nil, err + } + fd := &specialFileFD{ + handle: h, + } + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil + } +} + +// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if d.isDeleted() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + // 9P2000.L's lcreate takes a fid representing the parent directory, and + // converts it into an open fid representing the created file, so we need + // to duplicate the directory fid first. + _, dirfile, err := d.file.walk(ctx, nil) + if err != nil { + return nil, err + } + creds := rp.Credentials() + name := rp.Component() + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + if err != nil { + dirfile.close(ctx) + return nil, err + } + // Then we need to walk to the file we just created to get a non-open fid + // representing it, and to get its metadata. This must use d.file since, as + // explained above, dirfile was invalidated by dirfile.Create(). + walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + if err != nil { + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + // Sanity-check that we walked to the file we created. + if createQID.Path != walkQID.Path { + // Probably due to concurrent remote filesystem mutation? + ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, syserror.EAGAIN + } + + // Construct the new dentry. + child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) + if err != nil { + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + // Incorporate the fid that was opened by lcreate. + useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD + if useRegularFileFD { + child.handleMu.Lock() + child.handle.file = openFile + if fdobj != nil { + child.handle.fd = int32(fdobj.Release()) + } + child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) + child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) + child.handleMu.Unlock() + } + // Take a reference on the new dentry to be held by the new file + // description. (This reference also means that the new dentry is not + // eligible for caching yet, so we don't need to append to a dentry slice.) + child.refs = 1 + // Insert the dentry into the tree. + d.IncRef() // reference held by child on its parent d + d.vfsd.InsertChild(&child.vfsd, name) + if d.fs.opts.interop != InteropModeShared { + d.touchCMtime(ctx) + delete(d.negativeChildren, name) + d.dirents = nil + } + + // Finally, construct a file description representing the created file. + var childVFSFD *vfs.FileDescription + mnt.IncRef() + if useRegularFileFD { + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + childVFSFD = &fd.vfsfd + } else { + fd := &specialFileFD{ + handle: handle{ + file: openFile, + fd: -1, + }, + } + if fdobj != nil { + fd.handle.fd = int32(fdobj.Release()) + } + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + fd.handle.close(ctx) + return nil, err + } + childVFSFD = &fd.vfsfd + } + return childVFSFD, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + if !d.isSymlink() { + return "", syserror.EINVAL + } + return d.readlink(ctx, rp.Mount()) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // Requires 9P support. + return syserror.EINVAL + } + + var ds *[]*dentry + fs.renameMu.Lock() + defer fs.renameMuUnlockAndCheckCaching(&ds) + newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + if err := oldParent.updateFromGetattr(ctx); err != nil { + return err + } + } + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + vfsObj := rp.VirtualFilesystem() + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. + oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + if err != nil { + return err + } + if renamed == nil { + return syserror.ENOENT + } + if renamed.isDir() { + if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.isDeleted() { + return syserror.ENOENT + } + replacedVFSD := newParent.vfsd.Child(newName) + var replaced *dentry + // This is similar to unlinkAt, except: + // + // - We revalidate the replaced dentry unconditionally for simplicity. + // + // - If rp.MustBeDir(), then we need a dentry representing the replaced + // file regardless to confirm that it's a directory. + if replacedVFSD != nil || rp.MustBeDir() { + replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) + if err != nil { + return err + } + if replaced != nil { + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + replacedVFSD = &replaced.vfsd + } else { + replacedVFSD = nil + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil { + return err + } + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + if fs.opts.interop != InteropModeShared { + oldParent.cacheNegativeChildLocked(oldName) + oldParent.dirents = nil + delete(newParent.negativeChildren, newName) + newParent.dirents = nil + } + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, true /* dir */) +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + // Since walking updates metadata for all traversed dentries under + // InteropModeShared, including the returned one, we can return cached + // metadata here regardless of fs.opts.interop. + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statfs{}, err + } + fsstat, err := d.file.statFS(ctx) + if err != nil { + return linux.Statfs{}, err + } + nameLen := uint64(fsstat.NameLength) + if nameLen > maxFilenameLen { + nameLen = maxFilenameLen + } + return linux.Statfs{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + BlockSize: int64(fsstat.BlockSize), + Blocks: fsstat.Blocks, + BlocksFree: fsstat.BlocksFree, + BlocksAvailable: fsstat.BlocksAvailable, + Files: fsstat.Files, + FilesFree: fsstat.FilesFree, + NameLength: nameLen, + }, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, false /* dir */) +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + return d.listxattr(ctx) +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + return d.getxattr(ctx, name) +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.setxattr(ctx, &opts) +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.removexattr(ctx, name) +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go new file mode 100644 index 000000000..d0552bd99 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -0,0 +1,1147 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gofer provides a filesystem implementation that is backed by a 9p +// server, interchangably referred to as "gofers" throughout this package. +// +// Lock order: +// regularFileFD/directoryFD.mu +// filesystem.renameMu +// dentry.dirMu +// filesystem.syncMu +// dentry.metadataMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.handleMu +// dentry.dataMu +// +// Locking dentry.dirMu in multiple dentries requires holding +// filesystem.renameMu for writing. +package gofer + +import ( + "fmt" + "strconv" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/usermem" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // mfp is used to allocate memory that caches regular file contents. mfp is + // immutable. + mfp pgalloc.MemoryFileProvider + + // Immutable options. + opts filesystemOptions + + // client is the client used by this filesystem. client is immutable. + client *p9.Client + + // uid and gid are the effective KUID and KGID of the filesystem's creator, + // and are used as the owner and group for files that don't specify one. + // uid and gid are immutable. + uid auth.KUID + gid auth.KGID + + // renameMu serves two purposes: + // + // - It synchronizes path resolution with renaming initiated by this + // client. + // + // - It is held by path resolution to ensure that reachable dentries remain + // valid. A dentry is reachable by path resolution if it has a non-zero + // reference count (such that it is usable as vfs.ResolvingPath.Start() or + // is reachable from its children), or if it is a child dentry (such that + // it is reachable from its parent). + renameMu sync.RWMutex + + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by renameMu. + cachedDentries dentryList + cachedDentriesLen uint64 + + // dentries contains all dentries in this filesystem. specialFileFDs + // contains all open specialFileFDs. These fields are protected by syncMu. + syncMu sync.Mutex + dentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} +} + +type filesystemOptions struct { + // "Standard" 9P options. + fd int + aname string + interop InteropMode // derived from the "cache" mount option + msize uint32 + version string + + // maxCachedDentries is the maximum number of dentries with 0 references + // retained by the client. + maxCachedDentries uint64 + + // If forcePageCache is true, host FDs may not be used for application + // memory mappings even if available; instead, the client must perform its + // own caching of regular file pages. This is primarily useful for testing. + forcePageCache bool + + // If limitHostFDTranslation is true, apply maxFillRange() constraints to + // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This + // makes memory accounting behavior more consistent between cases where + // host FDs are / are not available, but may increase the frequency of + // sentry-handled page faults on files for which a host FD is available. + limitHostFDTranslation bool + + // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote + // filesystem may not be coherent with writable host FDs opened later, so + // mappings of the former must be replaced by mappings of the latter. This + // is usually only the case when the remote filesystem is an overlayfs + // mount on Linux < 4.19. + overlayfsStaleRead bool + + // If regularFilesUseSpecialFileFD is true, application FDs representing + // regular files will use distinct file handles for each FD, in the same + // way that application FDs representing "special files" such as sockets + // do. Note that this disables client caching and mmap for regular files. + regularFilesUseSpecialFileFD bool +} + +// InteropMode controls the client's interaction with other remote filesystem +// users. +type InteropMode uint32 + +const ( + // InteropModeExclusive is appropriate when the filesystem client is the + // only user of the remote filesystem. + // + // - The client may cache arbitrary filesystem state (file data, metadata, + // filesystem structure, etc.). + // + // - Client changes to filesystem state may be sent to the remote + // filesystem asynchronously, except when server permission checks are + // necessary. + // + // - File timestamps are based on client clocks. This ensures that users of + // the client observe timestamps that are coherent with their own clocks + // and consistent with Linux's semantics. However, since it is not always + // possible for clients to set arbitrary atimes and mtimes, and never + // possible for clients to set arbitrary ctimes, file timestamp changes are + // stored in the client only and never sent to the remote filesystem. + InteropModeExclusive InteropMode = iota + + // InteropModeWritethrough is appropriate when there are read-only users of + // the remote filesystem that expect to observe changes made by the + // filesystem client. + // + // - The client may cache arbitrary filesystem state. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on client clocks. As a corollary, access + // timestamp changes from other remote filesystem users will not be visible + // to the client. + InteropModeWritethrough + + // InteropModeShared is appropriate when there are users of the remote + // filesystem that may mutate its state other than the client. + // + // - The client must verify cached filesystem state before using it. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on server clocks. This is necessary to + // ensure that timestamp changes are synchronized between remote filesystem + // users. + // + // Note that the correctness of InteropModeShared depends on the server + // correctly implementing 9P fids (i.e. each fid immutably represents a + // single filesystem object), even in the presence of remote filesystem + // mutations from other users. If this is violated, the behavior of the + // client is undefined. + InteropModeShared +) + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") + return nil, nil, syserror.EINVAL + } + + mopts := vfs.GenericParseMountOptions(opts.Data) + var fsopts filesystemOptions + + // Check that the transport is "fd". + trans, ok := mopts["trans"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'") + return nil, nil, syserror.EINVAL + } + delete(mopts, "trans") + if trans != "fd" { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans) + return nil, nil, syserror.EINVAL + } + + // Check that read and write FDs are provided and identical. + rfdstr, ok := mopts["rfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=") + return nil, nil, syserror.EINVAL + } + delete(mopts, "rfdno") + rfd, err := strconv.Atoi(rfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr) + return nil, nil, syserror.EINVAL + } + wfdstr, ok := mopts["wfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=") + return nil, nil, syserror.EINVAL + } + delete(mopts, "wfdno") + wfd, err := strconv.Atoi(wfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr) + return nil, nil, syserror.EINVAL + } + if rfd != wfd { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd) + return nil, nil, syserror.EINVAL + } + fsopts.fd = rfd + + // Get the attach name. + fsopts.aname = "/" + if aname, ok := mopts["aname"]; ok { + delete(mopts, "aname") + fsopts.aname = aname + } + + // Parse the cache policy. For historical reasons, this defaults to the + // least generally-applicable option, InteropModeExclusive. + fsopts.interop = InteropModeExclusive + if cache, ok := mopts["cache"]; ok { + delete(mopts, "cache") + switch cache { + case "fscache": + fsopts.interop = InteropModeExclusive + case "fscache_writethrough": + fsopts.interop = InteropModeWritethrough + case "none": + fsopts.regularFilesUseSpecialFileFD = true + fallthrough + case "remote_revalidating": + fsopts.interop = InteropModeShared + default: + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache) + return nil, nil, syserror.EINVAL + } + } + + // Parse the 9P message size. + fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M + if msizestr, ok := mopts["msize"]; ok { + delete(mopts, "msize") + msize, err := strconv.ParseUint(msizestr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr) + return nil, nil, syserror.EINVAL + } + fsopts.msize = uint32(msize) + } + + // Parse the 9P protocol version. + fsopts.version = p9.HighestVersionString() + if version, ok := mopts["version"]; ok { + delete(mopts, "version") + fsopts.version = version + } + + // Parse the dentry cache limit. + fsopts.maxCachedDentries = 1000 + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + maxCachedDentries, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return nil, nil, syserror.EINVAL + } + fsopts.maxCachedDentries = maxCachedDentries + } + + // Handle simple flags. + if _, ok := mopts["force_page_cache"]; ok { + delete(mopts, "force_page_cache") + fsopts.forcePageCache = true + } + if _, ok := mopts["limit_host_fd_translation"]; ok { + delete(mopts, "limit_host_fd_translation") + fsopts.limitHostFDTranslation = true + } + if _, ok := mopts["overlayfs_stale_read"]; ok { + delete(mopts, "overlayfs_stale_read") + fsopts.overlayfsStaleRead = true + } + // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying + // "cache=none". + + // Check for unparsed options. + if len(mopts) != 0 { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + // Establish a connection with the server. + conn, err := unet.NewSocket(fsopts.fd) + if err != nil { + return nil, nil, err + } + + // Perform version negotiation with the server. + ctx.UninterruptibleSleepStart(false) + client, err := p9.NewClient(conn, fsopts.msize, fsopts.version) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + conn.Close() + return nil, nil, err + } + // Ownership of conn has been transferred to client. + + // Perform attach to obtain the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := client.Attach(fsopts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + client.Close() + return nil, nil, err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } + + // Construct the filesystem object. + fs := &filesystem{ + mfp: mfp, + opts: fsopts, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + client: client, + dentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), + } + fs.vfsfs.Init(vfsObj, fs) + + // Construct the root dentry. + root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) + if err != nil { + attachFile.close(ctx) + fs.vfsfs.DecRef() + return nil, nil, err + } + // Set the root's reference count to 2. One reference is returned to the + // caller, and the other is deliberately leaked to prevent the root from + // being "cached" and subsequently evicted. Its resources will still be + // cleaned up by fs.Release(). + root.refs = 2 + + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + ctx := context.Background() + mf := fs.mfp.MemoryFile() + + fs.syncMu.Lock() + for d := range fs.dentries { + d.handleMu.Lock() + d.dataMu.Lock() + if d.handleWritable { + // Write dirty cached data to the remote file. + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) + } + // TODO(jamieliu): Do we need to flushf/fsync d? + } + // Discard cached pages. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Close the host fd if one exists. + if d.handle.fd >= 0 { + syscall.Close(int(d.handle.fd)) + d.handle.fd = -1 + } + d.handleMu.Unlock() + } + // There can't be any specialFileFDs still using fs, since each such + // FileDescription would hold a reference on a Mount holding a reference on + // fs. + fs.syncMu.Unlock() + + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. refs is accessed using atomic memory + // operations. + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // We don't support hard links, so each dentry maps 1:1 to an inode. + + // file is the unopened p9.File that backs this dentry. file is immutable. + file p9file + + // If deleted is non-zero, the file represented by this dentry has been + // deleted. deleted is accessed using atomic memory operations. + deleted uint32 + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // filesystem.renameMu. + cached bool + dentryEntry + + dirMu sync.Mutex + + // If this dentry represents a directory, and InteropModeShared is not in + // effect, negativeChildren is a set of child names in this directory that + // are known not to exist. negativeChildren is protected by dirMu. + negativeChildren map[string]struct{} + + // If this dentry represents a directory, InteropModeShared is not in + // effect, and dirents is not nil, it is a cache of all entries in the + // directory, in the order they were returned by the server. dirents is + // protected by dirMu. + dirents []vfs.Dirent + + // Cached metadata; protected by metadataMu and accessed using atomic + // memory operations unless otherwise specified. + metadataMu sync.Mutex + ino uint64 // immutable + mode uint32 // type is immutable, perms are mutable + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + blockSize uint32 // 0 if unknown + // Timestamps, all nsecs from the Unix epoch. + atime int64 + mtime int64 + ctime int64 + btime int64 + // File size, protected by both metadataMu and dataMu (i.e. both must be + // locked to mutate it). + size uint64 + + mapsMu sync.Mutex + + // If this dentry represents a regular file, mappings tracks mappings of + // the file into memmap.MappingSpaces. mappings is protected by mapsMu. + mappings memmap.MappingSet + + // If this dentry represents a regular file or directory: + // + // - handle is the I/O handle used by all regularFileFDs/directoryFDs + // representing this dentry. + // + // - handleReadable is true if handle is readable. + // + // - handleWritable is true if handle is writable. + // + // Invariants: + // + // - If handleReadable == handleWritable == false, then handle.file == nil + // (i.e. there is no open handle). Conversely, if handleReadable || + // handleWritable == true, then handle.file != nil (i.e. there is an open + // handle). + // + // - handleReadable and handleWritable cannot transition from true to false + // (i.e. handles may not be downgraded). + // + // These fields are protected by handleMu. + handleMu sync.RWMutex + handle handle + handleReadable bool + handleWritable bool + + dataMu sync.RWMutex + + // If this dentry represents a regular file that is client-cached, cache + // maps offsets into the cached file to offsets into + // filesystem.mfp.MemoryFile() that store the file's data. cache is + // protected by dataMu. + cache fsutil.FileRangeSet + + // If this dentry represents a regular file that is client-cached, dirty + // tracks dirty segments in cache. dirty is protected by dataMu. + dirty fsutil.DirtySet + + // pf implements platform.File for mappings of handle.fd. + pf dentryPlatformFile + + // If this dentry represents a symbolic link, InteropModeShared is not in + // effect, and haveTarget is true, target is the symlink target. haveTarget + // and target are protected by dataMu. + haveTarget bool + target string +} + +// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the +// gofer client. +func dentryAttrMask() p9.AttrMask { + return p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + ATime: true, + MTime: true, + CTime: true, + Size: true, + BTime: true, + } +} + +// newDentry creates a new dentry representing the given file. The dentry +// initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.checkCachingLocked() as appropriate. +func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { + if !mask.Mode { + ctx.Warningf("can't create gofer.dentry without file type") + return nil, syserror.EIO + } + if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { + ctx.Warningf("can't create regular file gofer.dentry without file size") + return nil, syserror.EIO + } + + d := &dentry{ + fs: fs, + file: file, + ino: qid.Path, + mode: uint32(attr.Mode), + uid: uint32(fs.uid), + gid: uint32(fs.gid), + blockSize: usermem.PageSize, + handle: handle{ + fd: -1, + }, + } + d.pf.dentry = d + if mask.UID { + d.uid = uint32(attr.UID) + } + if mask.GID { + d.gid = uint32(attr.GID) + } + if mask.Size { + d.size = attr.Size + } + if attr.BlockSize != 0 { + d.blockSize = uint32(attr.BlockSize) + } + if mask.ATime { + d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds) + } + if mask.MTime { + d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds) + } + if mask.CTime { + d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds) + } + if mask.BTime { + d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) + } + d.vfsd.Init(d) + + fs.syncMu.Lock() + fs.dentries[d] = struct{}{} + fs.syncMu.Unlock() + return d, nil +} + +// updateFromP9Attrs is called to update d's metadata after an update from the +// remote filesystem. +func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { + d.metadataMu.Lock() + if mask.Mode { + if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { + d.metadataMu.Unlock() + panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) + } + atomic.StoreUint32(&d.mode, uint32(attr.Mode)) + } + if mask.UID { + atomic.StoreUint32(&d.uid, uint32(attr.UID)) + } + if mask.GID { + atomic.StoreUint32(&d.gid, uint32(attr.GID)) + } + // There is no P9_GETATTR_* bit for I/O block size. + if attr.BlockSize != 0 { + atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) + } + if mask.ATime { + atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) + } + if mask.MTime { + atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) + } + if mask.CTime { + atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)) + } + if mask.BTime { + atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) + } + if mask.Size { + d.dataMu.Lock() + atomic.StoreUint64(&d.size, attr.Size) + d.dataMu.Unlock() + } + d.metadataMu.Unlock() +} + +func (d *dentry) updateFromGetattr(ctx context.Context) error { + // Use d.handle.file, which represents a 9P fid that has been opened, in + // preference to d.file, which represents a 9P fid that has not. This may + // be significantly more efficient in some implementations. + var ( + file p9file + handleMuRLocked bool + ) + d.handleMu.RLock() + if !d.handle.file.isNil() { + file = d.handle.file + handleMuRLocked = true + } else { + file = d.file + d.handleMu.RUnlock() + } + _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) + if handleMuRLocked { + d.handleMu.RUnlock() + } + if err != nil { + return err + } + d.updateFromP9Attrs(attrMask, &attr) + return nil +} + +func (d *dentry) fileType() uint32 { + return atomic.LoadUint32(&d.mode) & linux.S_IFMT +} + +func (d *dentry) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME + stat.Blksize = atomic.LoadUint32(&d.blockSize) + stat.Nlink = 1 + if d.isDir() { + stat.Nlink = 2 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = d.ino + stat.Size = atomic.LoadUint64(&d.size) + // This is consistent with regularFileFD.Seek(), which treats regular files + // as having no holes. + stat.Blocks = (stat.Size + 511) / 512 + stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime)) + stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) + stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) + stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) + // TODO(jamieliu): device number +} + +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { + if stat.Mask == 0 { + return nil + } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + setLocalAtime := false + setLocalMtime := false + if d.fs.opts.interop != InteropModeShared { + // Timestamp updates will be handled locally. + setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 + setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 + stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME + if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + } + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since it'll + // be overwritten by revalidation before the next time it's used + // anyway. (InteropModeShared inhibits client caching of regular file + // data, so there's no cache to truncate either.) + return nil + } + now, haveNow := nowFromContext(ctx) + if !haveNow { + ctx.Warningf("gofer.dentry.setStat: current time not available") + } + if stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) + } + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, stat.GID) + } + if setLocalAtime { + if stat.Atime.Nsec == linux.UTIME_NOW { + if haveNow { + atomic.StoreInt64(&d.atime, now) + } + } else { + atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) + } + } + if setLocalMtime { + if stat.Mtime.Nsec == linux.UTIME_NOW { + if haveNow { + atomic.StoreInt64(&d.mtime, now) + } + } else { + atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) + } + } + if haveNow { + atomic.StoreInt64(&d.ctime, now) + } + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Lock() + oldSize := d.size + d.size = stat.Size + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend := pageRoundUp(oldSize) + newpgend := pageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. + d.dataMu.Lock() + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) + d.dataMu.Unlock() + } + } + return nil +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { + return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkCachingLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked() + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } +} + +// checkCachingLocked should be called after d's reference count becomes 0 or it +// becomes disowned. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkCachingLocked() { + // Dentries with a non-zero reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) + if atomic.LoadInt64(&d.refs) != 0 { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + return + } + // Non-child dentries with zero references are no longer reachable by path + // resolution and should be dropped immediately. + if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + d.destroyLocked() + return + } + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries { + victim := d.fs.cachedDentries.Back() + d.fs.cachedDentries.Remove(victim) + d.fs.cachedDentriesLen-- + victim.cached = false + // victim.refs may have become non-zero from an earlier path + // resolution since it was inserted into fs.cachedDentries; see + // dentry.incRefLocked(). Either way, we brought + // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so + // we don't loop. + if atomic.LoadInt64(&victim.refs) == 0 { + if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { + victimParent := victimParentVFSD.Impl().(*dentry) + victimParent.dirMu.Lock() + if !victim.vfsd.IsDisowned() { + // victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount + // points. + d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + // We're only deleting the dentry, not the file it + // represents, so we don't need to update + // victimParent.dirents etc. + } + victimParent.dirMu.Unlock() + } + victim.destroyLocked() + } + } +} + +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is +// not a child dentry. +func (d *dentry) destroyLocked() { + ctx := context.Background() + d.handleMu.Lock() + if !d.handle.file.isNil() { + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + // Write dirty pages back to the remote filesystem. + if d.handleWritable { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) + } + } + // Discard cached data. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Clunk open fids and close open host FDs. + d.handle.close(ctx) + } + d.handleMu.Unlock() + d.file.close(ctx) + // Remove d from the set of all dentries. + d.fs.syncMu.Lock() + delete(d.fs.dentries, d) + d.fs.syncMu.Unlock() + // Drop the reference held by d on its parent. + if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { + parent := parentVFSD.Impl().(*dentry) + // This is parent.DecRef() without recursive locking of d.fs.renameMu. + if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { + parent.checkCachingLocked() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } + } +} + +func (d *dentry) isDeleted() bool { + return atomic.LoadUint32(&d.deleted) != 0 +} + +func (d *dentry) setDeleted() { + atomic.StoreUint32(&d.deleted, 1) +} + +func (d *dentry) listxattr(ctx context.Context) ([]string, error) { + return nil, syserror.ENOTSUP +} + +func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { + // TODO(jamieliu): add vfs.GetxattrOptions.Size + return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +} + +func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { + return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) +} + +func (d *dentry) removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + +// Preconditions: d.isRegularFile() || d.isDirectory(). +func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { + // O_TRUNC unconditionally requires us to obtain a new handle (opened with + // O_TRUNC). + if !trunc { + d.handleMu.RLock() + if (!read || d.handleReadable) && (!write || d.handleWritable) { + // The current handle is sufficient. + d.handleMu.RUnlock() + return nil + } + d.handleMu.RUnlock() + } + + haveOldFD := false + d.handleMu.Lock() + if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { + // Get a new handle. + wantReadable := d.handleReadable || read + wantWritable := d.handleWritable || write + h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) + if err != nil { + d.handleMu.Unlock() + return err + } + if !d.handle.file.isNil() { + // Check that old and new handles are compatible: If the old handle + // includes a host file descriptor but the new one does not, or + // vice versa, old and new memory mappings may be incoherent. + haveOldFD = d.handle.fd >= 0 + haveNewFD := h.fd >= 0 + if haveOldFD != haveNewFD { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) + h.close(ctx) + return syserror.EIO + } + if haveOldFD { + // We may have raced with callers of d.pf.FD() that are now + // using the old file descriptor, preventing us from safely + // closing it. We could handle this by invalidating existing + // memmap.Translations, but this is expensive. Instead, use + // dup2() to make the old file descriptor refer to the new file + // description, then close the new file descriptor (which is no + // longer needed). Racing callers may use the old or new file + // description, but this doesn't matter since they refer to the + // same file (unless d.fs.opts.overlayfsStaleRead is true, + // which we handle separately). + if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) + h.close(ctx) + return err + } + syscall.Close(int(h.fd)) + h.fd = d.handle.fd + if d.fs.opts.overlayfsStaleRead { + // Replace sentry mappings of the old FD with mappings of + // the new FD, since the two are not necessarily coherent. + if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) + h.close(ctx) + return err + } + } + // Clunk the old fid before making the new handle visible (by + // unlocking d.handleMu). + d.handle.file.close(ctx) + } + } + // Switch to the new handle. + d.handle = h + d.handleReadable = wantReadable + d.handleWritable = wantWritable + } + d.handleMu.Unlock() + + if d.fs.opts.overlayfsStaleRead && haveOldFD { + // Invalidate application mappings that may be using the old FD; they + // will be replaced with mappings using the new FD after future calls + // to d.Translate(). This requires holding d.mapsMu, which precedes + // d.handleMu in the lock order. + d.mapsMu.Lock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + } + + return nil +} + +// fileDescription is embedded by gofer implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + d := fd.dentry() + if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if + // available? + if err := d.updateFromGetattr(ctx); err != nil { + return linux.Statx{}, err + } + } + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) +} + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { + return fd.dentry().listxattr(ctx) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { + return fd.dentry().getxattr(ctx, name) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + return fd.dentry().setxattr(ctx, &opts) +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + return fd.dentry().removexattr(ctx, name) +} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go new file mode 100644 index 000000000..cfe66f797 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/safemem" +) + +// handle represents a remote "open file descriptor", consisting of an opened +// fid (p9.File) and optionally a host file descriptor. +type handle struct { + file p9file + fd int32 // -1 if unavailable +} + +// Preconditions: read || write. +func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) { + _, newfile, err := file.walk(ctx, nil) + if err != nil { + return handle{fd: -1}, err + } + var flags p9.OpenFlags + switch { + case read && !write: + flags = p9.ReadOnly + case !read && write: + flags = p9.WriteOnly + case read && write: + flags = p9.ReadWrite + } + if trunc { + flags |= p9.OpenTruncate + } + fdobj, _, _, err := newfile.open(ctx, flags) + if err != nil { + newfile.close(ctx) + return handle{fd: -1}, err + } + fd := int32(-1) + if fdobj != nil { + fd = int32(fdobj.Release()) + } + return handle{ + file: newfile, + fd: fd, + }, nil +} + +func (h *handle) close(ctx context.Context) { + h.file.close(ctx) + h.file = p9file{} + if h.fd >= 0 { + syscall.Close(int(h.fd)) + h.fd = -1 + } +} + +func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostPreadv(h.fd, dsts, int64(offset)) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { + n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the read since p9.File.ReadAt() takes []byte. + buf := make([]byte, dsts.NumBytes()) + n, err := h.file.readAt(ctx, buf, offset) + if n == 0 { + return 0, err + } + if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { + return cp, cperr + } + return uint64(n), err +} + +func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostPwritev(h.fd, srcs, int64(offset)) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { + n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the write since p9.File.WriteAt() takes []byte. + buf := make([]byte, srcs.NumBytes()) + cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs) + if cp == 0 { + return 0, cperr + } + n, err := h.file.writeAt(ctx, buf[:cp], offset) + if err != nil { + return uint64(n), err + } + return cp, cperr +} + +func (h *handle) sync(ctx context.Context) error { + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(h.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + return h.file.fsync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go new file mode 100644 index 000000000..19560ab26 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle_unsafe.go @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/safemem" +) + +// Preconditions: !dsts.IsEmpty(). +func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + if dsts.NumBlocks() == 1 { + // Use pread() instead of preadv() to avoid iovec allocation and + // copying. + dst := dsts.Head() + n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil + } + iovs := safemem.IovecsFromBlockSeq(dsts) + n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil +} + +// Preconditions: !srcs.IsEmpty(). +func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + if srcs.NumBlocks() == 1 { + // Use pwrite() instead of pwritev() to avoid iovec allocation and + // copying. + src := srcs.Head() + n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil + } + iovs := safemem.IovecsFromBlockSeq(srcs) + n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil +} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go new file mode 100644 index 000000000..755ac2985 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -0,0 +1,219 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/syserror" +) + +// p9file is a wrapper around p9.File that provides methods that are +// Context-aware. +type p9file struct { + file p9.File +} + +func (f p9file) isNil() bool { + return f.file == nil +} + +func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, err := f.file.Walk(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, err +} + +func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, attrMask, attr, err +} + +// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single +// path component and returns a single qid. +func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name}) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err + } + if len(qids) != 1 { + ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids) + if newfile != nil { + p9file{newfile}.close(ctx) + } + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + } + return qids[0], p9file{newfile}, attrMask, attr, nil +} + +func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) { + ctx.UninterruptibleSleepStart(false) + fsstat, err := f.file.StatFS() + ctx.UninterruptibleSleepFinish(false) + return fsstat, err +} + +func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qid, attrMask, attr, err := f.file.GetAttr(req) + ctx.UninterruptibleSleepFinish(false) + return qid, attrMask, attr, err +} + +func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttr(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { + ctx.UninterruptibleSleepStart(false) + val, err := f.file.GetXattr(name, size) + ctx.UninterruptibleSleepFinish(false) + return val, err +} + +func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetXattr(name, value, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Allocate(mode, offset, length) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) close(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Close() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, qid, iounit, err := f.file.Open(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, qid, iounit, err +} + +func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.ReadAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.WriteAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) fsync(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.FSync() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return fdobj, p9file{newfile}, qid, iounit, err +} + +func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mkdir(name, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Symlink(oldName, newName, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) link(ctx context.Context, target p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Link(target.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mknod(name, mode, major, minor, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Rename(newDir.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.UnlinkAt(name, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { + ctx.UninterruptibleSleepStart(false) + dirents, err := f.file.Readdir(offset, count) + ctx.UninterruptibleSleepFinish(false) + return dirents, err +} + +func (f p9file) readlink(ctx context.Context) (string, error) { + ctx.UninterruptibleSleepStart(false) + target, err := f.file.Readlink() + ctx.UninterruptibleSleepFinish(false) + return target, err +} + +func (f p9file) flush(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Flush() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, err := f.file.Connect(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, err +} diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go new file mode 100644 index 000000000..847cb0784 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/pagemath.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/usermem" +) + +// This are equivalent to usermem.Addr.RoundDown/Up, but without the +// potentially truncating conversion to usermem.Addr. This is necessary because +// there is no way to define generic "PageRoundDown/Up" functions in Go. + +func pageRoundDown(x uint64) uint64 { + return x &^ (usermem.PageSize - 1) +} + +func pageRoundUp(x uint64) uint64 { + return pageRoundDown(x + usermem.PageSize - 1) +} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go new file mode 100644 index 000000000..8e11e06b3 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -0,0 +1,860 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isRegularFile() bool { + return d.fileType() == linux.S_IFREG +} + +type regularFileFD struct { + fileDescription + + // off is the file offset. off is protected by mu. + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *regularFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + // Skip flushing if writes may be buffered by the client, since (as with + // the VFS1 client) we don't flush buffered writes on close anyway. + d := fd.dentry() + if d.fs.opts.interop == InteropModeExclusive { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.handle.file.flush(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Check for reading at EOF before calling into MM (but not under + // InteropModeShared, which makes d.size unreliable). + d := fd.dentry() + if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + return 0, io.EOF + } + + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Lock d.metadataMu for the rest of the read to prevent d.size from + // changing. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + // Write dirty cached pages that will be touched by the read back to + // the remote file. + if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { + return 0, err + } + } + + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the read to go to the remote file. + rw.direct = true + } + n, err := dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtime(ctx, fd.vfsfd.Mount()) + } + return n, err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => + // file_update_time(). This is d.touchCMtime(), but without locking + // d.metadataMu (recursively). + if now, ok := nowFromContext(ctx); ok { + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + } + } + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Remove touched pages from the cache. + pgstart := pageRoundDown(uint64(offset)) + pgend := pageRoundUp(uint64(offset + src.NumBytes())) + if pgend < pgstart { + return 0, syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []platform.FileRange + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + // Invalidate mappings of removed pages. + d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + } + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the write to go to the remote file. + rw.direct = true + } + n, err := src.CopyInTo(ctx, rw) + putDentryReadWriter(rw) + if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Write dirty cached pages touched by the write back to the remote + // file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Request the remote filesystem to sync the remote file. + if err := d.handle.file.fsync(ctx); err != nil { + return 0, err + } + } + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +type dentryReadWriter struct { + ctx context.Context + d *dentry + off uint64 + direct bool +} + +var dentryReadWriterPool = sync.Pool{ + New: func() interface{} { + return &dentryReadWriter{} + }, +} + +func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { + rw := dentryReadWriterPool.Get().(*dentryReadWriter) + rw.ctx = ctx + rw.d = d + rw.off = uint64(offset) + rw.direct = false + return rw +} + +func putDentryReadWriter(rw *dentryReadWriter) { + rw.ctx = nil + rw.d = nil + dentryReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + + // If we have a mmappable host FD (which must be used here to ensure + // coherence with memory-mapped I/O), or if InteropModeShared is in effect + // (which prevents us from caching file contents and makes dentry.size + // unreliable), or if the file was opened O_DIRECT, read directly from + // dentry.handle without locking dentry.dataMu. + rw.d.handleMu.RLock() + if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) + rw.d.handleMu.RUnlock() + rw.off += n + return n, err + } + + // Otherwise read from/through the cache. + mf := rw.d.fs.mfp.MemoryFile() + fillCache := mf.ShouldCacheEvictable() + var dataMuUnlock func() + if fillCache { + rw.d.dataMu.Lock() + dataMuUnlock = rw.d.dataMu.Unlock + } else { + rw.d.dataMu.RLock() + dataMuUnlock = rw.d.dataMu.RUnlock + } + + // Compute the range to read (limited by file size and overflow-checked). + if rw.off >= rw.d.size { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return 0, io.EOF + } + end := rw.d.size + if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { + end = rend + } + + var done uint64 + seg, gap := rw.d.cache.Find(rw.off) + for rw.off < end { + mr := memmap.MappableRange{rw.off, end} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.off += n + dsts = dsts.DropFirst64(n) + if err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + gapMR := gap.Range().Intersect(mr) + if fillCache { + // Read into the cache, then re-enter the loop to read from the + // cache. + reqMR := memmap.MappableRange{ + Start: pageRoundDown(gapMR.Start), + End: pageRoundUp(gapMR.End), + } + optMR := gap.Range() + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) + mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) + seg, gap = rw.d.cache.Find(rw.off) + if !seg.Ok() { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + // err might have occurred in part of gap.Range() outside + // gapMR. Forget about it for now; if the error matters and + // persists, we'll run into it again in a later iteration of + // this loop. + } else { + // Read directly from the file. + gapDsts := dsts.TakeFirst64(gapMR.Length()) + n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) + done += n + rw.off += n + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != gapDsts.NumBytes() || err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } + } + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.d.metadataMu must be locked. +func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + + // If we have a mmappable host FD (which must be used here to ensure + // coherence with memory-mapped I/O), or if InteropModeShared is in effect + // (which prevents us from caching file contents), or if the file was + // opened with O_DIRECT, write directly to dentry.handle without locking + // dentry.dataMu. + rw.d.handleMu.RLock() + if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) + rw.d.handleMu.RUnlock() + rw.off += n + return n, err + } + + // Otherwise write to/through the cache. + mf := rw.d.fs.mfp.MemoryFile() + rw.d.dataMu.Lock() + + // Compute the range to write (overflow-checked). + start := rw.off + end := rw.off + srcs.NumBytes() + if end <= rw.off { + end = math.MaxInt64 + } + + var ( + done uint64 + retErr error + ) + seg, gap := rw.d.cache.Find(rw.off) + for rw.off < end { + mr := memmap.MappableRange{rw.off, end} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + segMR := seg.Range().Intersect(mr) + ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + retErr = err + goto exitLoop + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.off += n + srcs = srcs.DropFirst64(n) + rw.d.dirty.MarkDirty(segMR) + if err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Write directly to the file. At present, we never fill the cache + // when writing, since doing so can convert small writes into + // inefficient read-modify-write cycles, and we have no mechanism + // for detecting or avoiding this. + gapMR := gap.Range().Intersect(mr) + gapSrcs := srcs.TakeFirst64(gapMR.Length()) + n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) + done += n + rw.off += n + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != gapSrcs.NumBytes() || err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } +exitLoop: + if rw.off > rw.d.size { + atomic.StoreUint64(&rw.d.size, rw.off) + // The remote file's size will implicitly be extended to the correct + // value when we write back to it. + } + // If InteropModeWritethrough is in effect, flush written data back to the + // remote filesystem. + if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { + if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ + Start: start, + End: rw.off, + }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { + // We have no idea how many bytes were actually flushed. + rw.off = start + done = 0 + retErr = err + } + } + rw.d.dataMu.Unlock() + rw.d.handleMu.RUnlock() + return done, retErr +} + +func (d *dentry) writeback(ctx context.Context, offset, size int64) error { + if size == 0 { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + // Compute the range of valid bytes (overflow-checked). + if uint64(offset) >= d.size { + return nil + } + end := int64(d.size) + if rend := offset + size; rend > offset && rend < end { + end = rend + } + return fsutil.SyncDirty(ctx, memmap.MappableRange{ + Start: uint64(offset), + End: uint64(end), + }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: + // Ensure file size is up to date. + d := fd.dentry() + if fd.filesystem().opts.interop == InteropModeShared { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, err + } + } + size := int64(atomic.LoadUint64(&d.size)) + // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous + // block of data. + switch whence { + case linux.SEEK_END: + offset += size + case linux.SEEK_DATA: + if offset > size { + return 0, syserror.ENXIO + } + // Use offset as specified. + case linux.SEEK_HOLE: + if offset > size { + return 0, syserror.ENXIO + } + offset = size + } + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return fd.dentry().syncSharedHandle(ctx) +} + +func (d *dentry) syncSharedHandle(ctx context.Context) error { + d.handleMu.RLock() + if !d.handleWritable { + d.handleMu.RUnlock() + return nil + } + d.dataMu.Lock() + // Write dirty cached data to the remote file. + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + d.dataMu.Unlock() + if err == nil { + // Sync the remote file. + err = d.handle.sync(ctx) + } + d.handleMu.RUnlock() + return err +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + switch d.fs.opts.interop { + case InteropModeExclusive: + // Any mapping is fine. + case InteropModeWritethrough: + // Shared writable mappings require a host FD, since otherwise we can't + // synchronously flush memory-mapped writes to the remote file. + if opts.Private || !opts.MaxPerms.Write { + break + } + fallthrough + case InteropModeShared: + // All mappings require a host FD to be coherent with other filesystem + // users. + if d.fs.opts.forcePageCache { + // Whether or not we have a host FD, we're not allowed to use it. + return syserror.ENODEV + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + if !haveFD { + return syserror.ENODEV + } + default: + panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) + } + return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) +} + +func (d *dentry) mayCachePages() bool { + if d.fs.opts.interop == InteropModeShared { + return false + } + if d.fs.opts.forcePageCache { + return true + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + return haveFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + mapped := d.mappings.AddMapping(ms, ar, offset, writable) + // Do this unconditionally since whether we have a host FD can change + // across save/restore. + for _, r := range mapped { + d.pf.hostFileMapper.IncRefOn(r) + } + if d.mayCachePages() { + // d.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := d.fs.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + } + d.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + d.pf.hostFileMapper.DecRefOn(r) + } + if d.mayCachePages() { + // Pages that are no longer referenced by any application memory + // mappings are now considered unused; allow MemoryFile to evict them + // when necessary. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + for _, r := range unmapped { + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + d.dirty.AllowClean(r) + mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + d.dataMu.Unlock() + } + d.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return d.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.handleMu.RLock() + if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + d.handleMu.RUnlock() + mr := optional + if d.fs.opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: &d.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil + } + + d.dataMu.Lock() + + // Constrain translations to d.size (rounded up) to prevent translation to + // pages that may be concurrently truncated. + pgend := pageRoundUp(d.size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + d.dataMu.Unlock() + d.handleMu.RUnlock() + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mf := d.fs.mfp.MemoryFile() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + // TODO(jamieliu): Make Translations writable even if writability is + // not required if already kept-dirty by another writable translation. + perms := usermem.AccessType{ + Read: true, + Execute: true, + } + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + d.dirty.KeepDirty(segMR) + perms.Write = true + } + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mf, + Offset: seg.FileRangeOf(segMR).Start, + Perms: perms, + }) + translatedEnd = segMR.End + } + + d.dataMu.Unlock() + d.handleMu.RUnlock() + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Write the cache's contents back to the remote file so that if we have a + // host fd after restore, the remote file's contents are coherent. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + + return nil +} + +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + d.cache.Drop(mgapMR, mf) + d.dirty.KeepClean(mgapMR) + } +} + +// dentryPlatformFile implements platform.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// +// dentryPlatformFile is only used when a host FD representing the remote file +// is available (i.e. dentry.handle.fd >= 0), and that FD is used for +// application memory mappings (i.e. !filesystem.opts.forcePageCache). +type dentryPlatformFile struct { + *dentry + + // fdRefs counts references on platform.File offsets. fdRefs is protected + // by dentry.dataMu. + fdRefs fsutil.FrameRefSet + + // If this dentry represents a regular file, and handle.fd >= 0, + // hostFileMapper caches mappings of handle.fd. + hostFileMapper fsutil.HostFileMapper +} + +// IncRef implements platform.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { + d.dataMu.Lock() + seg, gap := d.fdRefs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = d.fdRefs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + d.fdRefs.MergeAdjacent(fr) + d.dataMu.Unlock() + return + } + } +} + +// DecRef implements platform.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { + d.dataMu.Lock() + seg := d.fdRefs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = d.fdRefs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = d.fdRefs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + d.fdRefs.MergeAdjacent(fr) + d.dataMu.Unlock() + +} + +// MapInternal implements platform.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + d.handleMu.RLock() + bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) + d.handleMu.RUnlock() + return bs, err +} + +// FD implements platform.File.FD. +func (d *dentryPlatformFile) FD() int { + d.handleMu.RLock() + fd := d.handle.fd + d.handleMu.RUnlock() + return int(fd) +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go new file mode 100644 index 000000000..08c691c47 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -0,0 +1,159 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// specialFileFD implements vfs.FileDescriptionImpl for files other than +// regular files, directories, and symlinks: pipes, sockets, etc. It is also +// used for regular files when filesystemOptions.specialRegularFiles is in +// effect. specialFileFD differs from regularFileFD by using per-FD handles +// instead of shared per-dentry handles, and never buffering I/O. +type specialFileFD struct { + fileDescription + + // handle is immutable. + handle handle + + // off is the file offset. off is protected by mu. (POSIX 2.9.7 only + // requires operations using the file offset to be atomic for regular files + // and symlinks; however, since specialFileFD may be used for regular + // files, we apply this atomicity unconditionally.) + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *specialFileFD) Release() { + fd.handle.close(context.Background()) + fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) + fs.syncMu.Lock() + delete(fs.specialFileFDs, fd) + fs.syncMu.Unlock() +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *specialFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + return fd.handle.file.flush(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Going through dst.CopyOutFrom() holds MM locks around file operations of + // unknown duration. For regularFileFD, doing so is necessary to support + // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't + // hold here since specialFileFD doesn't client-cache data. Just buffer the + // read instead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchAtime(ctx, fd.vfsfd.Mount()) + } + buf := make([]byte, dst.NumBytes()) + n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if n == 0 { + return 0, err + } + if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { + return int64(cp), cperr + } + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Do a buffered write. See rationale in PRead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchCMtime(ctx) + } + buf := make([]byte, src.NumBytes()) + // Don't do partial writes if we get a partial read from src. + if _, err := src.CopyIn(ctx, buf); err != nil { + return 0, err + } + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + return int64(n), err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not + // clear that file size is even meaningful for these files. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *specialFileFD) Sync(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + return fd.handle.sync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go new file mode 100644 index 000000000..adf43be60 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func (d *dentry) isSymlink() bool { + return d.fileType() == linux.S_IFLNK +} + +// Precondition: d.isSymlink(). +func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if d.fs.opts.interop != InteropModeShared { + d.touchAtime(ctx, mnt) + d.dataMu.Lock() + if d.haveTarget { + target := d.target + d.dataMu.Unlock() + return target, nil + } + } + target, err := d.file.readlink(ctx) + if d.fs.opts.interop != InteropModeShared { + if err == nil { + d.haveTarget = true + d.target = target + } + d.dataMu.Unlock() + } + return target, err +} diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go new file mode 100644 index 000000000..7598ec6a8 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func dentryTimestampFromP9(s, ns uint64) int64 { + return int64(s*1e9 + ns) +} + +func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 { + return ts.Sec*1e9 + int64(ts.Nsec) +} + +func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { + return linux.StatxTimestamp{ + Sec: ns / 1e9, + Nsec: uint32(ns % 1e9), + } +} + +func nowFromContext(ctx context.Context) (int64, bool) { + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + return clock.Now().Nanoseconds(), true + } + return 0, false +} + +// Preconditions: fs.interop != InteropModeShared. +func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now, ok := nowFromContext(ctx) + if !ok { + mnt.EndWrite() + return + } + d.metadataMu.Lock() + atomic.StoreInt64(&d.atime, now) + d.metadataMu.Unlock() + mnt.EndWrite() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime(ctx context.Context) { + now, ok := nowFromContext(ctx) + if !ok { + return + } + d.metadataMu.Lock() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5ee9cf1e9..72bc15264 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -622,7 +622,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if child.inode.isDir() { return syserror.EISDIR } - if !rp.MustBeDir() { + if rp.MustBeDir() { return syserror.ENOTDIR } mnt := rp.Mount() diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index bde4c7a1e..34f63986f 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -126,7 +126,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } return uint64(n), nil } - return readv(s.fd, iovecsFromBlockSeq(dsts)) + return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) })) return int64(n), err } @@ -149,7 +149,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } return uint64(n), nil } - return writev(s.fd, iovecsFromBlockSeq(srcs)) + return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) })) return int64(n), err } @@ -402,7 +402,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // We always do a non-blocking recv*(). sysflags := flags | syscall.MSG_DONTWAIT - iovs := iovecsFromBlockSeq(dsts) + iovs := safemem.IovecsFromBlockSeq(dsts) msg := syscall.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), @@ -522,7 +522,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return uint64(n), nil } - iovs := iovecsFromBlockSeq(srcs) + iovs := safemem.IovecsFromBlockSeq(srcs) msg := syscall.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), @@ -567,21 +567,6 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return int(n), syserr.FromError(err) } -func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec { - iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) - for ; !bs.IsEmpty(); bs = bs.Tail() { - b := bs.Head() - iovs = append(iovs, syscall.Iovec{ - Base: &b.ToSlice()[0], - Len: uint64(b.Len()), - }) - // We don't need to care about b.NeedSafecopy(), because the host - // kernel will handle such address ranges just fine (by returning - // EFAULT). - } - return iovs -} - func translateIOSyscallError(err error) error { if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK { return syserror.ErrWouldBlock -- cgit v1.2.3 From dcffddf0cae026411e7e678744a1e39dc2b513cf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 4 Feb 2020 11:47:41 -0800 Subject: Remove argument from vfs.MountNamespace.DecRef() Updates #1035 PiperOrigin-RevId: 293194631 --- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 2 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 2 +- pkg/sentry/fsimpl/testutil/testutil.go | 2 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 2 +- pkg/sentry/vfs/mount.go | 3 ++- 6 files changed, 8 insertions(+), 7 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index d36fa74fb..e03a0c665 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -86,7 +86,7 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth // Release must be called when a is no longer in use. func (a *Accessor) Release() { a.root.DecRef() - a.mntns.DecRef(a.vfsObj) + a.mntns.DecRef() } // accessorContext implements context.Context by extending an existing diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 82c58c900..73308a2b5 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -45,7 +45,7 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() root := mntns.Root() defer root.DecRef() devpop := vfs.PathOperation{ diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 1c98335c1..69fd84ddd 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { s.Root.DecRef() - s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem. + s.mns.DecRef() // Reference on mns passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 54241c8e8..9fce5e4b4 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -183,7 +183,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -374,7 +374,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 2b52992ea..e9f71e334 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -51,7 +51,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr root := mntns.Root() return vfsObj, root, func() { root.DecRef() - mntns.DecRef(vfsObj) + mntns.DecRef() }, nil } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index d39528051..1fbb420f9 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -423,7 +423,8 @@ func (mntns *MountNamespace) IncRef() { } // DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) { +func (mntns *MountNamespace) DecRef() { + vfs := mntns.root.fs.VirtualFilesystem() if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() -- cgit v1.2.3 From f5072caaf85b9f067d737a874804c04e2b9039b8 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 4 Feb 2020 12:39:58 -0800 Subject: Fix safecopy test. This is failing in Go1.14 due to new checkptr constraints. This version should avoid hitting this constraints by doing "dangerous" pointer math less dangerously? PiperOrigin-RevId: 293205764 --- pkg/safecopy/safecopy_test.go | 88 ++++++++++++++++++++---------------- pkg/safecopy/safecopy_unsafe.go | 98 ++++++++++++++++++++++++++--------------- 2 files changed, 112 insertions(+), 74 deletions(-) (limited to 'pkg') diff --git a/pkg/safecopy/safecopy_test.go b/pkg/safecopy/safecopy_test.go index 5818f7f9b..7f7f69d61 100644 --- a/pkg/safecopy/safecopy_test.go +++ b/pkg/safecopy/safecopy_test.go @@ -138,10 +138,14 @@ func TestSwapUint32Success(t *testing.T) { func TestSwapUint32AlignmentError(t *testing.T) { // Test that SwapUint32 returns an AlignmentError when passed an unaligned // address. - data := new(struct{ val uint64 }) - addr := uintptr(unsafe.Pointer(&data.val)) + 1 - want := AlignmentError{Addr: addr, Alignment: 4} - if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want { + data := make([]byte, 8) // 2 * sizeof(uint32). + alignedIndex := uintptr(0) + if offset := uintptr(unsafe.Pointer(&data[0])) % 4; offset != 0 { + alignedIndex = 4 - offset + } + ptr := unsafe.Pointer(&data[alignedIndex+1]) + want := AlignmentError{Addr: uintptr(ptr), Alignment: 4} + if _, err := SwapUint32(ptr, 1); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) } } @@ -171,10 +175,14 @@ func TestSwapUint64Success(t *testing.T) { func TestSwapUint64AlignmentError(t *testing.T) { // Test that SwapUint64 returns an AlignmentError when passed an unaligned // address. - data := new(struct{ val1, val2 uint64 }) - addr := uintptr(unsafe.Pointer(&data.val1)) + 1 - want := AlignmentError{Addr: addr, Alignment: 8} - if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want { + data := make([]byte, 16) // 2 * sizeof(uint64). + alignedIndex := uintptr(0) + if offset := uintptr(unsafe.Pointer(&data[0])) % 8; offset != 0 { + alignedIndex = 8 - offset + } + ptr := unsafe.Pointer(&data[alignedIndex+1]) + want := AlignmentError{Addr: uintptr(ptr), Alignment: 8} + if _, err := SwapUint64(ptr, 1); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) } } @@ -201,10 +209,14 @@ func TestCompareAndSwapUint32Success(t *testing.T) { func TestCompareAndSwapUint32AlignmentError(t *testing.T) { // Test that CompareAndSwapUint32 returns an AlignmentError when passed an // unaligned address. - data := new(struct{ val uint64 }) - addr := uintptr(unsafe.Pointer(&data.val)) + 1 - want := AlignmentError{Addr: addr, Alignment: 4} - if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want { + data := make([]byte, 8) // 2 * sizeof(uint32). + alignedIndex := uintptr(0) + if offset := uintptr(unsafe.Pointer(&data[0])) % 4; offset != 0 { + alignedIndex = 4 - offset + } + ptr := unsafe.Pointer(&data[alignedIndex+1]) + want := AlignmentError{Addr: uintptr(ptr), Alignment: 4} + if _, err := CompareAndSwapUint32(ptr, 0, 1); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) } } @@ -252,8 +264,8 @@ func TestCopyInSegvError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) dst := randBuf(pageSize) n, err := CopyIn(dst, src) if n != bytesBeforeFault { @@ -276,8 +288,8 @@ func TestCopyInBusError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) dst := randBuf(pageSize) n, err := CopyIn(dst, src) if n != bytesBeforeFault { @@ -300,8 +312,8 @@ func TestCopyOutSegvError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) src := randBuf(pageSize) n, err := CopyOut(dst, src) if n != bytesBeforeFault { @@ -324,8 +336,8 @@ func TestCopyOutBusError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) src := randBuf(pageSize) n, err := CopyOut(dst, src) if n != bytesBeforeFault { @@ -348,8 +360,8 @@ func TestCopySourceSegvError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) dst := randBuf(pageSize) n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) if n != uintptr(bytesBeforeFault) { @@ -372,8 +384,8 @@ func TestCopySourceBusError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) dst := randBuf(pageSize) n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) if n != uintptr(bytesBeforeFault) { @@ -396,8 +408,8 @@ func TestCopyDestinationSegvError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) src := randBuf(pageSize) n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) if n != uintptr(bytesBeforeFault) { @@ -420,8 +432,8 @@ func TestCopyDestinationBusError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) src := randBuf(pageSize) n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) if n != uintptr(bytesBeforeFault) { @@ -444,8 +456,8 @@ func TestZeroOutSegvError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) n, err := ZeroOut(dst, pageSize) if n != uintptr(bytesBeforeFault) { t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) @@ -467,8 +479,8 @@ func TestZeroOutBusError(t *testing.T) { for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) + dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault]) n, err := ZeroOut(dst, pageSize) if n != uintptr(bytesBeforeFault) { t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) @@ -488,7 +500,7 @@ func TestSwapUint32SegvError(t *testing.T) { // Test that SwapUint32 returns a SegvError when reaching a page that // signals SIGSEGV. withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) _, err := SwapUint32(unsafe.Pointer(secondPage), 1) if want := (SegvError{secondPage}); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) @@ -500,7 +512,7 @@ func TestSwapUint32BusError(t *testing.T) { // Test that SwapUint32 returns a BusError when reaching a page that // signals SIGBUS. withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) _, err := SwapUint32(unsafe.Pointer(secondPage), 1) if want := (BusError{secondPage}); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) @@ -512,7 +524,7 @@ func TestSwapUint64SegvError(t *testing.T) { // Test that SwapUint64 returns a SegvError when reaching a page that // signals SIGSEGV. withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) _, err := SwapUint64(unsafe.Pointer(secondPage), 1) if want := (SegvError{secondPage}); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) @@ -524,7 +536,7 @@ func TestSwapUint64BusError(t *testing.T) { // Test that SwapUint64 returns a BusError when reaching a page that // signals SIGBUS. withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) _, err := SwapUint64(unsafe.Pointer(secondPage), 1) if want := (BusError{secondPage}); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) @@ -536,7 +548,7 @@ func TestCompareAndSwapUint32SegvError(t *testing.T) { // Test that CompareAndSwapUint32 returns a SegvError when reaching a page // that signals SIGSEGV. withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) if want := (SegvError{secondPage}); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) @@ -548,7 +560,7 @@ func TestCompareAndSwapUint32BusError(t *testing.T) { // Test that CompareAndSwapUint32 returns a BusError when reaching a page // that signals SIGBUS. withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + secondPage := uintptr(unsafe.Pointer(&mapping[pageSize])) _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) if want := (BusError{secondPage}); err != want { t.Errorf("Unexpected error: got %v, want %v", err, want) diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go index eef028e68..41dd567f3 100644 --- a/pkg/safecopy/safecopy_unsafe.go +++ b/pkg/safecopy/safecopy_unsafe.go @@ -16,6 +16,7 @@ package safecopy import ( "fmt" + "runtime" "syscall" "unsafe" ) @@ -35,7 +36,7 @@ const maxRegisterSize = 16 // successfully copied. // //go:noescape -func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +func memcpy(dst, src uintptr, n uintptr) (fault uintptr, sig int32) // memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS // signal is received during the write, it returns the address that caused the @@ -47,7 +48,7 @@ func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32 // successfully written. // //go:noescape -func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +func memclr(ptr uintptr, n uintptr) (fault uintptr, sig int32) // swapUint32 atomically stores new into *ptr and returns (the previous *ptr // value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the @@ -90,29 +91,35 @@ func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes // copied and an error if SIGSEGV or SIGBUS is received while reading from src. func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { + n, err := copyIn(dst, uintptr(src)) + runtime.KeepAlive(src) + return n, err +} + +// copyIn is the underlying definition for CopyIn. +func copyIn(dst []byte, src uintptr) (int, error) { toCopy := uintptr(len(dst)) if len(dst) == 0 { return 0, nil } - fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy) + fault, sig := memcpy(uintptr(unsafe.Pointer(&dst[0])), src, toCopy) if sig == 0 { return len(dst), nil } - faultN, srcN := uintptr(fault), uintptr(src) - if faultN < srcN || faultN >= srcN+toCopy { - panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy)) + if fault < src || fault >= src+toCopy { + panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, fault, src, src+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. var done int - if faultN-srcN > maxRegisterSize { - done = int(faultN - srcN - maxRegisterSize) + if fault-src > maxRegisterSize { + done = int(fault - src - maxRegisterSize) } - n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done))) + n, err := copyIn(dst[done:int(fault-src)], src+uintptr(done)) done += n if err != nil { return done, err @@ -124,29 +131,35 @@ func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { // bytes done and an error if SIGSEGV or SIGBUS is received while writing to // dst. func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { + n, err := copyOut(uintptr(dst), src) + runtime.KeepAlive(dst) + return n, err +} + +// copyOut is the underlying definition for CopyOut. +func copyOut(dst uintptr, src []byte) (int, error) { toCopy := uintptr(len(src)) if toCopy == 0 { return 0, nil } - fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy) + fault, sig := memcpy(dst, uintptr(unsafe.Pointer(&src[0])), toCopy) if sig == 0 { return len(src), nil } - faultN, dstN := uintptr(fault), uintptr(dst) - if faultN < dstN || faultN >= dstN+toCopy { - panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy)) + if fault < dst || fault >= dst+toCopy { + panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. var done int - if faultN-dstN > maxRegisterSize { - done = int(faultN - dstN - maxRegisterSize) + if fault-dst > maxRegisterSize { + done = int(fault - dst - maxRegisterSize) } - n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)]) + n, err := copyOut(dst+uintptr(done), src[done:int(fault-dst)]) done += n if err != nil { return done, err @@ -161,6 +174,14 @@ func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { // Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, // the resulting contents of dst are unspecified. func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { + n, err := copyN(uintptr(dst), uintptr(src), toCopy) + runtime.KeepAlive(dst) + runtime.KeepAlive(src) + return n, err +} + +// copyN is the underlying definition for Copy. +func copyN(dst, src uintptr, toCopy uintptr) (uintptr, error) { if toCopy == 0 { return 0, nil } @@ -171,17 +192,16 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { } // Did the fault occur while reading from src or writing to dst? - faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst) faultAfterSrc := ^uintptr(0) - if faultN >= srcN { - faultAfterSrc = faultN - srcN + if fault >= src { + faultAfterSrc = fault - src } faultAfterDst := ^uintptr(0) - if faultN >= dstN { - faultAfterDst = faultN - dstN + if fault >= dst { + faultAfterDst = fault - dst } if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { - panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) + panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, fault, src, src+toCopy, dst, dst+toCopy)) } faultedAfter := faultAfterSrc if faultedAfter > faultAfterDst { @@ -195,7 +215,7 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { if faultedAfter > maxRegisterSize { done = faultedAfter - maxRegisterSize } - n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done) + n, err := copyN(dst+done, src+done, faultedAfter-done) done += n if err != nil { return done, err @@ -206,6 +226,13 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { // ZeroOut writes toZero zero bytes to dst. It returns the number of bytes // written and an error if SIGSEGV or SIGBUS is received while writing to dst. func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { + n, err := zeroOut(uintptr(dst), toZero) + runtime.KeepAlive(dst) + return n, err +} + +// zeroOut is the underlying definition for ZeroOut. +func zeroOut(dst uintptr, toZero uintptr) (uintptr, error) { if toZero == 0 { return 0, nil } @@ -215,19 +242,18 @@ func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { return toZero, nil } - faultN, dstN := uintptr(fault), uintptr(dst) - if faultN < dstN || faultN >= dstN+toZero { - panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero)) + if fault < dst || fault >= dst+toZero { + panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toZero)) } // memclr might have ended the write up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to write up to the fault. var done uintptr - if faultN-dstN > maxRegisterSize { - done = faultN - dstN - maxRegisterSize + if fault-dst > maxRegisterSize { + done = fault - dst - maxRegisterSize } - n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done) + n, err := zeroOut(dst+done, fault-dst-done) done += n if err != nil { return done, err @@ -243,7 +269,7 @@ func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { return 0, AlignmentError{addr, 4} } old, sig := swapUint32(ptr, new) - return old, errorFromFaultSignal(ptr, sig) + return old, errorFromFaultSignal(uintptr(ptr), sig) } // SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns @@ -254,7 +280,7 @@ func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { return 0, AlignmentError{addr, 8} } old, sig := swapUint64(ptr, new) - return old, errorFromFaultSignal(ptr, sig) + return old, errorFromFaultSignal(uintptr(ptr), sig) } // CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, @@ -265,7 +291,7 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { return 0, AlignmentError{addr, 4} } prev, sig := compareAndSwapUint32(ptr, old, new) - return prev, errorFromFaultSignal(ptr, sig) + return prev, errorFromFaultSignal(uintptr(ptr), sig) } // LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It @@ -277,17 +303,17 @@ func LoadUint32(ptr unsafe.Pointer) (uint32, error) { return 0, AlignmentError{addr, 4} } val, sig := loadUint32(ptr) - return val, errorFromFaultSignal(ptr, sig) + return val, errorFromFaultSignal(uintptr(ptr), sig) } -func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { +func errorFromFaultSignal(addr uintptr, sig int32) error { switch sig { case 0: return nil case int32(syscall.SIGSEGV): - return SegvError{uintptr(addr)} + return SegvError{addr} case int32(syscall.SIGBUS): - return BusError{uintptr(addr)} + return BusError{addr} default: panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) } -- cgit v1.2.3 From 6823b5e244a5748032130574ae3a25a0a36bbbf5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 4 Feb 2020 13:05:30 -0800 Subject: timer_create(2) should return 0 on success The timer ID is copied out to the argument. Fixes #1738 PiperOrigin-RevId: 293210801 --- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- test/syscalls/linux/timers.cc | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 432351917..a4c400f87 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -146,7 +146,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - return uintptr(id), nil, nil + return 0, nil, nil } // TimerSettime implements linux syscall timer_settime(2). diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc index 3db18d7ac..2f92c27da 100644 --- a/test/syscalls/linux/timers.cc +++ b/test/syscalls/linux/timers.cc @@ -297,9 +297,13 @@ class IntervalTimer { PosixErrorOr TimerCreate(clockid_t clockid, const struct sigevent& sev) { int timerid; - if (syscall(SYS_timer_create, clockid, &sev, &timerid) < 0) { + int ret = syscall(SYS_timer_create, clockid, &sev, &timerid); + if (ret < 0) { return PosixError(errno, "timer_create"); } + if (ret > 0) { + return PosixError(EINVAL, "timer_create should never return positive"); + } MaybeSave(); return IntervalTimer(timerid); } @@ -317,6 +321,18 @@ TEST(IntervalTimerTest, IsInitiallyStopped) { EXPECT_EQ(0, its.it_value.tv_nsec); } +// Kernel can create multiple timers without issue. +// +// Regression test for gvisor.dev/issue/1738. +TEST(IntervalTimerTest, MultipleTimers) { + struct sigevent sev = {}; + sev.sigev_notify = SIGEV_NONE; + const auto timer1 = + ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); + const auto timer2 = + ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); +} + TEST(IntervalTimerTest, SingleShotSilent) { struct sigevent sev = {}; sev.sigev_notify = SIGEV_NONE; -- cgit v1.2.3 From a26a954946ad2e7910d3ad7578960a93b73a1f9b Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 4 Feb 2020 15:20:30 -0800 Subject: Add socket connection stress test. Tests 65k connection attempts on common types of sockets to check for port leaks. Also fixes a bug where dual-stack sockets wouldn't properly re-queue segments received while closing. PiperOrigin-RevId: 293241166 --- pkg/tcpip/transport/tcp/connect.go | 4 ++ test/syscalls/BUILD | 9 +++ test/syscalls/linux/BUILD | 17 +++++ test/syscalls/linux/ip_socket_test_util.cc | 27 +++++++ test/syscalls/linux/ip_socket_test_util.h | 15 ++++ test/syscalls/linux/socket_generic_stress.cc | 83 ++++++++++++++++++++++ test/syscalls/linux/socket_test_util.cc | 101 ++++++++++++++++++++++++--- test/syscalls/linux/socket_test_util.h | 6 ++ 8 files changed, 251 insertions(+), 11 deletions(-) create mode 100644 test/syscalls/linux/socket_generic_stress.cc (limited to 'pkg') diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 9ff7ac261..5c5397823 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -989,6 +989,10 @@ func (e *endpoint) transitionToStateCloseLocked() { // to any other listening endpoint. We reply with RST if we cannot find one. func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route) + if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" { + // Dual-stack socket, try IPv4. + ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route) + } if ep == nil { replyWithReset(s) s.decRef() diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 8f2b75a1c..31d239c0e 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -45,6 +45,15 @@ syscall_test(test = "//test/syscalls/linux:brk_test") syscall_test(test = "//test/syscalls/linux:socket_test") +syscall_test( + size = "large", + shard_count = 50, + # Takes too long for TSAN. Since this is kind of a stress test that doesn't + # involve much concurrency, TSAN's usefulness here is limited anyway. + tags = ["nogotsan"], + test = "//test/syscalls/linux:socket_stress_test", +) + syscall_test( add_overlay = True, test = "//test/syscalls/linux:chdir_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 737e2329f..273b014d6 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -136,6 +136,7 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", + "@com_google_absl//absl/types:optional", "//test/util:file_descriptor", "//test/util:posix_error", "//test/util:temp_path", @@ -2151,6 +2152,22 @@ cc_library( alwayslink = 1, ) +cc_binary( + name = "socket_stress_test", + testonly = 1, + srcs = [ + "socket_generic_stress.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + gtest, + "//test/util:test_main", + "//test/util:test_util", + ], +) + cc_library( name = "socket_unix_dgram_test_cases", testonly = 1, diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 6b472eb2f..bba022a41 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -79,6 +79,33 @@ SocketPairKind DualStackTCPAcceptBindSocketPair(int type) { /* dual_stack = */ true)}; } +SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket"); + return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP, + TCPAcceptBindPersistentListenerSocketPairCreator( + AF_INET6, type | SOCK_STREAM, 0, + /* dual_stack = */ false)}; +} + +SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket"); + return SocketPairKind{description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP, + TCPAcceptBindPersistentListenerSocketPairCreator( + AF_INET, type | SOCK_STREAM, 0, + /* dual_stack = */ false)}; +} + +SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket"); + return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP, + TCPAcceptBindPersistentListenerSocketPairCreator( + AF_INET6, type | SOCK_STREAM, 0, + /* dual_stack = */ true)}; +} + SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) { std::string description = absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket"); diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index 0f58e0f77..083ebbcf0 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -50,6 +50,21 @@ SocketPairKind IPv4TCPAcceptBindSocketPair(int type); // given type bound to the IPv4 loopback. SocketPairKind DualStackTCPAcceptBindSocketPair(int type); +// IPv6TCPAcceptBindPersistentListenerSocketPair is like +// IPv6TCPAcceptBindSocketPair except it uses a persistent listening socket to +// create all socket pairs. +SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type); + +// IPv4TCPAcceptBindPersistentListenerSocketPair is like +// IPv4TCPAcceptBindSocketPair except it uses a persistent listening socket to +// create all socket pairs. +SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type); + +// DualStackTCPAcceptBindPersistentListenerSocketPair is like +// DualStackTCPAcceptBindSocketPair except it uses a persistent listening socket +// to create all socket pairs. +SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type); + // IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents // SocketPairs created with bind() and connect() syscalls with AF_INET6 and the // given type bound to the IPv6 loopback. diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc new file mode 100644 index 000000000..6a232238d --- /dev/null +++ b/test/syscalls/linux/socket_generic_stress.cc @@ -0,0 +1,83 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected sockets. +using ConnectStressTest = SocketPairTest; + +TEST_P(ConnectStressTest, Reset65kTimes) { + for (int i = 0; i < 1 << 16; ++i) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Send some data to ensure that the connection gets reset and the port gets + // released immediately. This avoids either end entering TIME-WAIT. + char sent_data[100] = {}; + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + } +} + +INSTANTIATE_TEST_SUITE_P( + AllConnectedSockets, ConnectStressTest, + ::testing::Values(IPv6UDPBidirectionalBindSocketPair(0), + IPv4UDPBidirectionalBindSocketPair(0), + DualStackUDPBidirectionalBindSocketPair(0), + + // Without REUSEADDR, we get port exhaustion on Linux. + SetSockOpt(SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn)(IPv6TCPAcceptBindSocketPair(0)), + SetSockOpt(SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn)(IPv4TCPAcceptBindSocketPair(0)), + SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)( + DualStackTCPAcceptBindSocketPair(0)))); + +// Test fixture for tests that apply to pairs of connected sockets created with +// a persistent listener (if applicable). +using PersistentListenerConnectStressTest = SocketPairTest; + +TEST_P(PersistentListenerConnectStressTest, 65kTimes) { + for (int i = 0; i < 1 << 16; ++i) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + } +} + +INSTANTIATE_TEST_SUITE_P( + AllConnectedSockets, PersistentListenerConnectStressTest, + ::testing::Values( + IPv6UDPBidirectionalBindSocketPair(0), + IPv4UDPBidirectionalBindSocketPair(0), + DualStackUDPBidirectionalBindSocketPair(0), + + // Without REUSEADDR, we get port exhaustion on Linux. + SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)( + IPv6TCPAcceptBindPersistentListenerSocketPair(0)), + SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)( + IPv4TCPAcceptBindPersistentListenerSocketPair(0)), + SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)( + DualStackTCPAcceptBindPersistentListenerSocketPair(0)))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index eff7d577e..c0c5ab3fe 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -18,10 +18,13 @@ #include #include +#include + #include "gtest/gtest.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/time/clock.h" +#include "absl/types/optional.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" @@ -109,7 +112,10 @@ Creator AcceptBindSocketPairCreator(bool abstract, int domain, MaybeSave(); // Unlinked path. } - return absl::make_unique(connected, accepted, bind_addr, + // accepted is before connected to destruct connected before accepted. + // Destructors for nonstatic member objects are called in the reverse order + // in which they appear in the class declaration. + return absl::make_unique(accepted, connected, bind_addr, extra_addr); }; } @@ -311,11 +317,16 @@ PosixErrorOr BindIP(int fd, bool dual_stack) { } template -PosixErrorOr> CreateTCPAcceptBindSocketPair( - int bound, int connected, int type, bool dual_stack) { - ASSIGN_OR_RETURN_ERRNO(T bind_addr, BindIP(bound, dual_stack)); - RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5)); +PosixErrorOr TCPBindAndListen(int fd, bool dual_stack) { + ASSIGN_OR_RETURN_ERRNO(T addr, BindIP(fd, dual_stack)); + RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5)); + return addr; +} +template +PosixErrorOr> +CreateTCPConnectAcceptSocketPair(int bound, int connected, int type, + bool dual_stack, T bind_addr) { int connect_result = 0; RETURN_ERROR_IF_SYSCALL_FAIL( (connect_result = RetryEINTR(connect)( @@ -358,16 +369,27 @@ PosixErrorOr> CreateTCPAcceptBindSocketPair( absl::SleepFor(absl::Seconds(1)); } - // Cleanup no longer needed resources. - RETURN_ERROR_IF_SYSCALL_FAIL(close(bound)); - MaybeSave(); // Successful close. - T extra_addr = {}; LocalhostAddr(&extra_addr, dual_stack); return absl::make_unique(connected, accepted, bind_addr, extra_addr); } +template +PosixErrorOr> CreateTCPAcceptBindSocketPair( + int bound, int connected, int type, bool dual_stack) { + ASSIGN_OR_RETURN_ERRNO(T bind_addr, TCPBindAndListen(bound, dual_stack)); + + auto result = CreateTCPConnectAcceptSocketPair(bound, connected, type, + dual_stack, bind_addr); + + // Cleanup no longer needed resources. + RETURN_ERROR_IF_SYSCALL_FAIL(close(bound)); + MaybeSave(); // Successful close. + + return result; +} + Creator TCPAcceptBindSocketPairCreator(int domain, int type, int protocol, bool dual_stack) { @@ -389,6 +411,63 @@ Creator TCPAcceptBindSocketPairCreator(int domain, int type, }; } +Creator TCPAcceptBindPersistentListenerSocketPairCreator( + int domain, int type, int protocol, bool dual_stack) { + // These are lazily initialized below, on the first call to the returned + // lambda. These values are private to each returned lambda, but shared across + // invocations of a specific lambda. + // + // The sharing allows pairs created with the same parameters to share a + // listener. This prevents future connects from failing if the connecting + // socket selects a port which had previously been used by a listening socket + // that still has some connections in TIME-WAIT. + // + // The lazy initialization is to avoid creating sockets during parameter + // enumeration. This is important because parameters are enumerated during the + // build process where networking may not be available. + auto listener = std::make_shared>(absl::optional()); + auto addr4 = std::make_shared>( + absl::optional()); + auto addr6 = std::make_shared>( + absl::optional()); + + return [=]() -> PosixErrorOr> { + int connected; + RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + + // Share the listener across invocations. + if (!listener->has_value()) { + int fd = socket(domain, type, protocol); + if (fd < 0) { + return PosixError(errno, absl::StrCat("socket(", domain, ", ", type, + ", ", protocol, ")")); + } + listener->emplace(fd); + MaybeSave(); // Successful socket creation. + } + + // Bind the listener once, but create a new connect/accept pair each + // time. + if (domain == AF_INET) { + if (!addr4->has_value()) { + addr4->emplace( + TCPBindAndListen(listener->value(), dual_stack) + .ValueOrDie()); + } + return CreateTCPConnectAcceptSocketPair(listener->value(), connected, + type, dual_stack, addr4->value()); + } + if (!addr6->has_value()) { + addr6->emplace( + TCPBindAndListen(listener->value(), dual_stack) + .ValueOrDie()); + } + return CreateTCPConnectAcceptSocketPair(listener->value(), connected, type, + dual_stack, addr6->value()); + }; +} + template PosixErrorOr> CreateUDPBoundSocketPair( int sock1, int sock2, int type, bool dual_stack) { @@ -518,8 +597,8 @@ size_t CalculateUnixSockAddrLen(const char* sun_path) { if (sun_path[0] == 0) { return sizeof(sockaddr_un); } - // Filesystem addresses use the address length plus the 2 byte sun_family and - // null terminator. + // Filesystem addresses use the address length plus the 2 byte sun_family + // and null terminator. return strlen(sun_path) + 3; } diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h index 2dbb8bed3..bfaa6e397 100644 --- a/test/syscalls/linux/socket_test_util.h +++ b/test/syscalls/linux/socket_test_util.h @@ -273,6 +273,12 @@ Creator TCPAcceptBindSocketPairCreator(int domain, int type, int protocol, bool dual_stack); +// TCPAcceptBindPersistentListenerSocketPairCreator is like +// TCPAcceptBindSocketPairCreator, except it uses the same listening socket to +// create all SocketPairs. +Creator TCPAcceptBindPersistentListenerSocketPairCreator( + int domain, int type, int protocol, bool dual_stack); + // UDPBidirectionalBindSocketPairCreator returns a Creator that // obtains file descriptors by invoking the bind() and connect() syscalls on UDP // sockets. -- cgit v1.2.3 From 665b614e4a6e715bac25bea15c5c29184016e549 Mon Sep 17 00:00:00 2001 From: Ting-Yu Wang Date: Tue, 4 Feb 2020 18:04:26 -0800 Subject: Support RTM_NEWADDR and RTM_GETLINK in (rt)netlink. PiperOrigin-RevId: 293271055 --- pkg/sentry/inet/inet.go | 4 + pkg/sentry/inet/test_stack.go | 6 + pkg/sentry/socket/hostinet/stack.go | 5 + pkg/sentry/socket/netlink/BUILD | 14 +- pkg/sentry/socket/netlink/message.go | 129 +++++++++++ pkg/sentry/socket/netlink/message_test.go | 312 +++++++++++++++++++++++++++ pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/BUILD | 2 - pkg/sentry/socket/netlink/route/protocol.go | 238 ++++++++++++++------ pkg/sentry/socket/netlink/socket.go | 54 ++--- pkg/sentry/socket/netlink/uevent/protocol.go | 2 +- pkg/sentry/socket/netstack/stack.go | 55 +++++ pkg/tcpip/stack/stack.go | 9 + test/syscalls/linux/BUILD | 2 + test/syscalls/linux/socket_netlink_route.cc | 296 ++++++++++++++++++++----- test/syscalls/linux/socket_netlink_util.cc | 45 +++- test/syscalls/linux/socket_netlink_util.h | 9 + 17 files changed, 1022 insertions(+), 162 deletions(-) create mode 100644 pkg/sentry/socket/netlink/message_test.go (limited to 'pkg') diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index a7dfb78a7..2916a0644 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -28,6 +28,10 @@ type Stack interface { // interface indexes to a slice of associated interface address properties. InterfaceAddrs() map[int32][]InterfaceAddr + // AddInterfaceAddr adds an address to the network interface identified by + // index. + AddInterfaceAddr(idx int32, addr InterfaceAddr) error + // SupportsIPv6 returns true if the stack supports IPv6 connectivity. SupportsIPv6() bool diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index dcfcbd97e..d8961fc94 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr { return s.InterfaceAddrsMap } +// AddInterfaceAddr implements Stack.AddInterfaceAddr. +func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error { + s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr) + return nil +} + // SupportsIPv6 implements Stack.SupportsIPv6. func (s *TestStack) SupportsIPv6() bool { return s.SupportsIPv6Flag diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 034eca676..a48082631 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { return addrs } +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + return syserror.EACCES +} + // SupportsIPv6 implements inet.Stack.SupportsIPv6. func (s *Stack) SupportsIPv6() bool { return s.supportsIPv6 diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index f8b8e467d..1911cd9b8 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -33,3 +33,15 @@ go_library( "//pkg/waiter", ], ) + +go_test( + name = "netlink_test", + size = "small", + srcs = [ + "message_test.go", + ], + deps = [ + ":netlink", + "//pkg/abi/linux", + ], +) diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index b21e0ca4b..4ea252ccb 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -30,8 +30,16 @@ func alignUp(length int, align uint) int { return (length + int(align) - 1) &^ (int(align) - 1) } +// alignPad returns the length of padding required for alignment. +// +// Preconditions: align is a power of two. +func alignPad(length int, align uint) int { + return alignUp(length, align) - length +} + // Message contains a complete serialized netlink message. type Message struct { + hdr linux.NetlinkMessageHeader buf []byte } @@ -40,10 +48,86 @@ type Message struct { // The header length will be updated by Finalize. func NewMessage(hdr linux.NetlinkMessageHeader) *Message { return &Message{ + hdr: hdr, buf: binary.Marshal(nil, usermem.ByteOrder, hdr), } } +// ParseMessage parses the first message seen at buf, returning the rest of the +// buffer. If message is malformed, ok of false is returned. For last message, +// padding check is loose, if there isn't enought padding, whole buf is consumed +// and ok is set to true. +func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) { + b := BytesView(buf) + + hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize) + if !ok { + return + } + var hdr linux.NetlinkMessageHeader + binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr) + + // Msg portion. + totalMsgLen := int(hdr.Length) + _, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize) + if !ok { + return + } + + // Padding. + numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO) + // Linux permits the last message not being aligned, just consume all of it. + // Ref: net/netlink/af_netlink.c:netlink_rcv_skb + if numPad > len(b) { + numPad = len(b) + } + _, ok = b.Extract(numPad) + if !ok { + return + } + + return &Message{ + hdr: hdr, + buf: buf[:totalMsgLen], + }, []byte(b), true +} + +// Header returns the header of this message. +func (m *Message) Header() linux.NetlinkMessageHeader { + return m.hdr +} + +// GetData unmarshals the payload message header from this netlink message, and +// returns the attributes portion. +func (m *Message) GetData(msg interface{}) (AttrsView, bool) { + b := BytesView(m.buf) + + _, ok := b.Extract(linux.NetlinkMessageHeaderSize) + if !ok { + return nil, false + } + + size := int(binary.Size(msg)) + msgBytes, ok := b.Extract(size) + if !ok { + return nil, false + } + binary.Unmarshal(msgBytes, usermem.ByteOrder, msg) + + numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO) + // Linux permits the last message not being aligned, just consume all of it. + // Ref: net/netlink/af_netlink.c:netlink_rcv_skb + if numPad > len(b) { + numPad = len(b) + } + _, ok = b.Extract(numPad) + if !ok { + return nil, false + } + + return AttrsView(b), true +} + // Finalize returns the []byte containing the entire message, with the total // length set in the message header. The Message must not be modified after // calling Finalize. @@ -157,3 +241,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { ms.Messages = append(ms.Messages, m) return m } + +// AttrsView is a view into the attributes portion of a netlink message. +type AttrsView []byte + +// Empty returns whether there is no attribute left in v. +func (v AttrsView) Empty() bool { + return len(v) == 0 +} + +// ParseFirst parses first netlink attribute at the beginning of v. +func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) { + b := BytesView(v) + + hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize) + if !ok { + return + } + binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr) + + value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize) + if !ok { + return + } + + _, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO)) + if !ok { + return + } + + return hdr, value, AttrsView(b), ok +} + +// BytesView supports extracting data from a byte slice with bounds checking. +type BytesView []byte + +// Extract removes the first n bytes from v and returns it. If n is out of +// bounds, it returns false. +func (v *BytesView) Extract(n int) ([]byte, bool) { + if n < 0 || n > len(*v) { + return nil, false + } + extracted := (*v)[:n] + *v = (*v)[n:] + return extracted, true +} diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go new file mode 100644 index 000000000..ef13d9386 --- /dev/null +++ b/pkg/sentry/socket/netlink/message_test.go @@ -0,0 +1,312 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package message_test + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink" +) + +type dummyNetlinkMsg struct { + Foo uint16 +} + +func TestParseMessage(t *testing.T) { + tests := []struct { + desc string + input []byte + + header linux.NetlinkMessageHeader + dataMsg *dummyNetlinkMsg + restLen int + ok bool + }{ + { + desc: "valid", + input: []byte{ + 0x14, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + header: linux.NetlinkMessageHeader{ + Length: 20, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "valid with next message", + input: []byte{ + 0x14, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + 0xFF, // Next message (rest) + }, + header: linux.NetlinkMessageHeader{ + Length: 20, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 1, + ok: true, + }, + { + desc: "valid for last message without padding", + input: []byte{ + 0x12, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, // Data message + }, + header: linux.NetlinkMessageHeader{ + Length: 18, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "valid for last message not to be aligned", + input: []byte{ + 0x13, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, // Data message + 0x00, // Excessive 1 byte permitted at end + }, + header: linux.NetlinkMessageHeader{ + Length: 19, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "header.Length too short", + input: []byte{ + 0x04, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + ok: false, + }, + { + desc: "header.Length too long", + input: []byte{ + 0xFF, 0xFF, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + ok: false, + }, + { + desc: "header incomplete", + input: []byte{ + 0x04, 0x00, 0x00, 0x00, // Length + }, + ok: false, + }, + { + desc: "empty message", + input: []byte{}, + ok: false, + }, + } + for _, test := range tests { + msg, rest, ok := netlink.ParseMessage(test.input) + if ok != test.ok { + t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok) + continue + } + if !test.ok { + continue + } + if !reflect.DeepEqual(msg.Header(), test.header) { + t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header) + } + + dataMsg := &dummyNetlinkMsg{} + _, dataOk := msg.GetData(dataMsg) + if !dataOk { + t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk) + } else if !reflect.DeepEqual(dataMsg, test.dataMsg) { + t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg) + } + + if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) { + t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want) + } + } +} + +func TestAttrView(t *testing.T) { + tests := []struct { + desc string + input []byte + + // Outputs for ParseFirst. + hdr linux.NetlinkAttrHeader + value []byte + restLen int + ok bool + + // Outputs for Empty. + isEmpty bool + }{ + { + desc: "valid", + input: []byte{ + 0x06, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding + }, + hdr: linux.NetlinkAttrHeader{ + Length: 6, + Type: 1, + }, + value: []byte{0x30, 0x31}, + restLen: 0, + ok: true, + isEmpty: false, + }, + { + desc: "at alignment", + input: []byte{ + 0x08, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + hdr: linux.NetlinkAttrHeader{ + Length: 8, + Type: 1, + }, + value: []byte{0x30, 0x31, 0x32, 0x33}, + restLen: 0, + ok: true, + isEmpty: false, + }, + { + desc: "at alignment with rest data", + input: []byte{ + 0x08, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + 0xFF, 0xFE, // Rest data + }, + hdr: linux.NetlinkAttrHeader{ + Length: 8, + Type: 1, + }, + value: []byte{0x30, 0x31, 0x32, 0x33}, + restLen: 2, + ok: true, + isEmpty: false, + }, + { + desc: "hdr.Length too long", + input: []byte{ + 0xFF, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + ok: false, + isEmpty: false, + }, + { + desc: "hdr.Length too short", + input: []byte{ + 0x01, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + ok: false, + isEmpty: false, + }, + { + desc: "empty", + input: []byte{}, + ok: false, + isEmpty: true, + }, + } + for _, test := range tests { + attrs := netlink.AttrsView(test.input) + + // Test ParseFirst(). + hdr, value, rest, ok := attrs.ParseFirst() + if ok != test.ok { + t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok) + } else if test.ok { + if !reflect.DeepEqual(hdr, test.hdr) { + t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr) + } + if !bytes.Equal(value, test.value) { + t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value) + } + if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) { + t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest) + } + } + + // Test Empty(). + if got, want := attrs.Empty(), test.isEmpty; got != want { + t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want) + } + } +} diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 07f860a49..b0dc70e5c 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -42,7 +42,7 @@ type Protocol interface { // If err == nil, any messages added to ms will be sent back to the // other end of the socket. Setting ms.Multi will cause an NLMSG_DONE // message to be sent even if ms contains no messages. - ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error + ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error } // Provider is a function that creates a new Protocol for a specific netlink diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 622a1eafc..93127398d 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -10,13 +10,11 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/netlink", "//pkg/syserr", - "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 2b3c7f5b3..c84d8bd7c 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -17,16 +17,15 @@ package route import ( "bytes" + "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" ) // commandKind describes the operational class of a message type. @@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool { } // dumpLinks handles RTM_GETLINK dump requests. -func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { - // TODO(b/68878065): Only the dump variant of the types below are - // supported. - if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { - return syserr.ErrNotSupported - } - +func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some // userspace applications (including glibc) still include rtgenmsg. @@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader return nil } - for id, i := range stack.Interfaces() { - m := ms.AddMessage(linux.NetlinkMessageHeader{ - Type: linux.RTM_NEWLINK, - }) + for idx, i := range stack.Interfaces() { + addNewLinkMessage(ms, idx, i) + } - m.Put(linux.InterfaceInfoMessage{ - Family: linux.AF_UNSPEC, - Type: i.DeviceType, - Index: id, - Flags: i.Flags, - }) + return nil +} - m.PutAttrString(linux.IFLA_IFNAME, i.Name) - m.PutAttr(linux.IFLA_MTU, i.MTU) +// getLinks handles RTM_GETLINK requests. +func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. + return nil + } - mac := make([]byte, 6) - brd := mac - if len(i.Addr) > 0 { - mac = i.Addr - brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) + // Parse message. + var ifi linux.InterfaceInfoMessage + attrs, ok := msg.GetData(&ifi) + if !ok { + return syserr.ErrInvalidArgument + } + + // Parse attributes. + var byName []byte + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument } - m.PutAttr(linux.IFLA_ADDRESS, mac) - m.PutAttr(linux.IFLA_BROADCAST, brd) + attrs = rest - // TODO(gvisor.dev/issue/578): There are many more attributes. + switch ahdr.Type { + case linux.IFLA_IFNAME: + if len(value) < 1 { + return syserr.ErrInvalidArgument + } + byName = value[:len(value)-1] + + // TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK. + } } + found := false + for idx, i := range stack.Interfaces() { + switch { + case ifi.Index > 0: + if idx != ifi.Index { + continue + } + case byName != nil: + if string(byName) != i.Name { + continue + } + default: + // Criteria not specified. + return syserr.ErrInvalidArgument + } + + addNewLinkMessage(ms, idx, i) + found = true + break + } + if !found { + return syserr.ErrNoDevice + } return nil } -// dumpAddrs handles RTM_GETADDR dump requests. -func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { - // TODO(b/68878065): Only the dump variant of the types below are - // supported. - if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { - return syserr.ErrNotSupported +// addNewLinkMessage appends RTM_NEWLINK message for the given interface into +// the message set. +func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWLINK, + }) + + m.Put(linux.InterfaceInfoMessage{ + Family: linux.AF_UNSPEC, + Type: i.DeviceType, + Index: idx, + Flags: i.Flags, + }) + + m.PutAttrString(linux.IFLA_IFNAME, i.Name) + m.PutAttr(linux.IFLA_MTU, i.MTU) + + mac := make([]byte, 6) + brd := mac + if len(i.Addr) > 0 { + mac = i.Addr + brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) } + m.PutAttr(linux.IFLA_ADDRESS, mac) + m.PutAttr(linux.IFLA_BROADCAST, brd) + + // TODO(gvisor.dev/issue/578): There are many more attributes. +} +// dumpAddrs handles RTM_GETADDR dump requests. +func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // RTM_GETADDR dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. @@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader Index: uint32(id), }) + m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr)) m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) // TODO(gvisor.dev/issue/578): There are many more attributes. @@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) { } // parseForDestination parses a message as format of RouteMessage-RtAttr-dst. -func parseForDestination(data []byte) ([]byte, *syserr.Error) { +func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) { var rtMsg linux.RouteMessage - if len(data) < linux.SizeOfRouteMessage { + attrs, ok := msg.GetData(&rtMsg) + if !ok { return nil, syserr.ErrInvalidArgument } - binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg) // iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See // commit bc234301af12. Note we don't check this flag for backward // compatibility. @@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) { return nil, syserr.ErrNotSupported } - data = data[linux.SizeOfRouteMessage:] - - // TODO(gvisor.dev/issue/1611): Add generic attribute parsing. - var rtAttr linux.RtAttr - if len(data) < linux.SizeOfRtAttr { - return nil, syserr.ErrInvalidArgument + // Expect first attribute is RTA_DST. + if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST { + return value, nil } - binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr) - if rtAttr.Type != linux.RTA_DST { - return nil, syserr.ErrInvalidArgument - } - - if len(data) < int(rtAttr.Len) { - return nil, syserr.ErrInvalidArgument - } - return data[linux.SizeOfRtAttr:rtAttr.Len], nil + return nil, syserr.ErrInvalidArgument } // dumpRoutes handles RTM_GETROUTE requests. -func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { +func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // RTM_GETROUTE dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. @@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade return nil } + hdr := msg.Header() routeTables := stack.RouteTable() if hdr.Flags == linux.NLM_F_REQUEST { - dst, err := parseForDestination(data) + dst, err := parseForDestination(msg) if err != nil { return err } @@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade return nil } +// newAddr handles RTM_NEWADDR requests. +func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network stack. + return syserr.ErrProtocolNotSupported + } + + var ifa linux.InterfaceAddrMessage + attrs, ok := msg.GetData(&ifa) + if !ok { + return syserr.ErrInvalidArgument + } + + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument + } + attrs = rest + + switch ahdr.Type { + case linux.IFA_LOCAL: + err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ + Family: ifa.Family, + PrefixLen: ifa.PrefixLen, + Flags: ifa.Flags, + Addr: value, + }) + if err == syscall.EEXIST { + flags := msg.Header().Flags + if flags&linux.NLM_F_EXCL != 0 { + return syserr.ErrExists + } + } else if err != nil { + return syserr.ErrInvalidArgument + } + } + } + return nil +} + // ProcessMessage implements netlink.Protocol.ProcessMessage. -func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { +func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + hdr := msg.Header() + // All messages start with a 1 byte protocol family. - if len(data) < 1 { + var family uint8 + if _, ok := msg.GetData(&family); !ok { // Linux ignores messages missing the protocol family. See // net/core/rtnetlink.c:rtnetlink_rcv_msg. return nil @@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH } } - switch hdr.Type { - case linux.RTM_GETLINK: - return p.dumpLinks(ctx, hdr, data, ms) - case linux.RTM_GETADDR: - return p.dumpAddrs(ctx, hdr, data, ms) - case linux.RTM_GETROUTE: - return p.dumpRoutes(ctx, hdr, data, ms) - default: - return syserr.ErrNotSupported + if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { + // TODO(b/68878065): Only the dump variant of the types below are + // supported. + switch hdr.Type { + case linux.RTM_GETLINK: + return p.dumpLinks(ctx, msg, ms) + case linux.RTM_GETADDR: + return p.dumpAddrs(ctx, msg, ms) + case linux.RTM_GETROUTE: + return p.dumpRoutes(ctx, msg, ms) + default: + return syserr.ErrNotSupported + } + } else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST { + switch hdr.Type { + case linux.RTM_GETLINK: + return p.getLink(ctx, msg, ms) + case linux.RTM_GETROUTE: + return p.dumpRoutes(ctx, msg, ms) + case linux.RTM_NEWADDR: + return p.newAddr(ctx, msg, ms) + default: + return syserr.ErrNotSupported + } } + return syserr.ErrNotSupported } // init registers the NETLINK_ROUTE provider. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index c4b95debb..2ca02567d 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error return nil } -func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error { +func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_ERROR, }) - m.Put(linux.NetlinkErrorMessage{ Error: int32(-err.ToLinux().Number()), Header: hdr, }) - return nil +} +func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.NLMSG_ERROR, + }) + m.Put(linux.NetlinkErrorMessage{ + Error: 0, + Header: hdr, + }) } // processMessages handles each message in buf, passing it to the protocol // handler for final handling. func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { - if len(buf) < linux.NetlinkMessageHeaderSize { + msg, rest, ok := ParseMessage(buf) + if !ok { // Linux ignores messages that are too short. See // net/netlink/af_netlink.c:netlink_rcv_skb. break } - - var hdr linux.NetlinkMessageHeader - binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr) - - if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) { - // Linux ignores malformed messages. See - // net/netlink/af_netlink.c:netlink_rcv_skb. - break - } - - // Data from this message. - data := buf[linux.NetlinkMessageHeaderSize:hdr.Length] - - // Advance to the next message. - next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO) - if next >= len(buf)-1 { - next = len(buf) - 1 - } - buf = buf[next:] + buf = rest + hdr := msg.Header() // Ignore control messages. if hdr.Type < linux.NLMSG_MIN_TYPE { @@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } ms := NewMessageSet(s.portID, hdr.Seq) - var err *syserr.Error - // TODO(b/68877377): ACKs not supported yet. - if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { - err = syserr.ErrNotSupported - } else { - - err = s.protocol.ProcessMessage(ctx, hdr, data, ms) - } - if err != nil { - ms = NewMessageSet(s.portID, hdr.Seq) - if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil { - return err - } + if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil { + dumpErrorMesage(hdr, ms, err) + } else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { + dumpAckMesage(hdr, ms) } if err := s.sendResponse(ctx, ms); err != nil { diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go index 1ee4296bc..029ba21b5 100644 --- a/pkg/sentry/socket/netlink/uevent/protocol.go +++ b/pkg/sentry/socket/netlink/uevent/protocol.go @@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool { } // ProcessMessage implements netlink.Protocol.ProcessMessage. -func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { +func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // Silently ignore all messages. return nil } diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 31ea66eca..0692482e9 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -20,6 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { return nicAddrs } +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + var ( + protocol tcpip.NetworkProtocolNumber + address tcpip.Address + ) + switch addr.Family { + case linux.AF_INET: + if len(addr.Addr) < header.IPv4AddressSize { + return syserror.EINVAL + } + if addr.PrefixLen > header.IPv4AddressSize*8 { + return syserror.EINVAL + } + protocol = ipv4.ProtocolNumber + address = tcpip.Address(addr.Addr[:header.IPv4AddressSize]) + + case linux.AF_INET6: + if len(addr.Addr) < header.IPv6AddressSize { + return syserror.EINVAL + } + if addr.PrefixLen > header.IPv6AddressSize*8 { + return syserror.EINVAL + } + protocol = ipv6.ProtocolNumber + address = tcpip.Address(addr.Addr[:header.IPv6AddressSize]) + + default: + return syserror.ENOTSUP + } + + protocolAddress := tcpip.ProtocolAddress{ + Protocol: protocol, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address, + PrefixLen: int(addr.PrefixLen), + }, + } + + // Attach address to interface. + if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + return syserr.TranslateNetstackError(err).ToError() + } + + // Add route for local network. + s.Stack.AddRoute(tcpip.Route{ + Destination: protocolAddress.AddressWithPrefix.Subnet(), + Gateway: "", // No gateway for local network. + NIC: tcpip.NICID(idx), + }) + return nil +} + // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { var rs tcp.ReceiveBufferSizeOption diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 7057b110e..b793f1d74 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -795,6 +795,8 @@ func (s *Stack) Forwarding() bool { // SetRouteTable assigns the route table to be used by this stack. It // specifies which NIC to use for given destination address ranges. +// +// This method takes ownership of the table. func (s *Stack) SetRouteTable(table []tcpip.Route) { s.mu.Lock() defer s.mu.Unlock() @@ -809,6 +811,13 @@ func (s *Stack) GetRouteTable() []tcpip.Route { return append([]tcpip.Route(nil), s.routeTable...) } +// AddRoute appends a route to the route table. +func (s *Stack) AddRoute(route tcpip.Route) { + s.mu.Lock() + defer s.mu.Unlock() + s.routeTable = append(s.routeTable, route) +} + // NewEndpoint creates a new transport layer endpoint of the given protocol. func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { t, ok := s.transportProtocols[transport] diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 273b014d6..f2e3c7072 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2769,9 +2769,11 @@ cc_binary( deps = [ ":socket_netlink_util", ":socket_test_util", + "//test/util:capability_util", "//test/util:cleanup", "//test/util:file_descriptor", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", gtest, "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index 1e28e658d..e5aed1eec 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -25,8 +26,10 @@ #include "gtest/gtest.h" #include "absl/strings/str_format.h" +#include "absl/types/optional.h" #include "test/syscalls/linux/socket_netlink_util.h" #include "test/syscalls/linux/socket_test_util.h" +#include "test/util/capability_util.h" #include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/test_util.h" @@ -38,6 +41,8 @@ namespace testing { namespace { +constexpr uint32_t kSeq = 12345; + using ::testing::AnyOf; using ::testing::Eq; @@ -113,58 +118,224 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) { // TODO(mpratt): Check ifinfomsg contents and following attrs. } +PosixError DumpLinks( + const FileDescriptor& fd, uint32_t seq, + const std::function& fn) { + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = seq; + req.ifm.ifi_family = AF_UNSPEC; + + return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false); +} + TEST(NetlinkRouteTest, GetLinkDump) { FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get())); + // Loopback is common among all tests, check that it's found. + bool loopbackFound = false; + ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) { + CheckGetLinkResponse(hdr, kSeq, port); + if (hdr->nlmsg_type != RTM_NEWLINK) { + return; + } + ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); + const struct ifinfomsg* msg = + reinterpret_cast(NLMSG_DATA(hdr)); + std::cout << "Found interface idx=" << msg->ifi_index + << ", type=" << std::hex << msg->ifi_type; + if (msg->ifi_type == ARPHRD_LOOPBACK) { + loopbackFound = true; + EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0); + } + })); + EXPECT_TRUE(loopbackFound); +} + +struct Link { + int index; + std::string name; +}; + +PosixErrorOr> FindLoopbackLink() { + ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE)); + + absl::optional link; + RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) { + if (hdr->nlmsg_type != RTM_NEWLINK || + hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) { + return; + } + const struct ifinfomsg* msg = + reinterpret_cast(NLMSG_DATA(hdr)); + if (msg->ifi_type == ARPHRD_LOOPBACK) { + const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME); + if (rta == nullptr) { + // Ignore links that do not have a name. + return; + } + + link = Link(); + link->index = msg->ifi_index; + link->name = std::string(reinterpret_cast(RTA_DATA(rta))); + } + })); + return link; +} + +// CheckLinkMsg checks a netlink message against an expected link. +void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) { + ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK)); + ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); + const struct ifinfomsg* msg = + reinterpret_cast(NLMSG_DATA(hdr)); + EXPECT_EQ(msg->ifi_index, link.index); + + const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME); + EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message."; + if (rta != nullptr) { + std::string name(reinterpret_cast(RTA_DATA(rta))); + EXPECT_EQ(name, link.name); + } +} + +TEST(NetlinkRouteTest, GetLinkByIndex) { + absl::optional loopback_link = + ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink()); + ASSERT_TRUE(loopback_link.has_value()); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + struct request { struct nlmsghdr hdr; struct ifinfomsg ifm; }; - constexpr uint32_t kSeq = 12345; - struct request req = {}; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETLINK; - req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_flags = NLM_F_REQUEST; req.hdr.nlmsg_seq = kSeq; req.ifm.ifi_family = AF_UNSPEC; + req.ifm.ifi_index = loopback_link->index; - // Loopback is common among all tests, check that it's found. - bool loopbackFound = false; + bool found = false; ASSERT_NO_ERRNO(NetlinkRequestResponse( fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) { - CheckGetLinkResponse(hdr, kSeq, port); - if (hdr->nlmsg_type != RTM_NEWLINK) { - return; - } - ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); - const struct ifinfomsg* msg = - reinterpret_cast(NLMSG_DATA(hdr)); - std::cout << "Found interface idx=" << msg->ifi_index - << ", type=" << std::hex << msg->ifi_type; - if (msg->ifi_type == ARPHRD_LOOPBACK) { - loopbackFound = true; - EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0); - } + CheckLinkMsg(hdr, *loopback_link); + found = true; }, false)); - EXPECT_TRUE(loopbackFound); + EXPECT_TRUE(found) << "Netlink response does not contain any links."; } -TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) { +TEST(NetlinkRouteTest, GetLinkByName) { + absl::optional loopback_link = + ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink()); + ASSERT_TRUE(loopback_link.has_value()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); struct request { struct nlmsghdr hdr; struct ifinfomsg ifm; + struct rtattr rtattr; + char ifname[IFNAMSIZ]; + char pad[NLMSG_ALIGNTO + RTA_ALIGNTO]; }; - constexpr uint32_t kSeq = 12345; + struct request req = {}; + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.rtattr.rta_type = IFLA_IFNAME; + req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1); + strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname)); + req.hdr.nlmsg_len = + NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len); + + bool found = false; + ASSERT_NO_ERRNO(NetlinkRequestResponse( + fd, &req, sizeof(req), + [&](const struct nlmsghdr* hdr) { + CheckLinkMsg(hdr, *loopback_link); + found = true; + }, + false)); + EXPECT_TRUE(found) << "Netlink response does not contain any links."; +} + +TEST(NetlinkRouteTest, GetLinkByIndexNotFound) { + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.ifm.ifi_index = 1234590; + + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(ENODEV, ::testing::_)); +} + +TEST(NetlinkRouteTest, GetLinkByNameNotFound) { + const std::string name = "nodevice?!"; + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + struct rtattr rtattr; + char ifname[IFNAMSIZ]; + char pad[NLMSG_ALIGNTO + RTA_ALIGNTO]; + }; + + struct request req = {}; + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.rtattr.rta_type = IFLA_IFNAME; + req.rtattr.rta_len = RTA_LENGTH(name.size() + 1); + strncpy(req.ifname, name.c_str(), sizeof(req.ifname)); + req.hdr.nlmsg_len = + NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len); + + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(ENODEV, ::testing::_)); +} + +TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) { + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; struct request req = {}; req.hdr.nlmsg_len = sizeof(req); @@ -175,18 +346,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) { req.hdr.nlmsg_seq = kSeq; req.ifm.ifi_family = AF_UNSPEC; - ASSERT_NO_ERRNO(NetlinkRequestResponse( - fd, &req, sizeof(req), - [&](const struct nlmsghdr* hdr) { - EXPECT_THAT(hdr->nlmsg_type, Eq(NLMSG_ERROR)); - EXPECT_EQ(hdr->nlmsg_seq, kSeq); - EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr)); - - const struct nlmsgerr* msg = - reinterpret_cast(NLMSG_DATA(hdr)); - EXPECT_EQ(msg->error, -EOPNOTSUPP); - }, - true)); + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(EOPNOTSUPP, ::testing::_)); } TEST(NetlinkRouteTest, MsgHdrMsgTrunc) { @@ -198,8 +359,6 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) { struct ifinfomsg ifm; }; - constexpr uint32_t kSeq = 12345; - struct request req = {}; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETLINK; @@ -238,8 +397,6 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) { struct ifinfomsg ifm; }; - constexpr uint32_t kSeq = 12345; - struct request req = {}; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETLINK; @@ -282,8 +439,6 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) { struct ifinfomsg ifm; }; - constexpr uint32_t kSeq = 12345; - struct request req = {}; // This control message is ignored. We still receive a response for the @@ -317,8 +472,6 @@ TEST(NetlinkRouteTest, GetAddrDump) { struct rtgenmsg rgm; }; - constexpr uint32_t kSeq = 12345; - struct request req; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETADDR; @@ -367,6 +520,57 @@ TEST(NetlinkRouteTest, LookupAll) { ASSERT_GT(count, 0); } +TEST(NetlinkRouteTest, AddAddr) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + + absl::optional loopback_link = + ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink()); + ASSERT_TRUE(loopback_link.has_value()); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifaddrmsg ifa; + struct rtattr rtattr; + struct in_addr addr; + char pad[NLMSG_ALIGNTO + RTA_ALIGNTO]; + }; + + struct request req = {}; + req.hdr.nlmsg_type = RTM_NEWADDR; + req.hdr.nlmsg_seq = kSeq; + req.ifa.ifa_family = AF_INET; + req.ifa.ifa_prefixlen = 24; + req.ifa.ifa_flags = 0; + req.ifa.ifa_scope = 0; + req.ifa.ifa_index = loopback_link->index; + req.rtattr.rta_type = IFA_LOCAL; + req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr)); + inet_pton(AF_INET, "10.0.0.1", &req.addr); + req.hdr.nlmsg_len = + NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len); + + // Create should succeed, as no such address in kernel. + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; + EXPECT_NO_ERRNO( + NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len)); + + // Replace an existing address should succeed. + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK; + req.hdr.nlmsg_seq++; + EXPECT_NO_ERRNO( + NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len)); + + // Create exclusive should fail, as we created the address above. + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK; + req.hdr.nlmsg_seq++; + EXPECT_THAT( + NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len), + PosixErrorIs(EEXIST, ::testing::_)); +} + // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request. TEST(NetlinkRouteTest, GetRouteDump) { FileDescriptor fd = @@ -378,8 +582,6 @@ TEST(NetlinkRouteTest, GetRouteDump) { struct rtmsg rtm; }; - constexpr uint32_t kSeq = 12345; - struct request req = {}; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETROUTE; @@ -538,8 +740,6 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) { struct rtgenmsg rgm; }; - constexpr uint32_t kSeq = 12345; - struct request req; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETADDR; @@ -615,8 +815,6 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) { struct rtgenmsg rgm; }; - constexpr uint32_t kSeq = 12345; - struct request req; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETADDR; @@ -695,8 +893,6 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) { struct rtgenmsg rgm; }; - constexpr uint32_t kSeq = 12345; - struct request req; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETADDR; @@ -743,8 +939,6 @@ TEST(NetlinkRouteTest, PasscredCreds) { struct rtgenmsg rgm; }; - constexpr uint32_t kSeq = 12345; - struct request req; req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = RTM_GETADDR; diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc index cd2212a1a..952eecfe8 100644 --- a/test/syscalls/linux/socket_netlink_util.cc +++ b/test/syscalls/linux/socket_netlink_util.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -71,9 +72,10 @@ PosixError NetlinkRequestResponse( iov.iov_base = buf.data(); iov.iov_len = buf.size(); - // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE - // message. + // If NLM_F_MULTI is set, response is a series of messages that ends with a + // NLMSG_DONE message. int type = -1; + int flags = 0; do { int len; RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0)); @@ -89,6 +91,7 @@ PosixError NetlinkRequestResponse( for (struct nlmsghdr* hdr = reinterpret_cast(buf.data()); NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { fn(hdr); + flags = hdr->nlmsg_flags; type = hdr->nlmsg_type; // Done should include an integer payload for dump_done_errno. // See net/netlink/af_netlink.c:netlink_dump @@ -98,11 +101,11 @@ PosixError NetlinkRequestResponse( EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int))); } } - } while (type != NLMSG_DONE && type != NLMSG_ERROR); + } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR); if (expect_nlmsgerr) { EXPECT_EQ(type, NLMSG_ERROR); - } else { + } else if (flags & NLM_F_MULTI) { EXPECT_EQ(type, NLMSG_DONE); } return NoError(); @@ -146,5 +149,39 @@ PosixError NetlinkRequestResponseSingle( return NoError(); } +PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq, + void* request, size_t len) { + // Dummy negative number for no error message received. + // We won't get a negative error number so there will be no confusion. + int err = -42; + RETURN_IF_ERRNO(NetlinkRequestResponse( + fd, request, len, + [&](const struct nlmsghdr* hdr) { + EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type); + EXPECT_EQ(hdr->nlmsg_seq, seq); + EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr)); + + const struct nlmsgerr* msg = + reinterpret_cast(NLMSG_DATA(hdr)); + err = -msg->error; + }, + true)); + return PosixError(err); +} + +const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr, + const struct ifinfomsg* msg, int16_t attr) { + const int ifi_space = NLMSG_SPACE(sizeof(*msg)); + int attrlen = hdr->nlmsg_len - ifi_space; + const struct rtattr* rta = reinterpret_cast( + reinterpret_cast(hdr) + NLMSG_ALIGN(ifi_space)); + for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) { + if (rta->rta_type == attr) { + return rta; + } + } + return nullptr; +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h index 3678c0599..e13ead406 100644 --- a/test/syscalls/linux/socket_netlink_util.h +++ b/test/syscalls/linux/socket_netlink_util.h @@ -19,6 +19,7 @@ // socket.h has to be included before if_arp.h. #include #include +#include #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" @@ -47,6 +48,14 @@ PosixError NetlinkRequestResponseSingle( const FileDescriptor& fd, void* request, size_t len, const std::function& fn); +// Send the passed request then expect and return an ack or error. +PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq, + void* request, size_t len); + +// Find rtnetlink attribute in message. +const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr, + const struct ifinfomsg* msg, int16_t attr); + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From eea0eeee933ba8406ae688fce4348271f9513514 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 5 Feb 2020 11:25:10 -0800 Subject: Disable get/set xattrs until list/remove exist too. PiperOrigin-RevId: 293411655 --- pkg/sentry/syscalls/linux/linux64_amd64.go | 27 ++++--- pkg/sentry/syscalls/linux/linux64_arm64.go | 37 +++++---- test/syscalls/linux/xattr.cc | 124 +++++++++++++++++++++++++++++ 3 files changed, 159 insertions(+), 29 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 7435b50bf..588f8b087 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -228,18 +228,21 @@ var AMD64 = &kernel.SyscallTable{ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), - 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), - 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), - 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), - 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + // TODO(b/148303075): Enable set/getxattr (in their various + // forms) once we also have list and removexattr. The JVM + // assumes that if get/set exist, then list and remove do too. + 188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), 200: syscalls.Supported("tkill", Tkill), 201: syscalls.Supported("time", Time), 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index 03a39fe65..06e5ee401 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -36,23 +36,26 @@ var ARM64 = &kernel.SyscallTable{ }, AuditNumber: linux.AUDIT_ARCH_AARCH64, Table: map[uintptr]kernel.Syscall{ - 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), - 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), - 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), - 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 12: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 14: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 15: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 16: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + // TODO(b/148303075): Enable set/getxattr (in their various + // forms) once we also have list and removexattr. The JVM + // assumes that if get/set exist, then list and remove do too. + 5: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 6: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 7: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 8: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 13: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 14: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 15: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 16: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), 17: syscalls.Supported("getcwd", Getcwd), 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 19: syscalls.Supported("eventfd2", Eventfd2), diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc index ab21d68c6..85eb31847 100644 --- a/test/syscalls/linux/xattr.cc +++ b/test/syscalls/linux/xattr.cc @@ -39,6 +39,10 @@ namespace { class XattrTest : public FileTest {}; TEST_F(XattrTest, XattrNullName) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0), @@ -48,6 +52,10 @@ TEST_F(XattrTest, XattrNullName) { } TEST_F(XattrTest, XattrEmptyName) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0), @@ -56,6 +64,10 @@ TEST_F(XattrTest, XattrEmptyName) { } TEST_F(XattrTest, XattrLargeName) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); std::string name = "user."; name += std::string(XATTR_NAME_MAX - name.length(), 'a'); @@ -77,6 +89,10 @@ TEST_F(XattrTest, XattrLargeName) { } TEST_F(XattrTest, XattrInvalidPrefix) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); std::string name(XATTR_NAME_MAX, 'a'); EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), @@ -88,6 +104,10 @@ TEST_F(XattrTest, XattrInvalidPrefix) { // Do not allow save/restore cycles after making the test file read-only, as // the restore will fail to open it with r/w permissions. TEST_F(XattrTest, XattrReadOnly_NoRandomSave) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + // Drop capabilities that allow us to override file and directory permissions. ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); @@ -113,6 +133,10 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) { // Do not allow save/restore cycles after making the test file write-only, as // the restore will fail to open it with r/w permissions. TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + // Drop capabilities that allow us to override file and directory permissions. ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); @@ -143,6 +167,10 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) { } TEST_F(XattrTest, XattrOnDirectory) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const char name[] = "user.test"; EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0), @@ -152,6 +180,10 @@ TEST_F(XattrTest, XattrOnDirectory) { } TEST_F(XattrTest, XattrOnSymlink) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath link = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateSymlinkTo(dir.path(), test_file_name_)); @@ -163,6 +195,10 @@ TEST_F(XattrTest, XattrOnSymlink) { } TEST_F(XattrTest, XattrOnInvalidFileTypes) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char name[] = "user.test"; char char_device[] = "/dev/zero"; @@ -181,6 +217,10 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) { } TEST_F(XattrTest, SetxattrSizeSmallerThanValue) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -196,6 +236,10 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) { } TEST_F(XattrTest, SetxattrZeroSize) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -208,6 +252,10 @@ TEST_F(XattrTest, SetxattrZeroSize) { } TEST_F(XattrTest, SetxattrSizeTooLarge) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; @@ -223,6 +271,10 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) { } TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0), @@ -232,6 +284,10 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) { } TEST_F(XattrTest, SetxattrNullValueAndZeroSize) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); @@ -240,6 +296,10 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) { } TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val(XATTR_SIZE_MAX + 1); @@ -256,6 +316,10 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) { } TEST_F(XattrTest, SetxattrReplaceWithSmaller) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -271,6 +335,10 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) { } TEST_F(XattrTest, SetxattrReplaceWithLarger) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -285,6 +353,10 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) { } TEST_F(XattrTest, SetxattrCreateFlag) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE), @@ -296,6 +368,10 @@ TEST_F(XattrTest, SetxattrCreateFlag) { } TEST_F(XattrTest, SetxattrReplaceFlag) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE), @@ -308,6 +384,10 @@ TEST_F(XattrTest, SetxattrReplaceFlag) { } TEST_F(XattrTest, SetxattrInvalidFlags) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); int invalid_flags = 0xff; EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags), @@ -315,6 +395,10 @@ TEST_F(XattrTest, SetxattrInvalidFlags) { } TEST_F(XattrTest, Getxattr) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; int val = 1234; @@ -327,6 +411,10 @@ TEST_F(XattrTest, Getxattr) { } TEST_F(XattrTest, GetxattrSizeSmallerThanValue) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -339,6 +427,10 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) { } TEST_F(XattrTest, GetxattrSizeLargerThanValue) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -354,6 +446,10 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) { } TEST_F(XattrTest, GetxattrZeroSize) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -367,6 +463,10 @@ TEST_F(XattrTest, GetxattrZeroSize) { } TEST_F(XattrTest, GetxattrSizeTooLarge) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -383,6 +483,10 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) { } TEST_F(XattrTest, GetxattrNullValue) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -394,6 +498,10 @@ TEST_F(XattrTest, GetxattrNullValue) { } TEST_F(XattrTest, GetxattrNullValueAndZeroSize) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -410,12 +518,20 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) { } TEST_F(XattrTest, GetxattrNonexistentName) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); } TEST_F(XattrTest, LGetSetxattrOnSymlink) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath link = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateSymlinkTo(dir.path(), test_file_name_)); @@ -427,6 +543,10 @@ TEST_F(XattrTest, LGetSetxattrOnSymlink) { } TEST_F(XattrTest, LGetSetxattrOnNonsymlink) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); const char name[] = "user.test"; int val = 1234; @@ -441,6 +561,10 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) { } TEST_F(XattrTest, FGetSetxattr) { + // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are + // supported, and get/set have been added pack to the syscall table. + SKIP_IF(IsRunningOnGvisor()); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0)); const char name[] = "user.test"; -- cgit v1.2.3 From f3d95607036b8a502c65aa7b3e8145227274dbbc Mon Sep 17 00:00:00 2001 From: Eyal Soha Date: Wed, 5 Feb 2020 17:56:00 -0800 Subject: recv() on a closed TCP socket returns ENOTCONN From RFC 793 s3.9 p58 Event Processing: If RECEIVE Call arrives in CLOSED state and the user has access to such a connection, the return should be "error: connection does not exist" Fixes #1598 PiperOrigin-RevId: 293494287 --- pkg/sentry/socket/netstack/netstack.go | 7 ++++++- pkg/tcpip/tcpip.go | 4 ++++ pkg/tcpip/transport/tcp/endpoint.go | 4 ++-- pkg/tcpip/transport/tcp/tcp_test.go | 9 ++++----- test/syscalls/linux/tcp_socket.cc | 9 +++++++++ 5 files changed, 25 insertions(+), 8 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 049d04bf2..ed2fbcceb 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -2229,11 +2229,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq var copied int // Copy as many views as possible into the user-provided buffer. - for dst.NumBytes() != 0 { + for { + // Always do at least one fetchReadView, even if the number of bytes to + // read is 0. err = s.fetchReadView() if err != nil { break } + if dst.NumBytes() == 0 { + break + } var n int var e error diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 0fa141d58..d29d9a704 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1124,6 +1124,10 @@ type ReadErrors struct { // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter + + // NotConnected is the number of times we tried to read but found that the + // endpoint was not connected. + NotConnected StatCounter } // WriteErrors collects packet write errors from an endpoint write call. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index b5a8e15ee..e4a6b1b8b 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1003,8 +1003,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, if s == StateError { return buffer.View{}, tcpip.ControlMessages{}, he } - e.stats.ReadErrors.InvalidEndpointState.Increment() - return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState + e.stats.ReadErrors.NotConnected.Increment() + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected } v, err := e.readLocked() diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 2c1505067..cc118c993 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -5405,12 +5405,11 @@ func TestEndpointBindListenAcceptState(t *testing.T) { t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) } - // Expect InvalidEndpointState errors on a read at this point. - if _, _, err := ep.Read(nil); err != tcpip.ErrInvalidEndpointState { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrInvalidEndpointState) + if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected { + t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected) } - if got := ep.Stats().(*tcp.Stats).ReadErrors.InvalidEndpointState.Value(); got != 1 { - t.Fatalf("got EP stats Stats.ReadErrors.InvalidEndpointState got %v want %v", got, 1) + if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 { + t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1) } if err := ep.Listen(10); err != nil { diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 525ccbd88..8a8b68e75 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -1339,6 +1339,15 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) { EXPECT_EQ(get, kTCPDeferAccept); } +TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) { + auto s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + char buf[1]; + EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN)); + EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0), + SyscallFailsWithErrno(ENOTCONN)); +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest, ::testing::Values(AF_INET, AF_INET6)); -- cgit v1.2.3 From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 5 Feb 2020 22:45:44 -0800 Subject: Add notes to relevant tests. These were out-of-band notes that can help provide additional context and simplify automated imports. PiperOrigin-RevId: 293525915 --- pkg/metric/metric.go | 1 - pkg/sentry/arch/arch_x86.go | 4 ++ pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 1 + pkg/sentry/fs/proc/README.md | 4 ++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 3 ++ pkg/sentry/kernel/kernel_opts.go | 20 +++++++ pkg/sentry/socket/hostinet/BUILD | 1 + pkg/sentry/socket/hostinet/socket.go | 5 +- pkg/sentry/socket/hostinet/sockopt_impl.go | 27 ++++++++++ pkg/tcpip/transport/tcp/endpoint.go | 3 ++ runsc/boot/filter/BUILD | 1 + runsc/boot/filter/config.go | 13 ----- runsc/boot/filter/config_profile.go | 34 ++++++++++++ runsc/container/console_test.go | 5 +- runsc/dockerutil/dockerutil.go | 11 ++-- runsc/testutil/BUILD | 5 +- runsc/testutil/testutil.go | 54 ------------------- runsc/testutil/testutil_runfiles.go | 75 +++++++++++++++++++++++++++ test/image/image_test.go | 8 +-- test/syscalls/build_defs.bzl | 35 +++++++++++-- test/syscalls/linux/chroot.cc | 2 +- test/syscalls/linux/concurrency.cc | 3 +- test/syscalls/linux/exec_proc_exe_workload.cc | 6 +++ test/syscalls/linux/fork.cc | 5 +- test/syscalls/linux/mmap.cc | 8 +-- test/syscalls/linux/open_create.cc | 1 + test/syscalls/linux/preadv.cc | 1 + test/syscalls/linux/proc.cc | 46 +++++++++++++--- test/syscalls/linux/readv.cc | 4 +- test/syscalls/linux/rseq.cc | 2 +- test/syscalls/linux/select.cc | 2 +- test/syscalls/linux/shm.cc | 2 +- test/syscalls/linux/sigprocmask.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.cc | 4 +- test/syscalls/linux/symlink.cc | 2 +- test/syscalls/linux/tcp_socket.cc | 3 +- test/syscalls/linux/time.cc | 1 + test/syscalls/linux/tkill.cc | 2 +- test/util/temp_path.cc | 1 + tools/build/tags.bzl | 4 ++ tools/defs.bzl | 17 +++++- 43 files changed, 318 insertions(+), 113 deletions(-) create mode 100644 pkg/sentry/kernel/kernel_opts.go create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go create mode 100644 runsc/boot/filter/config_profile.go create mode 100644 runsc/testutil/testutil_runfiles.go (limited to 'pkg') diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 93d4f2b8c..006fcd9ab 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -46,7 +46,6 @@ var ( // // TODO(b/67298402): Support non-cumulative metrics. // TODO(b/67298427): Support metric fields. -// type Uint64Metric struct { // value is the actual value of the metric. It must be accessed // atomically. diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index a18093155..3db8bd34b 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte { size, align := cpuid.HostFeatureSet().ExtendedStateSize() capacity := size // Always use at least 4096 bytes. + // + // For the KVM platform, this state is a fixed 4096 bytes, so make sure + // that the underlying array is at _least_ that size otherwise we will + // corrupt random memory. This is not a pleasant thing to debug. if capacity < 4096 { capacity = 4096 } diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 81b92bb43..6fb756f0e 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -55,7 +55,7 @@ type SignalContext64 struct { Trapno uint64 Oldmask linux.SignalSet Cr2 uint64 - // Pointer to a struct _fpstate. + // Pointer to a struct _fpstate. See b/33003106#comment8. Fpstate uint64 Reserved [8]uint64 } diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 02538bb4f..a76d87e3a 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) { // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with // a frozen dirent tree does not make Readdir calls to the underlying files. +// This is a regression test for b/114808269. func TestReaddirOverlayFrozen(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 5d4ec6c7b..6667a0916 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -11,6 +11,8 @@ inconsistency, please file a bug. The following files are implemented: + + | File /proc/ | Content | | :------------------------ | :---------------------------------------------------- | | [cpuinfo](#cpuinfo) | Info about the CPU | @@ -22,6 +24,8 @@ The following files are implemented: | [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | | [version](#version) | Kernel version | + + ### cpuinfo ```bash diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a27628c0a..2231d6973 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -91,6 +91,7 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_opts.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index dcd6e91c4..3ee760ba2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -235,6 +235,9 @@ type Kernel struct { // events. This is initialized lazily on the first unimplemented // syscall. unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` + + // SpecialOpts contains special kernel options. + SpecialOpts } // InitKernelArgs holds arguments to Init. diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go new file mode 100644 index 000000000..2e66ec587 --- /dev/null +++ b/pkg/sentry/kernel/kernel_opts.go @@ -0,0 +1,20 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// SpecialOpts contains non-standard options for the kernel. +// +// +stateify savable +type SpecialOpts struct{} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 5a07d5d0e..023bad156 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -10,6 +10,7 @@ go_library( "save_restore.go", "socket.go", "socket_unsafe.go", + "sockopt_impl.go", "stack.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 34f63986f..de76388ac 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt } // Whitelist options and constrain option length. - var optlen int + optlen := getSockOptLen(t, level, name) switch level { case linux.SOL_IP: switch name { @@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Whitelist options and constrain option length. - var optlen int + optlen := setSockOptLen(t, level, name) switch level { case linux.SOL_IP: switch name { @@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ optlen = sizeofInt32 } } + if optlen == 0 { // Pretend to accept socket options we don't understand. This seems // dangerous, but it's what netstack does... diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go new file mode 100644 index 000000000..8a783712e --- /dev/null +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +func getSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} + +func setSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e4a6b1b8b..f2be0e651 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { e.isRegistered = true e.setEndpointState(StateListen) + // The channel may be non-nil when we're restoring the endpoint, and it + // may be pre-populated with some previously accepted (but not Accepted) + // endpoints. if e.acceptedChan == nil { e.acceptedChan = make(chan *endpoint, backlog) } diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index ce30f6c53..ed18f0047 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -8,6 +8,7 @@ go_library( "config.go", "config_amd64.go", "config_arm64.go", + "config_profile.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index f8d351c7b..c69f4c602 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules { }, } } - -// profileFilters returns extra syscalls made by runtime/pprof package. -func profileFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_OPENAT: []seccomp.Rule{ - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), - }, - }, - } -} diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go new file mode 100644 index 000000000..194952a7b --- /dev/null +++ b/runsc/boot/filter/config_profile.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// profileFilters returns extra syscalls made by runtime/pprof package. +func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 060b63bf3..c2518d52b 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) { defer ptyMaster.Close() defer ptySlave.Close() - // Exec bash and attach a terminal. + // Exec bash and attach a terminal. Note that occasionally /bin/sh + // may be a different shell or have a different configuration (such + // as disabling interactive mode and job control). Since we want to + // explicitly test interactive mode, use /bin/bash. See b/116981926. execArgs := &control.ExecArgs{ Filename: "/bin/bash", // Don't let bash execute from profile or rc files, otherwise diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go index 9b6346ca2..1ff5e8cc3 100644 --- a/runsc/dockerutil/dockerutil.go +++ b/runsc/dockerutil/dockerutil.go @@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) { return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err) } for _, name := range names { - src := getLocalPath(name) - dst := path.Join(dir, name) + src, err := testutil.FindFile(name) + if err != nil { + return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err) + } + dst := path.Join(dir, path.Base(name)) if err := testutil.Copy(src, dst); err != nil { return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) } @@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) { return dir, nil } -func getLocalPath(file string) string { - return path.Join(".", file) -} - // do executes docker command. func do(args ...string) (string, error) { log.Printf("Running: docker %s\n", args) diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index f845120b0..945405303 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -5,7 +5,10 @@ package(licenses = ["notice"]) go_library( name = "testutil", testonly = 1, - srcs = ["testutil.go"], + srcs = [ + "testutil.go", + "testutil_runfiles.go", + ], visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go index edf2e809a..80c2c9680 100644 --- a/runsc/testutil/testutil.go +++ b/runsc/testutil/testutil.go @@ -79,60 +79,6 @@ func ConfigureExePath() error { return nil } -// FindFile searchs for a file inside the test run environment. It returns the -// full path to the file. It fails if none or more than one file is found. -func FindFile(path string) (string, error) { - wd, err := os.Getwd() - if err != nil { - return "", err - } - - // The test root is demarcated by a path element called "__main__". Search for - // it backwards from the working directory. - root := wd - for { - dir, name := filepath.Split(root) - if name == "__main__" { - break - } - if len(dir) == 0 { - return "", fmt.Errorf("directory __main__ not found in %q", wd) - } - // Remove ending slash to loop around. - root = dir[:len(dir)-1] - } - - // Annoyingly, bazel adds the build type to the directory path for go - // binaries, but not for c++ binaries. We use two different patterns to - // to find our file. - patterns := []string{ - // Try the obvious path first. - filepath.Join(root, path), - // If it was a go binary, use a wildcard to match the build - // type. The pattern is: /test-path/__main__/directories/*/file. - filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), - } - - for _, p := range patterns { - matches, err := filepath.Glob(p) - if err != nil { - // "The only possible returned error is ErrBadPattern, - // when pattern is malformed." -godoc - return "", fmt.Errorf("error globbing %q: %v", p, err) - } - switch len(matches) { - case 0: - // Try the next pattern. - case 1: - // We found it. - return matches[0], nil - default: - return "", fmt.Errorf("more than one match found for %q: %s", path, matches) - } - } - return "", fmt.Errorf("file %q not found", path) -} - // TestConfig returns the default configuration to use in tests. Note that // 'RootDir' must be set by caller if required. func TestConfig() *boot.Config { diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go new file mode 100644 index 000000000..ece9ea9a1 --- /dev/null +++ b/runsc/testutil/testutil_runfiles.go @@ -0,0 +1,75 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "fmt" + "os" + "path/filepath" +) + +// FindFile searchs for a file inside the test run environment. It returns the +// full path to the file. It fails if none or more than one file is found. +func FindFile(path string) (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", err + } + + // The test root is demarcated by a path element called "__main__". Search for + // it backwards from the working directory. + root := wd + for { + dir, name := filepath.Split(root) + if name == "__main__" { + break + } + if len(dir) == 0 { + return "", fmt.Errorf("directory __main__ not found in %q", wd) + } + // Remove ending slash to loop around. + root = dir[:len(dir)-1] + } + + // Annoyingly, bazel adds the build type to the directory path for go + // binaries, but not for c++ binaries. We use two different patterns to + // to find our file. + patterns := []string{ + // Try the obvious path first. + filepath.Join(root, path), + // If it was a go binary, use a wildcard to match the build + // type. The pattern is: /test-path/__main__/directories/*/file. + filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), + } + + for _, p := range patterns { + matches, err := filepath.Glob(p) + if err != nil { + // "The only possible returned error is ErrBadPattern, + // when pattern is malformed." -godoc + return "", fmt.Errorf("error globbing %q: %v", p, err) + } + switch len(matches) { + case 0: + // Try the next pattern. + case 1: + // We found it. + return matches[0], nil + default: + return "", fmt.Errorf("more than one match found for %q: %s", path, matches) + } + } + return "", fmt.Errorf("file %q not found", path) +} diff --git a/test/image/image_test.go b/test/image/image_test.go index d0dcb1861..0a1e19d6f 100644 --- a/test/image/image_test.go +++ b/test/image/image_test.go @@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) { } d := dockerutil.MakeDocker("http-test") - dir, err := dockerutil.PrepareFiles("latin10k.txt") + dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } @@ -139,7 +139,7 @@ func TestNginx(t *testing.T) { } d := dockerutil.MakeDocker("net-test") - dir, err := dockerutil.PrepareFiles("latin10k.txt") + dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } @@ -183,7 +183,7 @@ func TestMysql(t *testing.T) { } client := dockerutil.MakeDocker("mysql-client-test") - dir, err := dockerutil.PrepareFiles("mysql.sql") + dir, err := dockerutil.PrepareFiles("test/image/mysql.sql") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } @@ -283,7 +283,7 @@ func TestRuby(t *testing.T) { } d := dockerutil.MakeDocker("ruby-test") - dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh") + dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh") if err != nil { t.Fatalf("PrepareFiles() failed: %v", err) } diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index 1df761dd0..cbab85ef7 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -2,8 +2,6 @@ load("//tools:defs.bzl", "loopback") -# syscall_test is a macro that will create targets to run the given test target -# on the host (native) and runsc. def syscall_test( test, shard_count = 5, @@ -13,6 +11,19 @@ def syscall_test( add_uds_tree = False, add_hostinet = False, tags = None): + """syscall_test is a macro that will create targets for all platforms. + + Args: + test: the test target. + shard_count: shards for defined tests. + size: the defined test size. + use_tmpfs: use tmpfs in the defined tests. + add_overlay: add an overlay test. + add_uds_tree: add a UDS test. + add_hostinet: add a hostinet test. + tags: starting test tags. + """ + _syscall_test( test = test, shard_count = shard_count, @@ -111,6 +122,19 @@ def _syscall_test( # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared. tags += [full_platform, "file_" + file_access] + # Hash this target into one of 15 buckets. This can be used to + # randomly split targets between different workflows. + hash15 = hash(native.package_name() + name) % 15 + tags.append("hash15:" + str(hash15)) + + # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until + # we figure out how to request ipv4 sockets on Guitar machines. + if network == "host": + tags.append("noguitar") + + # Disable off-host networking. + tags.append("requires-net:loopback") + # Add tag to prevent the tests from running in a Bazel sandbox. # TODO(b/120560048): Make the tests run without this tag. tags.append("no-sandbox") @@ -118,8 +142,11 @@ def _syscall_test( # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is # more stable. if platform == "kvm": - tags += ["manual"] - tags += ["requires-kvm"] + tags.append("manual") + tags.append("requires-kvm") + + # TODO(b/112165693): Remove when tests pass reliably. + tags.append("notap") args = [ # Arguments are passed directly to syscall_test_runner binary. diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index 0a2d44a2c..85ec013d5 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) { } // Test that link resolution in a chroot can escape the root by following an -// open proc fd. +// open proc fd. Regression test for b/32316719. TEST(ChrootTest, ProcFdLinkResolutionInChroot) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc index f41f99900..7cd6a75bd 100644 --- a/test/syscalls/linux/concurrency.cc +++ b/test/syscalls/linux/concurrency.cc @@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) { } // Test that multiple threads in this process continue to execute in parallel, -// even if an unrelated second process is spawned. +// even if an unrelated second process is spawned. Regression test for +// b/32119508. TEST(ConcurrencyTest, MultiProcessMultithreaded) { // In PID 1, start TIDs 1 and 2, and put both to sleep. // diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc index b790fe5be..2989379b7 100644 --- a/test/syscalls/linux/exec_proc_exe_workload.cc +++ b/test/syscalls/linux/exec_proc_exe_workload.cc @@ -21,6 +21,12 @@ #include "test/util/posix_error.h" int main(int argc, char** argv, char** envp) { + // This is annoying. Because remote build systems may put these binaries + // in a content-addressable-store, you may wind up with /proc/self/exe + // pointing to some random path (but with a sensible argv[0]). + // + // Therefore, this test simply checks that the /proc/self/exe + // is absolute and *doesn't* match argv[1]. std::string exe = gvisor::testing::ProcessExePath(getpid()).ValueOrDie(); if (exe[0] != '/') { diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc index 906f3358d..ff8bdfeb0 100644 --- a/test/syscalls/linux/fork.cc +++ b/test/syscalls/linux/fork.cc @@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) { EXPECT_EQ(0, alarmed); } -// Child cannot affect parent private memory. +// Child cannot affect parent private memory. Regression test for b/24137240. TEST_F(ForkTest, PrivateMemory) { std::atomic local(0); @@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) { } // Kernel-accessed buffers should remain coherent across COW. +// +// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates +// differently. Regression test for b/33811887. TEST_F(ForkTest, COWSegment) { constexpr int kBufSize = 1024; char* read_buf = private_; diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index 1c4d9f1c7..11fb1b457 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) { // // On most platforms this is trivial, but when the file is mapped via the sentry // page cache (which does not yet support writing to shared mappings), a bug -// caused reads to fail unnecessarily on such mappings. +// caused reads to fail unnecessarily on such mappings. See b/28913513. TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { uintptr_t addr; size_t len = strlen(kFileContents); @@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { // Tests that EFAULT is returned when invoking a syscall that requires the OS to // read past end of file (resulting in a fault in sentry context in the gVisor -// case). +// case). See b/28913513. TEST_F(MMapFileTest, InternalSigBus) { uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, @@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) { } // Tests that reading from a file to a memory mapping of the same file does not -// deadlock. +// deadlock. See b/34813270. TEST_F(MMapFileTest, SelfRead) { uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, @@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) { } // Tests that writing to a file from a memory mapping of the same file does not -// deadlock. +// deadlock. Regression test for b/34813270. TEST_F(MMapFileTest, SelfWrite) { uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc index 431733dbe..902d0a0dc 100644 --- a/test/syscalls/linux/open_create.cc +++ b/test/syscalls/linux/open_create.cc @@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) { } // A file originally created RW, but opened RO can later be opened RW. +// Regression test for b/65385065. TEST(CreateTest, OpenCreateROThenRW) { TempPath file(NewTempAbsPath()); diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc index f7ea44054..5b0743fe9 100644 --- a/test/syscalls/linux/preadv.cc +++ b/test/syscalls/linux/preadv.cc @@ -37,6 +37,7 @@ namespace testing { namespace { +// Stress copy-on-write. Attempts to reproduce b/38430174. TEST(PreadvTest, MMConcurrencyStress) { // Fill a one-page file with zeroes (the contents don't really matter). const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 169b723eb..a23fdb58d 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) { // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. - // 4.17 & gVisor: Syscall succeeds and returns 1 + // + // ~4.3: Syscall fails with EACCES. + // 4.17 & gVisor: Syscall succeeds and returns 1. + // // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. - // 4.17 & gVisor: Syscall succeeds and returns 1. + // + // ~4.3: Syscall fails with EACCES. + // 4.17 & gVisor: Syscall succeeds and returns 1. + // // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); } @@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) { TEST(ProcPidFile, SubprocessZombie) { char buf[1]; - // 4.17: Succeeds and returns 1 - // gVisor: Succeeds and returns 0 + // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent + // behavior on different kernels. + // + // ~4.3: Succeds and returns 0. + // 4.17: Succeeds and returns 1. + // gVisor: Succeeds and returns 0. EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds()); EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)), @@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) { // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. + // + // ~4.3: Fails and returns EACCES. // gVisor & 4.17: Succeeds and returns 1. + // // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); } @@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) { TEST(ProcPidFile, SubprocessExited) { char buf[1]; - // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels. + // + // ~4.3: Fails and returns ESRCH. // gVisor: Fails with ESRCH. // 4.17: Succeeds and returns 1. + // // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)), // SyscallFailsWithErrno(ESRCH)); @@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) { EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task", TaskFiles(initial, {child1.Tid()}))); - // Stat child1's task file. + // Stat child1's task file. Regression test for b/32097707. struct stat statbuf; const std::string child1_task_file = absl::StrCat("/proc/self/task/", child1.Tid()); @@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) { EXPECT_NO_ERRNO(EventuallyDirContainsExactly( "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()}))); - // Stat child1's task file again. This time it should fail. + // Stat child1's task file again. This time it should fail. See b/32097707. EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf), SyscallFailsWithErrno(ENOENT)); @@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) { } // Check that link for proc fd entries point the target node, not the -// symlink itself. +// symlink itself. Regression test for b/31155070. TEST(ProcTaskFd, FstatatFollowsSymlink) { const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const FileDescriptor fd = @@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) { EXPECT_EQ(link, "self/mounts"); } +TEST(ProcSelfMountinfo, RequiredFieldsArePresent) { + auto mountinfo = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo")); + EXPECT_THAT( + mountinfo, + AllOf( + // Root mount. + ContainsRegex( + R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"), + // Proc mount - always rw. + ContainsRegex( + R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)"))); +} + // Check that /proc/self/mounts looks something like a real mounts file. TEST(ProcSelfMounts, RequiredFieldsArePresent) { auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts")); diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc index 4069cbc7e..baaf9f757 100644 --- a/test/syscalls/linux/readv.cc +++ b/test/syscalls/linux/readv.cc @@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) { // This test depends on the maximum extent of a single readv() syscall, so // we can't tolerate interruption from saving. TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) { - // Ensure that we won't be interrupted by ITIMER_PROF. + // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly + // important in environments where automated profiling tools may start + // ITIMER_PROF automatically. struct itimerval itv = {}; auto const cleanup_itimer = ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv)); diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc index 106c045e3..4bfb1ff56 100644 --- a/test/syscalls/linux/rseq.cc +++ b/test/syscalls/linux/rseq.cc @@ -36,7 +36,7 @@ namespace { // We must be very careful about how these tests are written. Each thread may // only have one struct rseq registration, which may be done automatically at // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does -// not do so). +// not do so, but other libraries do). // // Testing of rseq is thus done primarily in a child process with no // registration. This means exec'ing a nostdlib binary, as rseq registration can diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc index 424e2a67f..be2364fb8 100644 --- a/test/syscalls/linux/select.cc +++ b/test/syscalls/linux/select.cc @@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) { // This test illustrates Linux's behavior of 'select' calls passing after // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on -// this behavior. +// this behavior. See b/122318458. TEST_F(SelectTest, SetrlimitCallNOFILE) { fd_set read_set; FD_ZERO(&read_set); diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc index 7ba752599..c7fdbb924 100644 --- a/test/syscalls/linux/shm.cc +++ b/test/syscalls/linux/shm.cc @@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) { } // Check that sentry does not panic when asked for a zero-length private shm -// segment. +// segment. Regression test for b/110694797. TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) { EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _)); } diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc index 654c6a47f..a603fc1d1 100644 --- a/test/syscalls/linux/sigprocmask.cc +++ b/test/syscalls/linux/sigprocmask.cc @@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) { } // Check that sigprocmask correctly handles aliasing of the set and oldset -// pointers. +// pointers. Regression test for b/30502311. TEST_F(SigProcMaskTest, AliasedSets) { sigset_t mask; diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index 276a94eb8..884319e1d 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -109,7 +109,7 @@ PosixErrorOr> CreateFragmentedRegion(const int size, } // A contiguous iov that is heavily fragmented in FileMem can still be sent -// successfully. +// successfully. See b/115833655. TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { } // A contiguous iov that is heavily fragmented in FileMem can still be received -// into successfully. +// into successfully. Regression test for b/115833655. TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index b249ff91f..03ee1250d 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) { } // Test that name collisions are checked on the new link path, not the source -// path. +// path. Regression test for b/31782115. TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) { const std::string srcname = NewTempAbsPath(); const std::string newname = NewTempAbsPath(); diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 8a8b68e75..c4591a3b9 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) { } // Test that a non-blocking write with a buffer that is larger than the send -// buffer size will not actually write the whole thing at once. +// buffer size will not actually write the whole thing at once. Regression test +// for b/64438887. TEST_P(TcpSocketTest, NonblockingLargeWrite) { // Set the FD to O_NONBLOCK. int opts; diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc index c7eead17e..1ccb95733 100644 --- a/test/syscalls/linux/time.cc +++ b/test/syscalls/linux/time.cc @@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) { ::testing::KilledBySignal(SIGSEGV), ""); } +// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2. int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) { constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000; return reinterpret_cast( diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc index bae377c69..8d8ebbb24 100644 --- a/test/syscalls/linux/tkill.cc +++ b/test/syscalls/linux/tkill.cc @@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) { TEST_CHECK(info->si_code == SI_TKILL); } -// Test with a real signal. +// Test with a real signal. Regression test for b/24790092. TEST(TkillTest, ValidTIDAndRealSignal) { struct sigaction sa; sa.sa_sigaction = SigHandler; diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index 35aacb172..9c10b6674 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -77,6 +77,7 @@ std::string NewTempAbsPath() { std::string NewTempRelPath() { return NextTempBasename(); } std::string GetAbsoluteTestTmpdir() { + // Note that TEST_TMPDIR is guaranteed to be set. char* env_tmpdir = getenv("TEST_TMPDIR"); std::string tmp_dir = env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp"; diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl index e99c87f81..a6db44e47 100644 --- a/tools/build/tags.bzl +++ b/tools/build/tags.bzl @@ -33,4 +33,8 @@ go_suffixes = [ "_wasm_unsafe", "_linux", "_linux_unsafe", + "_opts", + "_opts_unsafe", + "_impl", + "_impl_unsafe", ] diff --git a/tools/defs.bzl b/tools/defs.bzl index 5d5fa134a..c03b557ae 100644 --- a/tools/defs.bzl +++ b/tools/defs.bzl @@ -73,6 +73,16 @@ def calculate_sets(srcs): result[target].append(file) return result +def go_imports(name, src, out): + """Simplify a single Go source file by eliminating unused imports.""" + native.genrule( + name = name, + srcs = [src], + outs = [out], + tools = ["@org_golang_x_tools//cmd/goimports:goimports"], + cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"), + ) + def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs): """Wraps the standard go_library and does stateification and marshalling. @@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F state_sets = calculate_sets(srcs) for (suffix, srcs) in state_sets.items(): go_stateify( - name = name + suffix + "_state_autogen", + name = name + suffix + "_state_autogen_with_imports", srcs = srcs, imports = imports, package = name, + out = name + suffix + "_state_autogen_with_imports.go", + ) + go_imports( + name = name + suffix + "_state_autogen", + src = name + suffix + "_state_autogen_with_imports.go", out = name + suffix + "_state_autogen.go", ) all_srcs = all_srcs + [ -- cgit v1.2.3 From 5ff780891e229dbde00d9a37c2f8b6681e592fdb Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 6 Feb 2020 10:06:56 -0800 Subject: Move p9.pool to a separate package PiperOrigin-RevId: 293617493 --- pkg/p9/BUILD | 3 +-- pkg/p9/client.go | 9 ++++--- pkg/p9/pool.go | 68 --------------------------------------------------- pkg/p9/pool_test.go | 64 ------------------------------------------------ pkg/pool/BUILD | 25 +++++++++++++++++++ pkg/pool/pool.go | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ pkg/pool/pool_test.go | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 161 insertions(+), 138 deletions(-) delete mode 100644 pkg/p9/pool.go delete mode 100644 pkg/p9/pool_test.go create mode 100644 pkg/pool/BUILD create mode 100644 pkg/pool/pool.go create mode 100644 pkg/pool/pool_test.go (limited to 'pkg') diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 4ccc1de86..8904afad9 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -16,7 +16,6 @@ go_library( "messages.go", "p9.go", "path_tree.go", - "pool.go", "server.go", "transport.go", "transport_flipcall.go", @@ -27,6 +26,7 @@ go_library( "//pkg/fdchannel", "//pkg/flipcall", "//pkg/log", + "//pkg/pool", "//pkg/sync", "//pkg/unet", "@org_golang_x_sys//unix:go_default_library", @@ -41,7 +41,6 @@ go_test( "client_test.go", "messages_test.go", "p9_test.go", - "pool_test.go", "transport_test.go", "version_test.go", ], diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 4045e41fa..a6f493b82 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -22,6 +22,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/pool" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) @@ -74,10 +75,10 @@ type Client struct { socket *unet.Socket // tagPool is the collection of available tags. - tagPool pool + tagPool pool.Pool // fidPool is the collection of available fids. - fidPool pool + fidPool pool.Pool // messageSize is the maximum total size of a message. messageSize uint32 @@ -155,8 +156,8 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client } c := &Client{ socket: socket, - tagPool: pool{start: 1, limit: uint64(NoTag)}, - fidPool: pool{start: 1, limit: uint64(NoFID)}, + tagPool: pool.Pool{Start: 1, Limit: uint64(NoTag)}, + fidPool: pool.Pool{Start: 1, Limit: uint64(NoFID)}, pending: make(map[Tag]*response), recvr: make(chan bool, 1), messageSize: messageSize, diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go deleted file mode 100644 index 2b14a5ce3..000000000 --- a/pkg/p9/pool.go +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package p9 - -import ( - "gvisor.dev/gvisor/pkg/sync" -) - -// pool is a simple allocator. -// -// It is used for both tags and FIDs. -type pool struct { - mu sync.Mutex - - // cache is the set of returned values. - cache []uint64 - - // start is the starting value (if needed). - start uint64 - - // max is the current maximum issued. - max uint64 - - // limit is the upper limit. - limit uint64 -} - -// Get gets a value from the pool. -func (p *pool) Get() (uint64, bool) { - p.mu.Lock() - defer p.mu.Unlock() - - // Anything cached? - if len(p.cache) > 0 { - v := p.cache[len(p.cache)-1] - p.cache = p.cache[:len(p.cache)-1] - return v, true - } - - // Over the limit? - if p.start == p.limit { - return 0, false - } - - // Generate a new value. - v := p.start - p.start++ - return v, true -} - -// Put returns a value to the pool. -func (p *pool) Put(v uint64) { - p.mu.Lock() - p.cache = append(p.cache, v) - p.mu.Unlock() -} diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go deleted file mode 100644 index e4746b8da..000000000 --- a/pkg/p9/pool_test.go +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package p9 - -import ( - "testing" -) - -func TestPoolUnique(t *testing.T) { - p := pool{start: 1, limit: 3} - got := make(map[uint64]bool) - - for { - n, ok := p.Get() - if !ok { - break - } - - // Check unique. - if _, ok := got[n]; ok { - t.Errorf("pool spit out %v multiple times", n) - } - - // Record. - got[n] = true - } -} - -func TestExausted(t *testing.T) { - p := pool{start: 1, limit: 500} - for i := 0; i < 499; i++ { - _, ok := p.Get() - if !ok { - t.Fatalf("pool exhausted before 499 items") - } - } - - _, ok := p.Get() - if ok { - t.Errorf("pool not exhausted when it should be") - } -} - -func TestPoolRecycle(t *testing.T) { - p := pool{start: 1, limit: 500} - n1, _ := p.Get() - p.Put(n1) - n2, _ := p.Get() - if n1 != n2 { - t.Errorf("pool not recycling items") - } -} diff --git a/pkg/pool/BUILD b/pkg/pool/BUILD new file mode 100644 index 000000000..7b1c6b75b --- /dev/null +++ b/pkg/pool/BUILD @@ -0,0 +1,25 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) + +go_library( + name = "pool", + srcs = [ + "pool.go", + ], + deps = [ + "//pkg/sync", + ], +) + +go_test( + name = "pool_test", + size = "small", + srcs = [ + "pool_test.go", + ], + library = ":pool", +) diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go new file mode 100644 index 000000000..a1b2e0cfe --- /dev/null +++ b/pkg/pool/pool.go @@ -0,0 +1,66 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pool + +import ( + "gvisor.dev/gvisor/pkg/sync" +) + +// Pool is a simple allocator. +type Pool struct { + mu sync.Mutex + + // cache is the set of returned values. + cache []uint64 + + // Start is the starting value (if needed). + Start uint64 + + // max is the current maximum issued. + max uint64 + + // Limit is the upper limit. + Limit uint64 +} + +// Get gets a value from the pool. +func (p *Pool) Get() (uint64, bool) { + p.mu.Lock() + defer p.mu.Unlock() + + // Anything cached? + if len(p.cache) > 0 { + v := p.cache[len(p.cache)-1] + p.cache = p.cache[:len(p.cache)-1] + return v, true + } + + // Over the limit? + if p.Start == p.Limit { + return 0, false + } + + // Generate a new value. + v := p.Start + p.Start++ + return v, true +} + +// Put returns a value to the pool. +func (p *Pool) Put(v uint64) { + p.mu.Lock() + p.cache = append(p.cache, v) + p.mu.Unlock() +} diff --git a/pkg/pool/pool_test.go b/pkg/pool/pool_test.go new file mode 100644 index 000000000..d928439c1 --- /dev/null +++ b/pkg/pool/pool_test.go @@ -0,0 +1,64 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pool + +import ( + "testing" +) + +func TestPoolUnique(t *testing.T) { + p := Pool{Start: 1, Limit: 3} + got := make(map[uint64]bool) + + for { + n, ok := p.Get() + if !ok { + break + } + + // Check unique. + if _, ok := got[n]; ok { + t.Errorf("pool spit out %v multiple times", n) + } + + // Record. + got[n] = true + } +} + +func TestExausted(t *testing.T) { + p := Pool{Start: 1, Limit: 500} + for i := 0; i < 499; i++ { + _, ok := p.Get() + if !ok { + t.Fatalf("pool exhausted before 499 items") + } + } + + _, ok := p.Get() + if ok { + t.Errorf("pool not exhausted when it should be") + } +} + +func TestPoolRecycle(t *testing.T) { + p := Pool{Start: 1, Limit: 500} + n1, _ := p.Get() + p.Put(n1) + n2, _ := p.Get() + if n1 != n2 { + t.Errorf("pool not recycling items") + } +} -- cgit v1.2.3 From 6bd59b4e08893281468e8af5aebb5fab0f7a8c0d Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Thu, 6 Feb 2020 11:12:41 -0800 Subject: Update link address for targets of Neighbor Adverts Get the link address for the target of an NDP Neighbor Advertisement from the NDP Target Link Layer Address option. Tests: - ipv6.TestNeighorAdvertisementWithTargetLinkLayerOption - ipv6.TestNeighorAdvertisementWithInvalidTargetLinkLayerOption PiperOrigin-RevId: 293632609 --- pkg/tcpip/network/ipv6/icmp.go | 44 ++++-- pkg/tcpip/network/ipv6/icmp_test.go | 186 +++++++++++++++--------- pkg/tcpip/network/ipv6/ndp_test.go | 278 +++++++++++++++++++++++++----------- pkg/tcpip/stack/ndp_test.go | 8 +- 4 files changed, 352 insertions(+), 164 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 60817d36d..45dc757c7 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -15,6 +15,8 @@ package ipv6 import ( + "log" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -194,7 +196,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P // TODO(b/148429853): Properly process the NS message and do Neighbor // Unreachability Detection. for { - opt, done, _ := it.Next() + opt, done, err := it.Next() + if err != nil { + // This should never happen as Iter(true) above did not return an error. + log.Fatalf("unexpected error when iterating over NDP options: %s", err) + } if done { break } @@ -253,21 +259,25 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P } na := header.NDPNeighborAdvert(h.NDPPayload()) + it, err := na.Options().Iter(true) + if err != nil { + // If we have a malformed NDP NA option, drop the packet. + received.Invalid.Increment() + return + } + targetAddr := na.TargetAddress() stack := r.Stack() rxNICID := r.NICID() - isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr) - if err != nil { + if isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr); err != nil { // We will only get an error if rxNICID is unrecognized, // which should not happen. For now short-circuit this // packet. // // TODO(b/141002840): Handle this better? return - } - - if isTentative { + } else if isTentative { // We just got an NA from a node that owns an address we // are performing DAD on, implying the address is not // unique. In this case we let the stack know so it can @@ -283,13 +293,29 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P // scenario is beyond the scope of RFC 4862. As such, we simply // ignore such a scenario for now and proceed as normal. // + // If the NA message has the target link layer option, update the link + // address cache with the link address for the target of the message. + // // TODO(b/143147598): Handle the scenario described above. Also // inform the netstack integration that a duplicate address was // detected outside of DAD. + // + // TODO(b/148429853): Properly process the NA message and do Neighbor + // Unreachability Detection. + for { + opt, done, err := it.Next() + if err != nil { + // This should never happen as Iter(true) above did not return an error. + log.Fatalf("unexpected error when iterating over NDP options: %s", err) + } + if done { + break + } - e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, r.RemoteLinkAddress) - if targetAddr != r.RemoteAddress { - e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, r.RemoteLinkAddress) + switch opt := opt.(type) { + case header.NDPTargetLinkLayerAddressOption: + e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, opt.EthernetAddress()) + } } case header.ICMPv6EchoRequest: diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index d0e930e20..50c4b6474 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -121,21 +121,60 @@ func TestICMPCounts(t *testing.T) { } defer r.Release() + var tllData [header.NDPLinkLayerAddressSize]byte + header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) + types := []struct { - typ header.ICMPv6Type - size int + typ header.ICMPv6Type + size int + extraData []byte }{ - {header.ICMPv6DstUnreachable, header.ICMPv6DstUnreachableMinimumSize}, - {header.ICMPv6PacketTooBig, header.ICMPv6PacketTooBigMinimumSize}, - {header.ICMPv6TimeExceeded, header.ICMPv6MinimumSize}, - {header.ICMPv6ParamProblem, header.ICMPv6MinimumSize}, - {header.ICMPv6EchoRequest, header.ICMPv6EchoMinimumSize}, - {header.ICMPv6EchoReply, header.ICMPv6EchoMinimumSize}, - {header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize}, - {header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize}, - {header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize}, - {header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize}, - {header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize}, + { + typ: header.ICMPv6DstUnreachable, + size: header.ICMPv6DstUnreachableMinimumSize, + }, + { + typ: header.ICMPv6PacketTooBig, + size: header.ICMPv6PacketTooBigMinimumSize, + }, + { + typ: header.ICMPv6TimeExceeded, + size: header.ICMPv6MinimumSize, + }, + { + typ: header.ICMPv6ParamProblem, + size: header.ICMPv6MinimumSize, + }, + { + typ: header.ICMPv6EchoRequest, + size: header.ICMPv6EchoMinimumSize, + }, + { + typ: header.ICMPv6EchoReply, + size: header.ICMPv6EchoMinimumSize, + }, + { + typ: header.ICMPv6RouterSolicit, + size: header.ICMPv6MinimumSize, + }, + { + typ: header.ICMPv6RouterAdvert, + size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize, + }, + { + typ: header.ICMPv6NeighborSolicit, + size: header.ICMPv6NeighborSolicitMinimumSize}, + { + typ: header.ICMPv6NeighborAdvert, + size: header.ICMPv6NeighborAdvertMinimumSize, + extraData: tllData[:], + }, + { + typ: header.ICMPv6RedirectMsg, + size: header.ICMPv6MinimumSize, + }, } handleIPv6Payload := func(hdr buffer.Prependable) { @@ -154,10 +193,13 @@ func TestICMPCounts(t *testing.T) { } for _, typ := range types { - hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size) + extraDataLen := len(typ.extraData) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen) + extraData := buffer.View(hdr.Prepend(extraDataLen)) + copy(extraData, typ.extraData) pkt := header.ICMPv6(hdr.Prepend(typ.size)) pkt.SetType(typ.typ) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView())) handleIPv6Payload(hdr) } @@ -372,97 +414,104 @@ func TestLinkResolution(t *testing.T) { } func TestICMPChecksumValidationSimple(t *testing.T) { + var tllData [header.NDPLinkLayerAddressSize]byte + header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) + types := []struct { name string typ header.ICMPv6Type size int + extraData []byte statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter }{ { - "DstUnreachable", - header.ICMPv6DstUnreachable, - header.ICMPv6DstUnreachableMinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "DstUnreachable", + typ: header.ICMPv6DstUnreachable, + size: header.ICMPv6DstUnreachableMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.DstUnreachable }, }, { - "PacketTooBig", - header.ICMPv6PacketTooBig, - header.ICMPv6PacketTooBigMinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "PacketTooBig", + typ: header.ICMPv6PacketTooBig, + size: header.ICMPv6PacketTooBigMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.PacketTooBig }, }, { - "TimeExceeded", - header.ICMPv6TimeExceeded, - header.ICMPv6MinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "TimeExceeded", + typ: header.ICMPv6TimeExceeded, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.TimeExceeded }, }, { - "ParamProblem", - header.ICMPv6ParamProblem, - header.ICMPv6MinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "ParamProblem", + typ: header.ICMPv6ParamProblem, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.ParamProblem }, }, { - "EchoRequest", - header.ICMPv6EchoRequest, - header.ICMPv6EchoMinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "EchoRequest", + typ: header.ICMPv6EchoRequest, + size: header.ICMPv6EchoMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.EchoRequest }, }, { - "EchoReply", - header.ICMPv6EchoReply, - header.ICMPv6EchoMinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "EchoReply", + typ: header.ICMPv6EchoReply, + size: header.ICMPv6EchoMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.EchoReply }, }, { - "RouterSolicit", - header.ICMPv6RouterSolicit, - header.ICMPv6MinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "RouterSolicit", + typ: header.ICMPv6RouterSolicit, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.RouterSolicit }, }, { - "RouterAdvert", - header.ICMPv6RouterAdvert, - header.ICMPv6HeaderSize + header.NDPRAMinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "RouterAdvert", + typ: header.ICMPv6RouterAdvert, + size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.RouterAdvert }, }, { - "NeighborSolicit", - header.ICMPv6NeighborSolicit, - header.ICMPv6NeighborSolicitMinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "NeighborSolicit", + typ: header.ICMPv6NeighborSolicit, + size: header.ICMPv6NeighborSolicitMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.NeighborSolicit }, }, { - "NeighborAdvert", - header.ICMPv6NeighborAdvert, - header.ICMPv6NeighborAdvertSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "NeighborAdvert", + typ: header.ICMPv6NeighborAdvert, + size: header.ICMPv6NeighborAdvertMinimumSize, + extraData: tllData[:], + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.NeighborAdvert }, }, { - "RedirectMsg", - header.ICMPv6RedirectMsg, - header.ICMPv6MinimumSize, - func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "RedirectMsg", + typ: header.ICMPv6RedirectMsg, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.RedirectMsg }, }, @@ -494,16 +543,19 @@ func TestICMPChecksumValidationSimple(t *testing.T) { ) } - handleIPv6Payload := func(typ header.ICMPv6Type, size int, checksum bool) { - hdr := buffer.NewPrependable(header.IPv6MinimumSize + size) - pkt := header.ICMPv6(hdr.Prepend(size)) - pkt.SetType(typ) + handleIPv6Payload := func(checksum bool) { + extraDataLen := len(typ.extraData) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen) + extraData := buffer.View(hdr.Prepend(extraDataLen)) + copy(extraData, typ.extraData) + pkt := header.ICMPv6(hdr.Prepend(typ.size)) + pkt.SetType(typ.typ) if checksum { - pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{})) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, extraData.ToVectorisedView())) } ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(size), + PayloadLength: uint16(typ.size + extraDataLen), NextHeader: uint8(header.ICMPv6ProtocolNumber), HopLimit: header.NDPHopLimit, SrcAddr: lladdr1, @@ -528,7 +580,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) { // Without setting checksum, the incoming packet should // be invalid. - handleIPv6Payload(typ.typ, typ.size, false) + handleIPv6Payload(false) if got := invalid.Value(); got != 1 { t.Fatalf("got invalid = %d, want = 1", got) } @@ -538,7 +590,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) { } // When checksum is set, it should be received. - handleIPv6Payload(typ.typ, typ.size, true) + handleIPv6Payload(true) if got := typStat.Value(); got != 1 { t.Fatalf("got %s = %d, want = 1", typ.name, got) } diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go index bd732f93f..c9395de52 100644 --- a/pkg/tcpip/network/ipv6/ndp_test.go +++ b/pkg/tcpip/network/ipv6/ndp_test.go @@ -70,76 +70,29 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack return s, ep } -// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving an -// NDP NS message with the Source Link Layer Address option results in a +// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving a +// valid NDP NS message with the Source Link Layer Address option results in a // new entry in the link address cache for the sender of the message. func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { const nicID = 1 - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - }) - e := channel.New(0, 1280, linkAddr0) - if err := s.CreateNIC(nicID, e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) - } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) - } - - ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize - hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) - pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) - pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) - ns.SetTargetAddress(lladdr0) - ns.Options().Serialize(header.NDPOptionsSerializer{ - header.NDPSourceLinkLayerAddressOption(linkAddr1), - }) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{})) - payloadLength := hdr.UsedLength() - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: lladdr1, - DstAddr: lladdr0, - }) - e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{ - Data: hdr.View().ToVectorisedView(), - }) - - linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) - if err != nil { - t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err) - } - if c != nil { - t.Errorf("got unexpected channel") - } - if linkAddr != linkAddr1 { - t.Errorf("got link address = %s, want = %s", linkAddr, linkAddr1) - } -} - -// TestNeighorSolicitationWithInvalidSourceLinkLayerOption tests that receiving -// an NDP NS message with an invalid Source Link Layer Address option does not -// result in a new entry in the link address cache for the sender of the -// message. -func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) { - const nicID = 1 - tests := []struct { - name string - optsBuf []byte + name string + optsBuf []byte + expectedLinkAddr tcpip.LinkAddress }{ + { + name: "Valid", + optsBuf: []byte{1, 1, 2, 3, 4, 5, 6, 7}, + expectedLinkAddr: "\x02\x03\x04\x05\x06\x07", + }, { name: "Too Small", - optsBuf: []byte{1, 1, 1, 2, 3, 4, 5}, + optsBuf: []byte{1, 1, 2, 3, 4, 5, 6}, }, { name: "Invalid Length", - optsBuf: []byte{1, 2, 1, 2, 3, 4, 5, 6}, + optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7}, }, } @@ -186,20 +139,138 @@ func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) { Data: hdr.View().ToVectorisedView(), }) - // Invalid count should have increased. - if got := invalid.Value(); got != 1 { - t.Fatalf("got invalid = %d, want = 1", got) + linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) + if linkAddr != test.expectedLinkAddr { + t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr) } - linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) - if err != tcpip.ErrWouldBlock { - t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock) + if test.expectedLinkAddr != "" { + if err != nil { + t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err) + } + if c != nil { + t.Errorf("got unexpected channel") + } + + // Invalid count should not have increased. + if got := invalid.Value(); got != 0 { + t.Errorf("got invalid = %d, want = 0", got) + } + } else { + if err != tcpip.ErrWouldBlock { + t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock) + } + if c == nil { + t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber) + } + + // Invalid count should have increased. + if got := invalid.Value(); got != 1 { + t.Errorf("got invalid = %d, want = 1", got) + } + } + }) + } +} + +// TestNeighorAdvertisementWithTargetLinkLayerOption tests that receiving a +// valid NDP NA message with the Target Link Layer Address option results in a +// new entry in the link address cache for the target of the message. +func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { + const nicID = 1 + + tests := []struct { + name string + optsBuf []byte + expectedLinkAddr tcpip.LinkAddress + }{ + { + name: "Valid", + optsBuf: []byte{2, 1, 2, 3, 4, 5, 6, 7}, + expectedLinkAddr: "\x02\x03\x04\x05\x06\x07", + }, + { + name: "Too Small", + optsBuf: []byte{2, 1, 2, 3, 4, 5, 6}, + }, + { + name: "Invalid Length", + optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, + }) + e := channel.New(0, 1280, linkAddr0) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + } + + ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize) + pkt := header.ICMPv6(hdr.Prepend(ndpNASize)) + pkt.SetType(header.ICMPv6NeighborAdvert) + ns := header.NDPNeighborAdvert(pkt.NDPPayload()) + ns.SetTargetAddress(lladdr1) + opts := ns.Options() + copy(opts, test.optsBuf) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, + }) + + invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + + // Invalid count should initially be 0. + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) } - if c == nil { - t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber) + + e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{ + Data: hdr.View().ToVectorisedView(), + }) + + linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) + if linkAddr != test.expectedLinkAddr { + t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr) } - if linkAddr != "" { - t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (%s, _, ), want = ('', _, _)", nicID, lladdr1, lladdr0, ProtocolNumber, linkAddr) + + if test.expectedLinkAddr != "" { + if err != nil { + t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err) + } + if c != nil { + t.Errorf("got unexpected channel") + } + + // Invalid count should not have increased. + if got := invalid.Value(); got != 0 { + t.Errorf("got invalid = %d, want = 0", got) + } + } else { + if err != tcpip.ErrWouldBlock { + t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock) + } + if c == nil { + t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber) + } + + // Invalid count should have increased. + if got := invalid.Value(); got != 1 { + t.Errorf("got invalid = %d, want = 1", got) + } } }) } @@ -238,27 +309,59 @@ func TestHopLimitValidation(t *testing.T) { }) } + var tllData [header.NDPLinkLayerAddressSize]byte + header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) + types := []struct { name string typ header.ICMPv6Type size int + extraData []byte statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter }{ - {"RouterSolicit", header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.RouterSolicit - }}, - {"RouterAdvert", header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.RouterAdvert - }}, - {"NeighborSolicit", header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.NeighborSolicit - }}, - {"NeighborAdvert", header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.NeighborAdvert - }}, - {"RedirectMsg", header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.RedirectMsg - }}, + { + name: "RouterSolicit", + typ: header.ICMPv6RouterSolicit, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.RouterSolicit + }, + }, + { + name: "RouterAdvert", + typ: header.ICMPv6RouterAdvert, + size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.RouterAdvert + }, + }, + { + name: "NeighborSolicit", + typ: header.ICMPv6NeighborSolicit, + size: header.ICMPv6NeighborSolicitMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.NeighborSolicit + }, + }, + { + name: "NeighborAdvert", + typ: header.ICMPv6NeighborAdvert, + size: header.ICMPv6NeighborAdvertMinimumSize, + extraData: tllData[:], + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.NeighborAdvert + }, + }, + { + name: "RedirectMsg", + typ: header.ICMPv6RedirectMsg, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.RedirectMsg + }, + }, } for _, typ := range types { @@ -270,10 +373,13 @@ func TestHopLimitValidation(t *testing.T) { invalid := stats.Invalid typStat := typ.statCounter(stats) - hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size) + extraDataLen := len(typ.extraData) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen) + extraData := buffer.View(hdr.Prepend(extraDataLen)) + copy(extraData, typ.extraData) pkt := header.ICMPv6(hdr.Prepend(typ.size)) pkt.SetType(typ.typ) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView())) // Invalid count should initially be 0. if got := invalid.Value(); got != 0 { diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 8af8565f7..9a4607dcb 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -478,13 +478,17 @@ func TestDADFail(t *testing.T) { { "RxAdvert", func(tgt tcpip.Address) buffer.Prependable { - hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) + naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize + hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize) + pkt := header.ICMPv6(hdr.Prepend(naSize)) pkt.SetType(header.ICMPv6NeighborAdvert) na := header.NDPNeighborAdvert(pkt.NDPPayload()) na.SetSolicitedFlag(true) na.SetOverrideFlag(true) na.SetTargetAddress(tgt) + na.Options().Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) pkt.SetChecksum(header.ICMPv6Checksum(pkt, tgt, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{})) payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) -- cgit v1.2.3 From 736775e0ac592c266acac0a7415f21d54715a54c Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 6 Feb 2020 14:03:49 -0800 Subject: Make gonet consistent both internally and with the net package. The types gonet.Conn and gonet.PacketConn were confusingly named as both implemented net.Conn. Further, gonet.Conn was perhaps unexpectedly TCP-specific (net.Conn is not). This change renames them to gonet.TCPConn and gonet.UDPConn. Renames gonet.NewListener to gonet.ListenTCP and adds a new gonet.NewTCPListner function to be consistent with both the gonet.DialXxx and gonet.NewXxxConn functions as well as net.ListenTCP. Updates #1632 PiperOrigin-RevId: 293671303 --- benchmarks/tcp/tcp_proxy.go | 2 +- pkg/tcpip/adapters/gonet/gonet.go | 111 +++++++++++++++++---------------- pkg/tcpip/adapters/gonet/gonet_test.go | 22 +++---- 3 files changed, 70 insertions(+), 65 deletions(-) (limited to 'pkg') diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go index 72ada5700..73b7c4f5b 100644 --- a/benchmarks/tcp/tcp_proxy.go +++ b/benchmarks/tcp/tcp_proxy.go @@ -274,7 +274,7 @@ func (n netstackImpl) listen(port int) (net.Listener, error) { NIC: nicID, Port: uint16(port), } - listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber) + listener, err := gonet.ListenTCP(n.s, addr, ipv4.ProtocolNumber) if err != nil { return nil, err } diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 711969b9b..6e0db2741 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -43,18 +43,28 @@ func (e *timeoutError) Error() string { return "i/o timeout" } func (e *timeoutError) Timeout() bool { return true } func (e *timeoutError) Temporary() bool { return true } -// A Listener is a wrapper around a tcpip endpoint that implements +// A TCPListener is a wrapper around a TCP tcpip.Endpoint that implements // net.Listener. -type Listener struct { +type TCPListener struct { stack *stack.Stack ep tcpip.Endpoint wq *waiter.Queue cancel chan struct{} } -// NewListener creates a new Listener. -func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Listener, error) { - // Create TCP endpoint, bind it, then start listening. +// NewTCPListener creates a new TCPListener from a listening tcpip.Endpoint. +func NewTCPListener(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *TCPListener { + return &TCPListener{ + stack: s, + ep: ep, + wq: wq, + cancel: make(chan struct{}), + } +} + +// ListenTCP creates a new TCPListener. +func ListenTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPListener, error) { + // Create a TCP endpoint, bind it, then start listening. var wq waiter.Queue ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq) if err != nil { @@ -81,28 +91,23 @@ func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkPr } } - return &Listener{ - stack: s, - ep: ep, - wq: &wq, - cancel: make(chan struct{}), - }, nil + return NewTCPListener(s, &wq, ep), nil } // Close implements net.Listener.Close. -func (l *Listener) Close() error { +func (l *TCPListener) Close() error { l.ep.Close() return nil } // Shutdown stops the HTTP server. -func (l *Listener) Shutdown() { +func (l *TCPListener) Shutdown() { l.ep.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead) close(l.cancel) // broadcast cancellation } // Addr implements net.Listener.Addr. -func (l *Listener) Addr() net.Addr { +func (l *TCPListener) Addr() net.Addr { a, err := l.ep.GetLocalAddress() if err != nil { return nil @@ -208,9 +213,9 @@ func (d *deadlineTimer) SetDeadline(t time.Time) error { return nil } -// A Conn is a wrapper around a tcpip.Endpoint that implements the net.Conn +// A TCPConn is a wrapper around a TCP tcpip.Endpoint that implements the net.Conn // interface. -type Conn struct { +type TCPConn struct { deadlineTimer wq *waiter.Queue @@ -228,9 +233,9 @@ type Conn struct { read buffer.View } -// NewConn creates a new Conn. -func NewConn(wq *waiter.Queue, ep tcpip.Endpoint) *Conn { - c := &Conn{ +// NewTCPConn creates a new TCPConn. +func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn { + c := &TCPConn{ wq: wq, ep: ep, } @@ -239,7 +244,7 @@ func NewConn(wq *waiter.Queue, ep tcpip.Endpoint) *Conn { } // Accept implements net.Conn.Accept. -func (l *Listener) Accept() (net.Conn, error) { +func (l *TCPListener) Accept() (net.Conn, error) { n, wq, err := l.ep.Accept() if err == tcpip.ErrWouldBlock { @@ -272,7 +277,7 @@ func (l *Listener) Accept() (net.Conn, error) { } } - return NewConn(wq, n), nil + return NewTCPConn(wq, n), nil } type opErrorer interface { @@ -323,7 +328,7 @@ func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, a } // Read implements net.Conn.Read. -func (c *Conn) Read(b []byte) (int, error) { +func (c *TCPConn) Read(b []byte) (int, error) { c.readMu.Lock() defer c.readMu.Unlock() @@ -352,7 +357,7 @@ func (c *Conn) Read(b []byte) (int, error) { } // Write implements net.Conn.Write. -func (c *Conn) Write(b []byte) (int, error) { +func (c *TCPConn) Write(b []byte) (int, error) { deadline := c.writeCancel() // Check if deadlineTimer has already expired. @@ -431,7 +436,7 @@ func (c *Conn) Write(b []byte) (int, error) { } // Close implements net.Conn.Close. -func (c *Conn) Close() error { +func (c *TCPConn) Close() error { c.ep.Close() return nil } @@ -440,7 +445,7 @@ func (c *Conn) Close() error { // should just use Close. // // A TCP Half-Close is performed the same as CloseRead for *net.TCPConn. -func (c *Conn) CloseRead() error { +func (c *TCPConn) CloseRead() error { if terr := c.ep.Shutdown(tcpip.ShutdownRead); terr != nil { return c.newOpError("close", errors.New(terr.String())) } @@ -451,7 +456,7 @@ func (c *Conn) CloseRead() error { // should just use Close. // // A TCP Half-Close is performed the same as CloseWrite for *net.TCPConn. -func (c *Conn) CloseWrite() error { +func (c *TCPConn) CloseWrite() error { if terr := c.ep.Shutdown(tcpip.ShutdownWrite); terr != nil { return c.newOpError("close", errors.New(terr.String())) } @@ -459,7 +464,7 @@ func (c *Conn) CloseWrite() error { } // LocalAddr implements net.Conn.LocalAddr. -func (c *Conn) LocalAddr() net.Addr { +func (c *TCPConn) LocalAddr() net.Addr { a, err := c.ep.GetLocalAddress() if err != nil { return nil @@ -468,7 +473,7 @@ func (c *Conn) LocalAddr() net.Addr { } // RemoteAddr implements net.Conn.RemoteAddr. -func (c *Conn) RemoteAddr() net.Addr { +func (c *TCPConn) RemoteAddr() net.Addr { a, err := c.ep.GetRemoteAddress() if err != nil { return nil @@ -476,7 +481,7 @@ func (c *Conn) RemoteAddr() net.Addr { return fullToTCPAddr(a) } -func (c *Conn) newOpError(op string, err error) *net.OpError { +func (c *TCPConn) newOpError(op string, err error) *net.OpError { return &net.OpError{ Op: op, Net: "tcp", @@ -494,14 +499,14 @@ func fullToUDPAddr(addr tcpip.FullAddress) *net.UDPAddr { return &net.UDPAddr{IP: net.IP(addr.Addr), Port: int(addr.Port)} } -// DialTCP creates a new TCP Conn connected to the specified address. -func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Conn, error) { +// DialTCP creates a new TCPConn connected to the specified address. +func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) { return DialContextTCP(context.Background(), s, addr, network) } -// DialContextTCP creates a new TCP Conn connected to the specified address +// DialContextTCP creates a new TCPConn connected to the specified address // with the option of adding cancellation and timeouts. -func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Conn, error) { +func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) { // Create TCP endpoint, then connect. var wq waiter.Queue ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq) @@ -543,12 +548,12 @@ func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, } } - return NewConn(&wq, ep), nil + return NewTCPConn(&wq, ep), nil } -// A PacketConn is a wrapper around a tcpip endpoint that implements -// net.PacketConn. -type PacketConn struct { +// A UDPConn is a wrapper around a UDP tcpip.Endpoint that implements +// net.Conn and net.PacketConn. +type UDPConn struct { deadlineTimer stack *stack.Stack @@ -556,9 +561,9 @@ type PacketConn struct { wq *waiter.Queue } -// NewPacketConn creates a new PacketConn. -func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketConn { - c := &PacketConn{ +// NewUDPConn creates a new UDPConn. +func NewUDPConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *UDPConn { + c := &UDPConn{ stack: s, ep: ep, wq: wq, @@ -567,12 +572,12 @@ func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketC return c } -// DialUDP creates a new PacketConn. +// DialUDP creates a new UDPConn. // // If laddr is nil, a local address is automatically chosen. // -// If raddr is nil, the PacketConn is left unconnected. -func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*PacketConn, error) { +// If raddr is nil, the UDPConn is left unconnected. +func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*UDPConn, error) { var wq waiter.Queue ep, err := s.NewEndpoint(udp.ProtocolNumber, network, &wq) if err != nil { @@ -591,7 +596,7 @@ func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.Netw } } - c := NewPacketConn(s, &wq, ep) + c := NewUDPConn(s, &wq, ep) if raddr != nil { if err := c.ep.Connect(*raddr); err != nil { @@ -608,11 +613,11 @@ func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.Netw return c, nil } -func (c *PacketConn) newOpError(op string, err error) *net.OpError { +func (c *UDPConn) newOpError(op string, err error) *net.OpError { return c.newRemoteOpError(op, nil, err) } -func (c *PacketConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError { +func (c *UDPConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError { return &net.OpError{ Op: op, Net: "udp", @@ -623,7 +628,7 @@ func (c *PacketConn) newRemoteOpError(op string, remote net.Addr, err error) *ne } // RemoteAddr implements net.Conn.RemoteAddr. -func (c *PacketConn) RemoteAddr() net.Addr { +func (c *UDPConn) RemoteAddr() net.Addr { a, err := c.ep.GetRemoteAddress() if err != nil { return nil @@ -632,13 +637,13 @@ func (c *PacketConn) RemoteAddr() net.Addr { } // Read implements net.Conn.Read -func (c *PacketConn) Read(b []byte) (int, error) { +func (c *UDPConn) Read(b []byte) (int, error) { bytesRead, _, err := c.ReadFrom(b) return bytesRead, err } // ReadFrom implements net.PacketConn.ReadFrom. -func (c *PacketConn) ReadFrom(b []byte) (int, net.Addr, error) { +func (c *UDPConn) ReadFrom(b []byte) (int, net.Addr, error) { deadline := c.readCancel() var addr tcpip.FullAddress @@ -650,12 +655,12 @@ func (c *PacketConn) ReadFrom(b []byte) (int, net.Addr, error) { return copy(b, read), fullToUDPAddr(addr), nil } -func (c *PacketConn) Write(b []byte) (int, error) { +func (c *UDPConn) Write(b []byte) (int, error) { return c.WriteTo(b, nil) } // WriteTo implements net.PacketConn.WriteTo. -func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) { +func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { deadline := c.writeCancel() // Check if deadline has already expired. @@ -713,13 +718,13 @@ func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) { } // Close implements net.PacketConn.Close. -func (c *PacketConn) Close() error { +func (c *UDPConn) Close() error { c.ep.Close() return nil } // LocalAddr implements net.PacketConn.LocalAddr. -func (c *PacketConn) LocalAddr() net.Addr { +func (c *UDPConn) LocalAddr() net.Addr { a, err := c.ep.GetLocalAddress() if err != nil { return nil diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index ee077ae83..ea0a0409a 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -41,7 +41,7 @@ const ( ) func TestTimeouts(t *testing.T) { - nc := NewConn(nil, nil) + nc := NewTCPConn(nil, nil) dlfs := []struct { name string f func(time.Time) error @@ -132,7 +132,7 @@ func TestCloseReader(t *testing.T) { s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) - l, e := NewListener(s, addr, ipv4.ProtocolNumber) + l, e := ListenTCP(s, addr, ipv4.ProtocolNumber) if e != nil { t.Fatalf("NewListener() = %v", e) } @@ -168,7 +168,7 @@ func TestCloseReader(t *testing.T) { sender.close() } -// TestCloseReaderWithForwarder tests that Conn.Close() wakes Conn.Read() when +// TestCloseReaderWithForwarder tests that TCPConn.Close wakes TCPConn.Read when // using tcp.Forwarder. func TestCloseReaderWithForwarder(t *testing.T) { s, err := newLoopbackStack() @@ -192,7 +192,7 @@ func TestCloseReaderWithForwarder(t *testing.T) { defer ep.Close() r.Complete(false) - c := NewConn(&wq, ep) + c := NewTCPConn(&wq, ep) // Give c.Read() a chance to block before closing the connection. time.AfterFunc(time.Millisecond*50, func() { @@ -238,7 +238,7 @@ func TestCloseRead(t *testing.T) { defer ep.Close() r.Complete(false) - c := NewConn(&wq, ep) + c := NewTCPConn(&wq, ep) buf := make([]byte, 256) n, e := c.Read(buf) @@ -257,7 +257,7 @@ func TestCloseRead(t *testing.T) { if terr != nil { t.Fatalf("connect() = %v", terr) } - c := NewConn(tc.wq, tc.ep) + c := NewTCPConn(tc.wq, tc.ep) if err := c.CloseRead(); err != nil { t.Errorf("c.CloseRead() = %v", err) @@ -291,7 +291,7 @@ func TestCloseWrite(t *testing.T) { defer ep.Close() r.Complete(false) - c := NewConn(&wq, ep) + c := NewTCPConn(&wq, ep) n, e := c.Read(make([]byte, 256)) if n != 0 || e != io.EOF { @@ -309,7 +309,7 @@ func TestCloseWrite(t *testing.T) { if terr != nil { t.Fatalf("connect() = %v", terr) } - c := NewConn(tc.wq, tc.ep) + c := NewTCPConn(tc.wq, tc.ep) if err := c.CloseWrite(); err != nil { t.Errorf("c.CloseWrite() = %v", err) @@ -353,7 +353,7 @@ func TestUDPForwarder(t *testing.T) { } defer ep.Close() - c := NewConn(&wq, ep) + c := NewTCPConn(&wq, ep) buf := make([]byte, 256) n, e := c.Read(buf) @@ -396,7 +396,7 @@ func TestDeadlineChange(t *testing.T) { s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) - l, e := NewListener(s, addr, ipv4.ProtocolNumber) + l, e := ListenTCP(s, addr, ipv4.ProtocolNumber) if e != nil { t.Fatalf("NewListener() = %v", e) } @@ -541,7 +541,7 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) { addr := tcpip.FullAddress{NICID, ip, 11211} s.AddAddress(NICID, ipv4.ProtocolNumber, ip) - l, err := NewListener(s, addr, ipv4.ProtocolNumber) + l, err := ListenTCP(s, addr, ipv4.ProtocolNumber) if err != nil { return nil, nil, nil, fmt.Errorf("NewListener: %v", err) } -- cgit v1.2.3 From 940d255971c38af9f91ceed1345fd973f8fdb41d Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Thu, 6 Feb 2020 15:57:34 -0800 Subject: Perform DAD on IPv6 addresses when enabling a NIC Addresses may be added before a NIC is enabled. Make sure DAD is performed on the permanent IPv6 addresses when they get enabled. Test: - stack_test.TestDoDADWhenNICEnabled - stack.TestDisabledRxStatsWhenNICDisabled PiperOrigin-RevId: 293697429 --- pkg/tcpip/stack/BUILD | 6 ++- pkg/tcpip/stack/ndp_test.go | 74 +++++++++++++-------------- pkg/tcpip/stack/nic.go | 84 ++++++++++++++++++++++-------- pkg/tcpip/stack/nic_test.go | 62 +++++++++++++++++++++++ pkg/tcpip/stack/stack_test.go | 115 ++++++++++++++++++++++++++++++++++++++++++ pkg/tcpip/tcpip.go | 8 +-- 6 files changed, 287 insertions(+), 62 deletions(-) create mode 100644 pkg/tcpip/stack/nic_test.go (limited to 'pkg') diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index f5b750046..705cf01ee 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -78,11 +78,15 @@ go_test( go_test( name = "stack_test", size = "small", - srcs = ["linkaddrcache_test.go"], + srcs = [ + "linkaddrcache_test.go", + "nic_test.go", + ], library = ":stack", deps = [ "//pkg/sleep", "//pkg/sync", "//pkg/tcpip", + "//pkg/tcpip/buffer", ], ) diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 9a4607dcb..1e575bdaf 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -1539,7 +1539,7 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { } // Checks to see if list contains an IPv6 address, item. -func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool { +func containsV6Addr(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool { protocolAddress := tcpip.ProtocolAddress{ Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: item, @@ -1665,7 +1665,7 @@ func TestAutoGenAddr(t *testing.T) { // with non-zero lifetime. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) expectAutoGenAddrEvent(addr1, newAddr) - if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) { t.Fatalf("Should have %s in the list of addresses", addr1) } @@ -1681,10 +1681,10 @@ func TestAutoGenAddr(t *testing.T) { // Receive an RA with prefix2 in a PI. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) expectAutoGenAddrEvent(addr2, newAddr) - if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) { t.Fatalf("Should have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) { t.Fatalf("Should have %s in the list of addresses", addr2) } @@ -1705,10 +1705,10 @@ func TestAutoGenAddr(t *testing.T) { case <-time.After(newMinVLDuration + defaultAsyncEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } - if contains(s.NICInfo()[1].ProtocolAddresses, addr1) { + if containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) { t.Fatalf("Should not have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) { t.Fatalf("Should have %s in the list of addresses", addr2) } } @@ -1853,7 +1853,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { // Receive PI for prefix1. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) expectAutoGenAddrEvent(addr1, newAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should have %s in the list of addresses", addr1) } expectPrimaryAddr(addr1) @@ -1861,7 +1861,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { // Deprecate addr for prefix1 immedaitely. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) expectAutoGenAddrEvent(addr1, deprecatedAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should have %s in the list of addresses", addr1) } // addr should still be the primary endpoint as there are no other addresses. @@ -1879,7 +1879,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { // Receive PI for prefix2. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) expectAutoGenAddrEvent(addr2, newAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } expectPrimaryAddr(addr2) @@ -1887,7 +1887,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { // Deprecate addr for prefix2 immedaitely. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) expectAutoGenAddrEvent(addr2, deprecatedAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } // addr1 should be the primary endpoint now since addr2 is deprecated but @@ -1982,7 +1982,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // Receive PI for prefix2. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) expectAutoGenAddrEvent(addr2, newAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } expectPrimaryAddr(addr2) @@ -1990,10 +1990,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // Receive a PI for prefix1. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90)) expectAutoGenAddrEvent(addr1, newAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } expectPrimaryAddr(addr1) @@ -2009,10 +2009,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // Wait for addr of prefix1 to be deprecated. expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } // addr2 should be the primary endpoint now since addr1 is deprecated but @@ -2049,10 +2049,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // Wait for addr of prefix1 to be deprecated. expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } // addr2 should be the primary endpoint now since it is not deprecated. @@ -2063,10 +2063,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // Wait for addr of prefix1 to be invalidated. expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncEventTimeout) - if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } expectPrimaryAddr(addr2) @@ -2112,10 +2112,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { case <-time.After(newMinVLDuration + defaultAsyncEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } - if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } - if contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should not have %s in the list of addresses", addr2) } // Should not have any primary endpoints. @@ -2600,7 +2600,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) { if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil { t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err) } - if !contains(s.NICInfo()[1].ProtocolAddresses, addr) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) { t.Fatalf("Should have %s in the list of addresses", addr1) } @@ -2613,7 +2613,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) { t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically") default: } - if !contains(s.NICInfo()[1].ProtocolAddresses, addr) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) { t.Fatalf("Should have %s in the list of addresses", addr1) } @@ -2624,7 +2624,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) { t.Fatal("unexpectedly received an auto gen addr event") case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): } - if !contains(s.NICInfo()[1].ProtocolAddresses, addr) { + if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) { t.Fatalf("Should have %s in the list of addresses", addr1) } } @@ -2702,17 +2702,17 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) { const validLifetimeSecondPrefix1 = 1 e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0)) expectAutoGenAddrEvent(addr1, newAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should have %s in the list of addresses", addr1) } // Receive an RA with prefix2 in a PI with a large valid lifetime. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) expectAutoGenAddrEvent(addr2, newAddr) - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } @@ -2725,10 +2725,10 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) { case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } - if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } - if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) } } @@ -3014,16 +3014,16 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { nicinfo := s.NICInfo() nic1Addrs := nicinfo[nicID1].ProtocolAddresses nic2Addrs := nicinfo[nicID2].ProtocolAddresses - if !contains(nic1Addrs, e1Addr1) { + if !containsV6Addr(nic1Addrs, e1Addr1) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs) } - if !contains(nic1Addrs, e1Addr2) { + if !containsV6Addr(nic1Addrs, e1Addr2) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs) } - if !contains(nic2Addrs, e2Addr1) { + if !containsV6Addr(nic2Addrs, e2Addr1) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs) } - if !contains(nic2Addrs, e2Addr2) { + if !containsV6Addr(nic2Addrs, e2Addr2) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs) } @@ -3102,16 +3102,16 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { nicinfo = s.NICInfo() nic1Addrs = nicinfo[nicID1].ProtocolAddresses nic2Addrs = nicinfo[nicID2].ProtocolAddresses - if contains(nic1Addrs, e1Addr1) { + if containsV6Addr(nic1Addrs, e1Addr1) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs) } - if contains(nic1Addrs, e1Addr2) { + if containsV6Addr(nic1Addrs, e1Addr2) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs) } - if contains(nic2Addrs, e2Addr1) { + if containsV6Addr(nic2Addrs, e2Addr1) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs) } - if contains(nic2Addrs, e2Addr2) { + if containsV6Addr(nic2Addrs, e2Addr2) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs) } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 7dad9a8cb..682e9c416 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -16,6 +16,7 @@ package stack import ( "log" + "reflect" "sort" "strings" "sync/atomic" @@ -39,6 +40,7 @@ type NIC struct { mu struct { sync.RWMutex + enabled bool spoofing bool promiscuous bool primary map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint @@ -56,6 +58,14 @@ type NIC struct { type NICStats struct { Tx DirectionStats Rx DirectionStats + + DisabledRx DirectionStats +} + +func makeNICStats() NICStats { + var s NICStats + tcpip.InitStatCounters(reflect.ValueOf(&s).Elem()) + return s } // DirectionStats includes packet and byte counts. @@ -99,16 +109,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC name: name, linkEP: ep, context: ctx, - stats: NICStats{ - Tx: DirectionStats{ - Packets: &tcpip.StatCounter{}, - Bytes: &tcpip.StatCounter{}, - }, - Rx: DirectionStats{ - Packets: &tcpip.StatCounter{}, - Bytes: &tcpip.StatCounter{}, - }, - }, + stats: makeNICStats(), } nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint) nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint) @@ -137,14 +138,30 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC // enable enables the NIC. enable will attach the link to its LinkEndpoint and // join the IPv6 All-Nodes Multicast address (ff02::1). func (n *NIC) enable() *tcpip.Error { + n.mu.RLock() + enabled := n.mu.enabled + n.mu.RUnlock() + if enabled { + return nil + } + + n.mu.Lock() + defer n.mu.Unlock() + + if n.mu.enabled { + return nil + } + + n.mu.enabled = true + n.attachLinkEndpoint() // Create an endpoint to receive broadcast packets on this interface. if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok { - if err := n.AddAddress(tcpip.ProtocolAddress{ + if _, err := n.addAddressLocked(tcpip.ProtocolAddress{ Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize}, - }, NeverPrimaryEndpoint); err != nil { + }, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil { return err } } @@ -166,8 +183,22 @@ func (n *NIC) enable() *tcpip.Error { return nil } - n.mu.Lock() - defer n.mu.Unlock() + // Perform DAD on the all the unicast IPv6 endpoints that are in the permanent + // state. + // + // Addresses may have aleady completed DAD but in the time since the NIC was + // last enabled, other devices may have acquired the same addresses. + for _, r := range n.mu.endpoints { + addr := r.ep.ID().LocalAddress + if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) { + continue + } + + r.setKind(permanentTentative) + if err := n.mu.ndp.startDuplicateAddressDetection(addr, r); err != nil { + return err + } + } if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil { return err @@ -633,7 +664,9 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address) // If the address is an IPv6 address and it is a permanent address, - // mark it as tentative so it goes through the DAD process. + // mark it as tentative so it goes through the DAD process if the NIC is + // enabled. If the NIC is not enabled, DAD will be started when the NIC is + // enabled. if isIPv6Unicast && kind == permanent { kind = permanentTentative } @@ -668,8 +701,8 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar n.insertPrimaryEndpointLocked(ref, peb) - // If we are adding a tentative IPv6 address, start DAD. - if isIPv6Unicast && kind == permanentTentative { + // If we are adding a tentative IPv6 address, start DAD if the NIC is enabled. + if isIPv6Unicast && kind == permanentTentative && n.mu.enabled { if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil { return nil, err } @@ -700,9 +733,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress { // Don't include tentative, expired or temporary endpoints to // avoid confusion and prevent the caller from using those. switch ref.getKind() { - case permanentTentative, permanentExpired, temporary: - // TODO(b/140898488): Should tentative addresses be - // returned? + case permanentExpired, temporary: continue } addrs = append(addrs, tcpip.ProtocolAddress{ @@ -1016,11 +1047,23 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, // This rule applies only to the slice itself, not to the items of the slice; // the ownership of the items is not retained by the caller. func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) { + n.mu.RLock() + enabled := n.mu.enabled + // If the NIC is not yet enabled, don't receive any packets. + if !enabled { + n.mu.RUnlock() + + n.stats.DisabledRx.Packets.Increment() + n.stats.DisabledRx.Bytes.IncrementBy(uint64(pkt.Data.Size())) + return + } + n.stats.Rx.Packets.Increment() n.stats.Rx.Bytes.IncrementBy(uint64(pkt.Data.Size())) netProto, ok := n.stack.networkProtocols[protocol] if !ok { + n.mu.RUnlock() n.stack.stats.UnknownProtocolRcvdPackets.Increment() return } @@ -1032,7 +1075,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link } // Are any packet sockets listening for this network protocol? - n.mu.RLock() packetEPs := n.mu.packetEPs[protocol] // Check whether there are packet sockets listening for every protocol. // If we received a packet with protocol EthernetProtocolAll, then the diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go new file mode 100644 index 000000000..edaee3b86 --- /dev/null +++ b/pkg/tcpip/stack/nic_test.go @@ -0,0 +1,62 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" +) + +func TestDisabledRxStatsWhenNICDisabled(t *testing.T) { + // When the NIC is disabled, the only field that matters is the stats field. + // This test is limited to stats counter checks. + nic := NIC{ + stats: makeNICStats(), + } + + if got := nic.stats.DisabledRx.Packets.Value(); got != 0 { + t.Errorf("got DisabledRx.Packets = %d, want = 0", got) + } + if got := nic.stats.DisabledRx.Bytes.Value(); got != 0 { + t.Errorf("got DisabledRx.Bytes = %d, want = 0", got) + } + if got := nic.stats.Rx.Packets.Value(); got != 0 { + t.Errorf("got Rx.Packets = %d, want = 0", got) + } + if got := nic.stats.Rx.Bytes.Value(); got != 0 { + t.Errorf("got Rx.Bytes = %d, want = 0", got) + } + + if t.Failed() { + t.FailNow() + } + + nic.DeliverNetworkPacket(nil, "", "", 0, tcpip.PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()}) + + if got := nic.stats.DisabledRx.Packets.Value(); got != 1 { + t.Errorf("got DisabledRx.Packets = %d, want = 1", got) + } + if got := nic.stats.DisabledRx.Bytes.Value(); got != 4 { + t.Errorf("got DisabledRx.Bytes = %d, want = 4", got) + } + if got := nic.stats.Rx.Packets.Value(); got != 0 { + t.Errorf("got Rx.Packets = %d, want = 0", got) + } + if got := nic.stats.Rx.Bytes.Value(); got != 0 { + t.Errorf("got Rx.Bytes = %d, want = 0", got) + } +} diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 834fe9487..243868f3a 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -2561,3 +2561,118 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) { }) } } + +// TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC +// was disabled have DAD performed on them when the NIC is enabled. +func TestDoDADWhenNICEnabled(t *testing.T) { + t.Parallel() + + const dadTransmits = 1 + const retransmitTimer = time.Second + const nicID = 1 + + ndpDisp := ndpDispatcher{ + dadC: make(chan ndpDADEvent), + } + opts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPConfigs: stack.NDPConfigurations{ + DupAddrDetectTransmits: dadTransmits, + RetransmitTimer: retransmitTimer, + }, + NDPDisp: &ndpDisp, + } + + e := channel.New(dadTransmits, 1280, linkAddr1) + s := stack.New(opts) + nicOpts := stack.NICOptions{Disabled: true} + if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { + t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) + } + + addr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: llAddr1, + PrefixLen: 128, + }, + } + if err := s.AddProtocolAddress(nicID, addr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err) + } + + // Address should be in the list of all addresses. + if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { + t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr) + } + + // Address should be tentative so it should not be a main address. + got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } + if want := (tcpip.AddressWithPrefix{}); got != want { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want) + } + + // Enabling the NIC should start DAD for the address. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { + t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr) + } + + // Address should not be considered bound to the NIC yet (DAD ongoing). + got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } + if want := (tcpip.AddressWithPrefix{}); got != want { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want) + } + + // Wait for DAD to resolve. + select { + case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout): + t.Fatal("timed out waiting for DAD resolution") + case e := <-ndpDisp.dadC: + if e.err != nil { + t.Fatal("got DAD error: ", e.err) + } + if e.nicID != nicID { + t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID) + } + if e.addr != addr.AddressWithPrefix.Address { + t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr.AddressWithPrefix.Address) + } + if !e.resolved { + t.Fatal("got DAD event w/ resolved = false, want = true") + } + } + if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { + t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr) + } + got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } + if got != addr.AddressWithPrefix { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix) + } + + // Enabling the NIC again should be a no-op. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { + t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr) + } + got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } + if got != addr.AddressWithPrefix { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix) + } +} diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index d29d9a704..0e944712f 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1170,7 +1170,9 @@ type TransportEndpointStats struct { // marker interface. func (*TransportEndpointStats) IsEndpointStats() {} -func fillIn(v reflect.Value) { +// InitStatCounters initializes v's fields with nil StatCounter fields to new +// StatCounters. +func InitStatCounters(v reflect.Value) { for i := 0; i < v.NumField(); i++ { v := v.Field(i) if s, ok := v.Addr().Interface().(**StatCounter); ok { @@ -1178,14 +1180,14 @@ func fillIn(v reflect.Value) { *s = new(StatCounter) } } else { - fillIn(v) + InitStatCounters(v) } } } // FillIn returns a copy of s with nil fields initialized to new StatCounters. func (s Stats) FillIn() Stats { - fillIn(reflect.ValueOf(&s).Elem()) + InitStatCounters(reflect.ValueOf(&s).Elem()) return s } -- cgit v1.2.3 From 3700221b1f3ff0779a0f4479fd2bafa3312d5a23 Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Thu, 6 Feb 2020 16:42:37 -0800 Subject: Auto-generate link-local address as a SLAAC address Auto-generated link-local addresses should have the same lifecycle hooks as global SLAAC addresses. The Stack's NDP dispatcher should be notified when link-local addresses are auto-generated and invalidated. They should also be removed when a NIC is disabled (which will be supported in a later change). Tests: - stack_test.TestNICAutoGenAddrWithOpaque - stack_test.TestNICAutoGenAddr PiperOrigin-RevId: 293706760 --- pkg/tcpip/stack/ndp.go | 36 +++-- pkg/tcpip/stack/ndp_test.go | 85 +++++++----- pkg/tcpip/stack/nic.go | 30 +---- pkg/tcpip/stack/stack_test.go | 307 +++++++++++++++++++----------------------- 4 files changed, 218 insertions(+), 240 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 6123fda33..fae5f5014 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -906,22 +906,21 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform return } - // We do not already have an address within the prefix, prefix. Do the + // We do not already have an address with the prefix prefix. Do the // work as outlined by RFC 4862 section 5.5.3.d if n is configured - // to auto-generated global addresses by SLAAC. - ndp.newAutoGenAddress(prefix, pl, vl) + // to auto-generate global addresses by SLAAC. + if !ndp.configs.AutoGenGlobalAddresses { + return + } + + ndp.doSLAAC(prefix, pl, vl) } -// newAutoGenAddress generates a new SLAAC address with the provided lifetimes +// doSLAAC generates a new SLAAC address with the provided lifetimes // for prefix. // // pl is the new preferred lifetime. vl is the new valid lifetime. -func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration) { - // Are we configured to auto-generate new global addresses? - if !ndp.configs.AutoGenGlobalAddresses { - return - } - +func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { // If we do not already have an address for this prefix and the valid // lifetime is 0, no need to do anything further, as per RFC 4862 // section 5.5.3.d. @@ -1152,12 +1151,21 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo // // The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupHostOnlyState() { + linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet() + linkLocalAddrs := 0 for addr := range ndp.autoGenAddresses { + // RFC 4862 section 5 states that routers are also expected to generate a + // link-local address so we do not invalidate them. + if linkLocalSubnet.Contains(addr) { + linkLocalAddrs++ + continue + } + ndp.invalidateAutoGenAddress(addr) } - if got := len(ndp.autoGenAddresses); got != 0 { - log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got) + if got := len(ndp.autoGenAddresses); got != linkLocalAddrs { + log.Fatalf("ndp: still have non-linklocal auto-generated addresses after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalAddrs) } for prefix := range ndp.onLinkPrefixes { @@ -1165,7 +1173,7 @@ func (ndp *ndpState) cleanupHostOnlyState() { } if got := len(ndp.onLinkPrefixes); got != 0 { - log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got) + log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got) } for router := range ndp.defaultRouters { @@ -1173,7 +1181,7 @@ func (ndp *ndpState) cleanupHostOnlyState() { } if got := len(ndp.defaultRouters); got != 0 { - log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got) + log.Fatalf("ndp: still have discovered default routers after cleaning up; found = %d", got) } } diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 1e575bdaf..e13509fbd 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -42,6 +42,7 @@ const ( linkAddr1 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") linkAddr2 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07") linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08") + linkAddr4 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09") defaultTimeout = 100 * time.Millisecond defaultAsyncEventTimeout = time.Second ) @@ -50,6 +51,7 @@ var ( llAddr1 = header.LinkLocalAddr(linkAddr1) llAddr2 = header.LinkLocalAddr(linkAddr2) llAddr3 = header.LinkLocalAddr(linkAddr3) + llAddr4 = header.LinkLocalAddr(linkAddr4) dstAddr = tcpip.FullAddress{ Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", Port: 25, @@ -2882,8 +2884,8 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) { } // TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers -// and prefixes, and auto-generated addresses get invalidated when a NIC -// becomes a router. +// and prefixes, and non-linklocal auto-generated addresses are invalidated when +// a NIC becomes a router. func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { t.Parallel() @@ -2898,6 +2900,14 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1) e2Addr1 := addrForSubnet(subnet1, linkAddr2) e2Addr2 := addrForSubnet(subnet2, linkAddr2) + llAddrWithPrefix1 := tcpip.AddressWithPrefix{ + Address: llAddr1, + PrefixLen: 64, + } + llAddrWithPrefix2 := tcpip.AddressWithPrefix{ + Address: llAddr2, + PrefixLen: 64, + } ndpDisp := ndpDispatcher{ routerC: make(chan ndpRouterEvent, maxEvents), @@ -2907,7 +2917,8 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, maxEvents), } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + AutoGenIPv6LinkLocal: true, NDPConfigs: stack.NDPConfigurations{ HandleRAs: true, DiscoverDefaultRouters: true, @@ -2917,16 +2928,6 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { NDPDisp: &ndpDisp, }) - e1 := channel.New(0, 1280, linkAddr1) - if err := s.CreateNIC(nicID1, e1); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err) - } - - e2 := channel.New(0, 1280, linkAddr2) - if err := s.CreateNIC(nicID2, e2); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err) - } - expectRouterEvent := func() (bool, ndpRouterEvent) { select { case e := <-ndpDisp.routerC: @@ -2957,18 +2958,30 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { return false, ndpAutoGenAddrEvent{} } - // Receive RAs on NIC(1) and NIC(2) from default routers (llAddr1 and - // llAddr2) w/ PI (for prefix1 in RA from llAddr1 and prefix2 in RA from - // llAddr2) to discover multiple routers and prefixes, and auto-gen - // multiple addresses. - - e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) + e1 := channel.New(0, 1280, linkAddr1) + if err := s.CreateNIC(nicID1, e1); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err) + } // We have other tests that make sure we receive the *correct* events // on normal discovery of routers/prefixes, and auto-generated // addresses. Here we just make sure we get an event and let other tests // handle the correctness check. + expectAutoGenAddrEvent() + + e2 := channel.New(0, 1280, linkAddr2) + if err := s.CreateNIC(nicID2, e2); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err) + } + expectAutoGenAddrEvent() + + // Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and + // llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from + // llAddr4) to discover multiple routers and prefixes, and auto-gen + // multiple addresses. + + e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID1) + t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1) @@ -2977,9 +2990,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1) } - e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) + e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID1) + t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1) @@ -2988,9 +3001,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1) } - e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) + e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID2) + t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2) @@ -2999,9 +3012,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2) } - e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) + e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID2) + t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2) @@ -3014,12 +3027,18 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { nicinfo := s.NICInfo() nic1Addrs := nicinfo[nicID1].ProtocolAddresses nic2Addrs := nicinfo[nicID2].ProtocolAddresses + if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs) + } if !containsV6Addr(nic1Addrs, e1Addr1) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs) } if !containsV6Addr(nic1Addrs, e1Addr2) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs) } + if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs) + } if !containsV6Addr(nic2Addrs, e2Addr1) { t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs) } @@ -3071,10 +3090,10 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { } expectedRouterEvents := map[ndpRouterEvent]int{ - {nicID: nicID1, addr: llAddr1, discovered: false}: 1, - {nicID: nicID1, addr: llAddr2, discovered: false}: 1, - {nicID: nicID2, addr: llAddr1, discovered: false}: 1, - {nicID: nicID2, addr: llAddr2, discovered: false}: 1, + {nicID: nicID1, addr: llAddr3, discovered: false}: 1, + {nicID: nicID1, addr: llAddr4, discovered: false}: 1, + {nicID: nicID2, addr: llAddr3, discovered: false}: 1, + {nicID: nicID2, addr: llAddr4, discovered: false}: 1, } if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" { t.Errorf("router events mismatch (-want +got):\n%s", diff) @@ -3102,12 +3121,18 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) { nicinfo = s.NICInfo() nic1Addrs = nicinfo[nicID1].ProtocolAddresses nic2Addrs = nicinfo[nicID2].ProtocolAddresses + if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs) + } if containsV6Addr(nic1Addrs, e1Addr1) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs) } if containsV6Addr(nic1Addrs, e1Addr2) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs) } + if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) { + t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs) + } if containsV6Addr(nic2Addrs, e2Addr1) { t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs) } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 682e9c416..78d451cca 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -206,33 +206,9 @@ func (n *NIC) enable() *tcpip.Error { // Do not auto-generate an IPv6 link-local address for loopback devices. if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() { - var addr tcpip.Address - if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil { - addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey) - } else { - l2addr := n.linkEP.LinkAddress() - - // Only attempt to generate the link-local address if we have a valid MAC - // address. - // - // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by - // LinkEndpoint.LinkAddress) before reaching this point. - if !header.IsValidUnicastEthernetAddress(l2addr) { - return nil - } - - addr = header.LinkLocalAddr(l2addr) - } - - if _, err := n.addAddressLocked(tcpip.ProtocolAddress{ - Protocol: header.IPv6ProtocolNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: addr, - PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen, - }, - }, CanBePrimaryEndpoint, permanent, static, false /* deprecated */); err != nil { - return err - } + // The valid and preferred lifetime is infinite for the auto-generated + // link-local address. + n.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime) } // If we are operating as a router, then do not solicit routers since we diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 243868f3a..b2c1763bf 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -1894,112 +1894,6 @@ func TestNICForwarding(t *testing.T) { } } -// TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses -// using the modified EUI-64 of the NIC's MAC address (or lack there-of if -// disabled (default)). Note, DAD will be disabled in these tests. -func TestNICAutoGenAddr(t *testing.T) { - tests := []struct { - name string - autoGen bool - linkAddr tcpip.LinkAddress - iidOpts stack.OpaqueInterfaceIdentifierOptions - shouldGen bool - }{ - { - "Disabled", - false, - linkAddr1, - stack.OpaqueInterfaceIdentifierOptions{ - NICNameFromID: func(nicID tcpip.NICID, _ string) string { - return fmt.Sprintf("nic%d", nicID) - }, - }, - false, - }, - { - "Enabled", - true, - linkAddr1, - stack.OpaqueInterfaceIdentifierOptions{}, - true, - }, - { - "Nil MAC", - true, - tcpip.LinkAddress([]byte(nil)), - stack.OpaqueInterfaceIdentifierOptions{}, - false, - }, - { - "Empty MAC", - true, - tcpip.LinkAddress(""), - stack.OpaqueInterfaceIdentifierOptions{}, - false, - }, - { - "Invalid MAC", - true, - tcpip.LinkAddress("\x01\x02\x03"), - stack.OpaqueInterfaceIdentifierOptions{}, - false, - }, - { - "Multicast MAC", - true, - tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"), - stack.OpaqueInterfaceIdentifierOptions{}, - false, - }, - { - "Unspecified MAC", - true, - tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"), - stack.OpaqueInterfaceIdentifierOptions{}, - false, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - OpaqueIIDOpts: test.iidOpts, - } - - if test.autoGen { - // Only set opts.AutoGenIPv6LinkLocal when test.autoGen is true because - // opts.AutoGenIPv6LinkLocal should be false by default. - opts.AutoGenIPv6LinkLocal = true - } - - e := channel.New(10, 1280, test.linkAddr) - s := stack.New(opts) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) - } - - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err) - } - - if test.shouldGen { - // Should have auto-generated an address and resolved immediately (DAD - // is disabled). - if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want) - } - } else { - // Should not have auto-generated an address. - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) - } - } - }) - } -} - // TestNICContextPreservation tests that you can read out via stack.NICInfo the // Context data you pass via NICContext.Context in stack.CreateNICWithOptions. func TestNICContextPreservation(t *testing.T) { @@ -2040,11 +1934,9 @@ func TestNICContextPreservation(t *testing.T) { } } -// TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local -// addresses with opaque interface identifiers. Link Local addresses should -// always be generated with opaque IIDs if configured to use them, even if the -// NIC has an invalid MAC address. -func TestNICAutoGenAddrWithOpaque(t *testing.T) { +// TestNICAutoGenLinkLocalAddr tests the auto-generation of IPv6 link-local +// addresses. +func TestNICAutoGenLinkLocalAddr(t *testing.T) { const nicID = 1 var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte @@ -2056,108 +1948,185 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) { t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n) } + nicNameFunc := func(_ tcpip.NICID, name string) string { + return name + } + tests := []struct { - name string - nicName string - autoGen bool - linkAddr tcpip.LinkAddress - secretKey []byte + name string + nicName string + autoGen bool + linkAddr tcpip.LinkAddress + iidOpts stack.OpaqueInterfaceIdentifierOptions + shouldGen bool + expectedAddr tcpip.Address }{ { name: "Disabled", nicName: "nic1", autoGen: false, linkAddr: linkAddr1, - secretKey: secretKey[:], + shouldGen: false, }, { - name: "Enabled", - nicName: "nic1", - autoGen: true, - linkAddr: linkAddr1, - secretKey: secretKey[:], + name: "Disabled without OIID options", + nicName: "nic1", + autoGen: false, + linkAddr: linkAddr1, + iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: nicNameFunc, + SecretKey: secretKey[:], + }, + shouldGen: false, }, - // These are all cases where we would not have generated a - // link-local address if opaque IIDs were disabled. + + // Tests for EUI64 based addresses. { - name: "Nil MAC and empty nicName", - nicName: "", + name: "EUI64 Enabled", + autoGen: true, + linkAddr: linkAddr1, + shouldGen: true, + expectedAddr: header.LinkLocalAddr(linkAddr1), + }, + { + name: "EUI64 Empty MAC", autoGen: true, - linkAddr: tcpip.LinkAddress([]byte(nil)), - secretKey: secretKey[:1], + shouldGen: false, }, { - name: "Empty MAC and empty nicName", + name: "EUI64 Invalid MAC", autoGen: true, - linkAddr: tcpip.LinkAddress(""), - secretKey: secretKey[:2], + linkAddr: "\x01\x02\x03", + shouldGen: false, }, { - name: "Invalid MAC", - nicName: "test", + name: "EUI64 Multicast MAC", autoGen: true, - linkAddr: tcpip.LinkAddress("\x01\x02\x03"), - secretKey: secretKey[:3], + linkAddr: "\x01\x02\x03\x04\x05\x06", + shouldGen: false, }, { - name: "Multicast MAC", - nicName: "test2", + name: "EUI64 Unspecified MAC", autoGen: true, - linkAddr: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"), - secretKey: secretKey[:4], + linkAddr: "\x00\x00\x00\x00\x00\x00", + shouldGen: false, }, + + // Tests for Opaque IID based addresses. { - name: "Unspecified MAC and nil SecretKey", + name: "OIID Enabled", + nicName: "nic1", + autoGen: true, + linkAddr: linkAddr1, + iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: nicNameFunc, + SecretKey: secretKey[:], + }, + shouldGen: true, + expectedAddr: header.LinkLocalAddrWithOpaqueIID("nic1", 0, secretKey[:]), + }, + // These are all cases where we would not have generated a + // link-local address if opaque IIDs were disabled. + { + name: "OIID Empty MAC and empty nicName", + autoGen: true, + iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: nicNameFunc, + SecretKey: secretKey[:1], + }, + shouldGen: true, + expectedAddr: header.LinkLocalAddrWithOpaqueIID("", 0, secretKey[:1]), + }, + { + name: "OIID Invalid MAC", + nicName: "test", + autoGen: true, + linkAddr: "\x01\x02\x03", + iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: nicNameFunc, + SecretKey: secretKey[:2], + }, + shouldGen: true, + expectedAddr: header.LinkLocalAddrWithOpaqueIID("test", 0, secretKey[:2]), + }, + { + name: "OIID Multicast MAC", + nicName: "test2", + autoGen: true, + linkAddr: "\x01\x02\x03\x04\x05\x06", + iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: nicNameFunc, + SecretKey: secretKey[:3], + }, + shouldGen: true, + expectedAddr: header.LinkLocalAddrWithOpaqueIID("test2", 0, secretKey[:3]), + }, + { + name: "OIID Unspecified MAC and nil SecretKey", nicName: "test3", autoGen: true, - linkAddr: tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"), + linkAddr: "\x00\x00\x00\x00\x00\x00", + iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: nicNameFunc, + }, + shouldGen: true, + expectedAddr: header.LinkLocalAddrWithOpaqueIID("test3", 0, nil), }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ - NICNameFromID: func(_ tcpip.NICID, nicName string) string { - return nicName - }, - SecretKey: test.secretKey, - }, + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } - - if test.autoGen { - // Only set opts.AutoGenIPv6LinkLocal when - // test.autoGen is true because - // opts.AutoGenIPv6LinkLocal should be false by - // default. - opts.AutoGenIPv6LinkLocal = true + opts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + AutoGenIPv6LinkLocal: test.autoGen, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: test.iidOpts, } - e := channel.New(10, 1280, test.linkAddr) + e := channel.New(0, 1280, test.linkAddr) s := stack.New(opts) nicOpts := stack.NICOptions{Name: test.nicName} if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err) } - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err) - } + var expectedMainAddr tcpip.AddressWithPrefix + + if test.shouldGen { + expectedMainAddr = tcpip.AddressWithPrefix{ + Address: test.expectedAddr, + PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen, + } - if test.autoGen { - // Should have auto-generated an address and - // resolved immediately (DAD is disabled). - if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID(test.nicName, 0, test.secretKey), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want) + // Should have auto-generated an address and resolved immediately (DAD + // is disabled). + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, expectedMainAddr, newAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") } } else { // Should not have auto-generated an address. - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly auto-generated an address") + default: } } + + gotMainAddr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err) + } + if gotMainAddr != expectedMainAddr { + t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", gotMainAddr, expectedMainAddr) + } }) } } @@ -2226,7 +2195,7 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { NDPDisp: &ndpDisp, } - e := channel.New(10, 1280, linkAddr1) + e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1) s := stack.New(opts) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(_) = %s", err) -- cgit v1.2.3 From ca30dfa065f5458228b06dcf5379ed4edf29c165 Mon Sep 17 00:00:00 2001 From: Ghanan Gowripalan Date: Thu, 6 Feb 2020 19:49:30 -0800 Subject: Send DAD event when DAD resolves immediately Previously, a DAD event would not be sent if DAD was disabled. This allows integrators to do some work when an IPv6 address is bound to a NIC without special logic that checks if DAD is enabled. Without this change, integrators would need to check if a NIC has DAD enabled when an address is auto-generated. If DAD is enabled, it would need to delay the work until the DAD completion event; otherwise, it would need to do the work in the address auto-generated event handler. Test: stack_test.TestDADDisabled PiperOrigin-RevId: 293732914 --- pkg/tcpip/stack/ndp.go | 7 ++ pkg/tcpip/stack/ndp_test.go | 263 +++++++++++++++++++++--------------------- pkg/tcpip/stack/stack_test.go | 44 +++---- 3 files changed, 154 insertions(+), 160 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index fae5f5014..045409bda 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -448,6 +448,13 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref remaining := ndp.configs.DupAddrDetectTransmits if remaining == 0 { ref.setKind(permanent) + + // Consider DAD to have resolved even if no DAD messages were actually + // transmitted. + if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, true, nil) + } + return nil } diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index e13509fbd..1f6f77439 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -86,39 +86,6 @@ func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWi return prefix, subnet, addrForSubnet(subnet, linkAddr) } -// TestDADDisabled tests that an address successfully resolves immediately -// when DAD is not enabled (the default for an empty stack.Options). -func TestDADDisabled(t *testing.T) { - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - } - - e := channel.New(0, 1280, linkAddr1) - s := stack.New(opts) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) - } - - if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err) - } - - // Should get the address immediately since we should not have performed - // DAD on it. - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err) - } - if addr.Address != addr1 { - t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1) - } - - // We should not have sent any NDP NS messages. - if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 { - t.Fatalf("got NeighborSolicit = %d, want = 0", got) - } -} - // ndpDADEvent is a set of parameters that was passed to // ndpDispatcher.OnDuplicateAddressDetectionStatus. type ndpDADEvent struct { @@ -300,6 +267,58 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s } } +// Check e to make sure that the event is for addr on nic with ID 1, and the +// resolved flag set to resolved with the specified err. +func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string { + return cmp.Diff(ndpDADEvent{nicID: nicID, addr: addr, resolved: resolved, err: err}, e, cmp.AllowUnexported(e)) +} + +// TestDADDisabled tests that an address successfully resolves immediately +// when DAD is not enabled (the default for an empty stack.Options). +func TestDADDisabled(t *testing.T) { + const nicID = 1 + ndpDisp := ndpDispatcher{ + dadC: make(chan ndpDADEvent, 1), + } + opts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NDPDisp: &ndpDisp, + } + + e := channel.New(0, 1280, linkAddr1) + s := stack.New(opts) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + + if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) + } + + // Should get the address immediately since we should not have performed + // DAD on it. + select { + case e := <-ndpDisp.dadC: + if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected DAD event") + } + addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("stack.GetMainNICAddress(%d, %d) err = %s", nicID, header.IPv6ProtocolNumber, err) + } + if addr.Address != addr1 { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1) + } + + // We should not have sent any NDP NS messages. + if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 { + t.Fatalf("got NeighborSolicit = %d, want = 0", got) + } +} + // TestDADResolve tests that an address successfully resolves after performing // DAD for various values of DupAddrDetectTransmits and RetransmitTimer. // Included in the subtests is a test to make sure that an invalid @@ -381,17 +400,8 @@ func TestDADResolve(t *testing.T) { // means something is wrong. t.Fatal("timed out waiting for DAD resolution") case e := <-ndpDisp.dadC: - if e.err != nil { - t.Fatal("got DAD error: ", e.err) - } - if e.nicID != nicID { - t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID) - } - if e.addr != addr1 { - t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1) - } - if !e.resolved { - t.Fatal("got DAD event w/ resolved = false, want = true") + if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) @@ -445,6 +455,8 @@ func TestDADResolve(t *testing.T) { // a node doing DAD for the same address), or if another node is detected to own // the address already (receive an NA message for the tentative address). func TestDADFail(t *testing.T) { + const nicID = 1 + tests := []struct { name string makeBuf func(tgt tcpip.Address) buffer.Prependable @@ -526,22 +538,22 @@ func TestDADFail(t *testing.T) { e := channel.New(0, 1280, linkAddr1) s := stack.New(opts) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err) + if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) } // Address should not be considered bound to the NIC yet // (DAD ongoing). - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } // Receive a packet to simulate multiple nodes owning or @@ -565,25 +577,16 @@ func TestDADFail(t *testing.T) { // something is wrong. t.Fatal("timed out waiting for DAD failure") case e := <-ndpDisp.dadC: - if e.err != nil { - t.Fatal("got DAD error: ", e.err) - } - if e.nicID != 1 { - t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID) - } - if e.addr != addr1 { - t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1) - } - if e.resolved { - t.Fatal("got DAD event w/ resolved = true, want = false") + if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } }) } @@ -592,6 +595,8 @@ func TestDADFail(t *testing.T) { // TestDADStop tests to make sure that the DAD process stops when an address is // removed. func TestDADStop(t *testing.T) { + const nicID = 1 + ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent, 1), } @@ -607,26 +612,26 @@ func TestDADStop(t *testing.T) { e := channel.New(0, 1280, linkAddr1) s := stack.New(opts) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err) + if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) } // Address should not be considered bound to the NIC yet (DAD ongoing). - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } // Remove the address. This should stop DAD. - if err := s.RemoveAddress(1, addr1); err != nil { - t.Fatalf("RemoveAddress(_, %s) = %s", addr1, err) + if err := s.RemoveAddress(nicID, addr1); err != nil { + t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err) } // Wait for DAD to fail (since the address was removed during DAD). @@ -636,26 +641,16 @@ func TestDADStop(t *testing.T) { // time + extra 1s buffer, something is wrong. t.Fatal("timed out waiting for DAD failure") case e := <-ndpDisp.dadC: - if e.err != nil { - t.Fatal("got DAD error: ", e.err) - } - if e.nicID != 1 { - t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID) - } - if e.addr != addr1 { - t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1) + if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - if e.resolved { - t.Fatal("got DAD event w/ resolved = true, want = false") - } - } - addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } // Should not have sent more than 1 NS message. @@ -681,6 +676,10 @@ func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) { // configurations without affecting the default NDP configurations or other // interfaces' configurations. func TestSetNDPConfigurations(t *testing.T) { + const nicID1 = 1 + const nicID2 = 2 + const nicID3 = 3 + tests := []struct { name string dupAddrDetectTransmits uint8 @@ -704,7 +703,7 @@ func TestSetNDPConfigurations(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { ndpDisp := ndpDispatcher{ - dadC: make(chan ndpDADEvent), + dadC: make(chan ndpDADEvent, 1), } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ @@ -712,17 +711,28 @@ func TestSetNDPConfigurations(t *testing.T) { NDPDisp: &ndpDisp, }) + expectDADEvent := func(nicID tcpip.NICID, addr tcpip.Address) { + select { + case e := <-ndpDisp.dadC: + if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatalf("expected DAD event for %s", addr) + } + } + // This NIC(1)'s NDP configurations will be updated to // be different from the default. - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) + if err := s.CreateNIC(nicID1, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err) } // Created before updating NIC(1)'s NDP configurations // but updating NIC(1)'s NDP configurations should not // affect other existing NICs. - if err := s.CreateNIC(2, e); err != nil { - t.Fatalf("CreateNIC(2) = %s", err) + if err := s.CreateNIC(nicID2, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err) } // Update the NDP configurations on NIC(1) to use DAD. @@ -730,36 +740,38 @@ func TestSetNDPConfigurations(t *testing.T) { DupAddrDetectTransmits: test.dupAddrDetectTransmits, RetransmitTimer: test.retransmitTimer, } - if err := s.SetNDPConfigurations(1, configs); err != nil { - t.Fatalf("got SetNDPConfigurations(1, _) = %s", err) + if err := s.SetNDPConfigurations(nicID1, configs); err != nil { + t.Fatalf("got SetNDPConfigurations(%d, _) = %s", nicID1, err) } // Created after updating NIC(1)'s NDP configurations // but the stack's default NDP configurations should not // have been updated. - if err := s.CreateNIC(3, e); err != nil { - t.Fatalf("CreateNIC(3) = %s", err) + if err := s.CreateNIC(nicID3, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID3, err) } // Add addresses for each NIC. - if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(1, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err) + if err := s.AddAddress(nicID1, header.IPv6ProtocolNumber, addr1); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addr1, err) } - if err := s.AddAddress(2, header.IPv6ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(2, %d, %s) = %s", header.IPv6ProtocolNumber, addr2, err) + if err := s.AddAddress(nicID2, header.IPv6ProtocolNumber, addr2); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addr2, err) } - if err := s.AddAddress(3, header.IPv6ProtocolNumber, addr3); err != nil { - t.Fatalf("AddAddress(3, %d, %s) = %s", header.IPv6ProtocolNumber, addr3, err) + expectDADEvent(nicID2, addr2) + if err := s.AddAddress(nicID3, header.IPv6ProtocolNumber, addr3); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addr3, err) } + expectDADEvent(nicID3, addr3) // Address should not be considered bound to NIC(1) yet // (DAD ongoing). - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err := s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want) } // Should get the address on NIC(2) and NIC(3) @@ -767,31 +779,31 @@ func TestSetNDPConfigurations(t *testing.T) { // it as the stack was configured to not do DAD by // default and we only updated the NDP configurations on // NIC(1). - addr, err = s.GetMainNICAddress(2, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID2, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("stack.GetMainNICAddress(2, _) err = %s", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID2, header.IPv6ProtocolNumber, err) } if addr.Address != addr2 { - t.Fatalf("got stack.GetMainNICAddress(2, _) = %s, want = %s", addr, addr2) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID2, header.IPv6ProtocolNumber, addr, addr2) } - addr, err = s.GetMainNICAddress(3, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID3, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("stack.GetMainNICAddress(3, _) err = %s", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID3, header.IPv6ProtocolNumber, err) } if addr.Address != addr3 { - t.Fatalf("got stack.GetMainNICAddress(3, _) = %s, want = %s", addr, addr3) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID3, header.IPv6ProtocolNumber, addr, addr3) } // Sleep until right (500ms before) before resolution to // make sure the address didn't resolve on NIC(1) yet. const delta = 500 * time.Millisecond time.Sleep(time.Duration(test.dupAddrDetectTransmits)*test.expectedRetransmitTimer - delta) - addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want) } // Wait for DAD to resolve. @@ -805,25 +817,16 @@ func TestSetNDPConfigurations(t *testing.T) { // means something is wrong. t.Fatal("timed out waiting for DAD resolution") case e := <-ndpDisp.dadC: - if e.err != nil { - t.Fatal("got DAD error: ", e.err) - } - if e.nicID != 1 { - t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID) - } - if e.addr != addr1 { - t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1) - } - if !e.resolved { - t.Fatal("got DAD event w/ resolved = false, want = true") + if diff := checkDADEvent(e, nicID1, addr1, true, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("stack.GetMainNICAddress(1, _) err = %s", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err) } if addr.Address != addr1 { - t.Fatalf("got stack.GetMainNICAddress(1, _) = %s, want = %s", addr, addr1) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID1, header.IPv6ProtocolNumber, addr, addr1) } }) } diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b2c1763bf..24133e6f2 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -2184,6 +2184,8 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) { // TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6 // link-local addresses will only be assigned after the DAD process resolves. func TestNICAutoGenAddrDoesDAD(t *testing.T) { + const nicID = 1 + ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent), } @@ -2197,18 +2199,18 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1) s := stack.New(opts) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } // Address should not be considered bound to the // NIC yet (DAD ongoing). - addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } linkLocalAddr := header.LinkLocalAddr(linkAddr1) @@ -2222,25 +2224,16 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { // means something is wrong. t.Fatal("timed out waiting for DAD resolution") case e := <-ndpDisp.dadC: - if e.err != nil { - t.Fatal("got DAD error: ", e.err) - } - if e.nicID != 1 { - t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID) - } - if e.addr != linkLocalAddr { - t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr) - } - if !e.resolved { - t.Fatal("got DAD event w/ resolved = false, want = true") + if diff := checkDADEvent(e, nicID, linkLocalAddr, true, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber) + addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) if err != nil { - t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) } if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want) + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } } @@ -2606,17 +2599,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) { case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout): t.Fatal("timed out waiting for DAD resolution") case e := <-ndpDisp.dadC: - if e.err != nil { - t.Fatal("got DAD error: ", e.err) - } - if e.nicID != nicID { - t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID) - } - if e.addr != addr.AddressWithPrefix.Address { - t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr.AddressWithPrefix.Address) - } - if !e.resolved { - t.Fatal("got DAD event w/ resolved = false, want = true") + if diff := checkDADEvent(e, nicID, addr.AddressWithPrefix.Address, true, nil); diff != "" { + t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) { -- cgit v1.2.3 From c141eb5f430dc50f6bf90232c369b7b3a542155e Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 7 Feb 2020 13:47:57 -0800 Subject: Address GH comments. --- pkg/abi/linux/netfilter.go | 2 +- pkg/sentry/socket/netfilter/extensions.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg') diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index e4aabb6bb..7363185b7 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -363,7 +363,7 @@ type XTTCP struct { // range to which the matcher applies. DestinationPortStart uint16 - // DestinationPortEnd specifies the start of the destination port + // DestinationPortEnd specifies the end of the destination port // range to which the matcher applies. DestinationPortEnd uint16 diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index b5fbb52e4..3082976cd 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- cgit v1.2.3 From e1587a28876f8aac689a2cd1b7630f1637655b58 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 7 Feb 2020 14:00:50 -0800 Subject: Log level, optname, optval and optlen in getsockopt/setsockopt in strace. Log 8, 16, and 32 int optvals and dump the memory of other sizes. Updates #1782 PiperOrigin-RevId: 293889388 --- pkg/sentry/strace/linux64_amd64.go | 4 +- pkg/sentry/strace/socket.go | 215 +++++++++++++++++++++++++++++++++++++ pkg/sentry/strace/strace.go | 22 +++- pkg/sentry/strace/syscalls.go | 22 +++- 4 files changed, 258 insertions(+), 5 deletions(-) (limited to 'pkg') diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index 85ec66fd3..a4de545e9 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -78,8 +78,8 @@ var linuxAMD64 = SyscallMap{ 51: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen), 52: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen), 53: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex), - 54: makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex), - 55: makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex), + 54: makeSyscallInfo("setsockopt", FD, SockOptLevel, SockOptName, SetSockOptVal, Hex /* length by value, not a pointer */), + 55: makeSyscallInfo("getsockopt", FD, SockOptLevel, SockOptName, GetSockOptVal, SockLen), 56: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex), 57: makeSyscallInfo("fork"), 58: makeSyscallInfo("vfork"), diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index d2079c85f..f7ff4573e 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -419,3 +419,218 @@ func sockFlags(flags int32) string { } return SocketFlagSet.Parse(uint64(flags)) } + +func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen usermem.Addr, maximumBlobSize uint, rval uintptr) string { + if int(rval) < 0 { + return hexNum(uint64(optVal)) + } + if optVal == 0 { + return "null" + } + l, err := copySockLen(t, optLen) + if err != nil { + return fmt.Sprintf("%#x {error reading length: %v}", optLen, err) + } + return sockOptVal(t, level, optname, optVal, uint64(l), maximumBlobSize) +} + +func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string { + switch optLen { + case 1: + var v uint8 + _, err := t.CopyIn(optVal, &v) + if err != nil { + return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) + } + return fmt.Sprintf("%#x {value=%v}", optVal, v) + case 2: + var v uint16 + _, err := t.CopyIn(optVal, &v) + if err != nil { + return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) + } + return fmt.Sprintf("%#x {value=%v}", optVal, v) + case 4: + var v uint32 + _, err := t.CopyIn(optVal, &v) + if err != nil { + return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) + } + return fmt.Sprintf("%#x {value=%v}", optVal, v) + default: + return dump(t, optVal, uint(optLen), maximumBlobSize) + } +} + +var sockOptLevels = abi.ValueSet{ + linux.SOL_IP: "SOL_IP", + linux.SOL_SOCKET: "SOL_SOCKET", + linux.SOL_TCP: "SOL_TCP", + linux.SOL_UDP: "SOL_UDP", + linux.SOL_IPV6: "SOL_IPV6", + linux.SOL_ICMPV6: "SOL_ICMPV6", + linux.SOL_RAW: "SOL_RAW", + linux.SOL_PACKET: "SOL_PACKET", + linux.SOL_NETLINK: "SOL_NETLINK", +} + +var sockOptNames = map[uint64]abi.ValueSet{ + linux.SOL_IP: { + linux.IP_TTL: "IP_TTL", + linux.IP_MULTICAST_TTL: "IP_MULTICAST_TTL", + linux.IP_MULTICAST_IF: "IP_MULTICAST_IF", + linux.IP_MULTICAST_LOOP: "IP_MULTICAST_LOOP", + linux.IP_TOS: "IP_TOS", + linux.IP_RECVTOS: "IP_RECVTOS", + linux.IPT_SO_GET_INFO: "IPT_SO_GET_INFO", + linux.IPT_SO_GET_ENTRIES: "IPT_SO_GET_ENTRIES", + linux.IP_ADD_MEMBERSHIP: "IP_ADD_MEMBERSHIP", + linux.IP_DROP_MEMBERSHIP: "IP_DROP_MEMBERSHIP", + linux.MCAST_JOIN_GROUP: "MCAST_JOIN_GROUP", + linux.IP_ADD_SOURCE_MEMBERSHIP: "IP_ADD_SOURCE_MEMBERSHIP", + linux.IP_BIND_ADDRESS_NO_PORT: "IP_BIND_ADDRESS_NO_PORT", + linux.IP_BLOCK_SOURCE: "IP_BLOCK_SOURCE", + linux.IP_CHECKSUM: "IP_CHECKSUM", + linux.IP_DROP_SOURCE_MEMBERSHIP: "IP_DROP_SOURCE_MEMBERSHIP", + linux.IP_FREEBIND: "IP_FREEBIND", + linux.IP_HDRINCL: "IP_HDRINCL", + linux.IP_IPSEC_POLICY: "IP_IPSEC_POLICY", + linux.IP_MINTTL: "IP_MINTTL", + linux.IP_MSFILTER: "IP_MSFILTER", + linux.IP_MTU_DISCOVER: "IP_MTU_DISCOVER", + linux.IP_MULTICAST_ALL: "IP_MULTICAST_ALL", + linux.IP_NODEFRAG: "IP_NODEFRAG", + linux.IP_OPTIONS: "IP_OPTIONS", + linux.IP_PASSSEC: "IP_PASSSEC", + linux.IP_PKTINFO: "IP_PKTINFO", + linux.IP_RECVERR: "IP_RECVERR", + linux.IP_RECVFRAGSIZE: "IP_RECVFRAGSIZE", + linux.IP_RECVOPTS: "IP_RECVOPTS", + linux.IP_RECVORIGDSTADDR: "IP_RECVORIGDSTADDR", + linux.IP_RECVTTL: "IP_RECVTTL", + linux.IP_RETOPTS: "IP_RETOPTS", + linux.IP_TRANSPARENT: "IP_TRANSPARENT", + linux.IP_UNBLOCK_SOURCE: "IP_UNBLOCK_SOURCE", + linux.IP_UNICAST_IF: "IP_UNICAST_IF", + linux.IP_XFRM_POLICY: "IP_XFRM_POLICY", + linux.MCAST_BLOCK_SOURCE: "MCAST_BLOCK_SOURCE", + linux.MCAST_JOIN_SOURCE_GROUP: "MCAST_JOIN_SOURCE_GROUP", + linux.MCAST_LEAVE_GROUP: "MCAST_LEAVE_GROUP", + linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP", + linux.MCAST_MSFILTER: "MCAST_MSFILTER", + linux.MCAST_UNBLOCK_SOURCE: "MCAST_UNBLOCK_SOURCE", + linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT", + linux.IP_PKTOPTIONS: "IP_PKTOPTIONS", + linux.IP_MTU: "IP_MTU", + }, + linux.SOL_SOCKET: { + linux.SO_ERROR: "SO_ERROR", + linux.SO_PEERCRED: "SO_PEERCRED", + linux.SO_PASSCRED: "SO_PASSCRED", + linux.SO_SNDBUF: "SO_SNDBUF", + linux.SO_RCVBUF: "SO_RCVBUF", + linux.SO_REUSEADDR: "SO_REUSEADDR", + linux.SO_REUSEPORT: "SO_REUSEPORT", + linux.SO_BINDTODEVICE: "SO_BINDTODEVICE", + linux.SO_BROADCAST: "SO_BROADCAST", + linux.SO_KEEPALIVE: "SO_KEEPALIVE", + linux.SO_LINGER: "SO_LINGER", + linux.SO_SNDTIMEO: "SO_SNDTIMEO", + linux.SO_RCVTIMEO: "SO_RCVTIMEO", + linux.SO_OOBINLINE: "SO_OOBINLINE", + linux.SO_TIMESTAMP: "SO_TIMESTAMP", + }, + linux.SOL_TCP: { + linux.TCP_NODELAY: "TCP_NODELAY", + linux.TCP_CORK: "TCP_CORK", + linux.TCP_QUICKACK: "TCP_QUICKACK", + linux.TCP_MAXSEG: "TCP_MAXSEG", + linux.TCP_KEEPIDLE: "TCP_KEEPIDLE", + linux.TCP_KEEPINTVL: "TCP_KEEPINTVL", + linux.TCP_USER_TIMEOUT: "TCP_USER_TIMEOUT", + linux.TCP_INFO: "TCP_INFO", + linux.TCP_CC_INFO: "TCP_CC_INFO", + linux.TCP_NOTSENT_LOWAT: "TCP_NOTSENT_LOWAT", + linux.TCP_ZEROCOPY_RECEIVE: "TCP_ZEROCOPY_RECEIVE", + linux.TCP_CONGESTION: "TCP_CONGESTION", + linux.TCP_LINGER2: "TCP_LINGER2", + linux.TCP_DEFER_ACCEPT: "TCP_DEFER_ACCEPT", + linux.TCP_REPAIR_OPTIONS: "TCP_REPAIR_OPTIONS", + linux.TCP_INQ: "TCP_INQ", + linux.TCP_FASTOPEN: "TCP_FASTOPEN", + linux.TCP_FASTOPEN_CONNECT: "TCP_FASTOPEN_CONNECT", + linux.TCP_FASTOPEN_KEY: "TCP_FASTOPEN_KEY", + linux.TCP_FASTOPEN_NO_COOKIE: "TCP_FASTOPEN_NO_COOKIE", + linux.TCP_KEEPCNT: "TCP_KEEPCNT", + linux.TCP_QUEUE_SEQ: "TCP_QUEUE_SEQ", + linux.TCP_REPAIR: "TCP_REPAIR", + linux.TCP_REPAIR_QUEUE: "TCP_REPAIR_QUEUE", + linux.TCP_REPAIR_WINDOW: "TCP_REPAIR_WINDOW", + linux.TCP_SAVED_SYN: "TCP_SAVED_SYN", + linux.TCP_SAVE_SYN: "TCP_SAVE_SYN", + linux.TCP_SYNCNT: "TCP_SYNCNT", + linux.TCP_THIN_DUPACK: "TCP_THIN_DUPACK", + linux.TCP_THIN_LINEAR_TIMEOUTS: "TCP_THIN_LINEAR_TIMEOUTS", + linux.TCP_TIMESTAMP: "TCP_TIMESTAMP", + linux.TCP_ULP: "TCP_ULP", + linux.TCP_WINDOW_CLAMP: "TCP_WINDOW_CLAMP", + }, + linux.SOL_IPV6: { + linux.IPV6_V6ONLY: "IPV6_V6ONLY", + linux.IPV6_PATHMTU: "IPV6_PATHMTU", + linux.IPV6_TCLASS: "IPV6_TCLASS", + linux.IPV6_ADD_MEMBERSHIP: "IPV6_ADD_MEMBERSHIP", + linux.IPV6_DROP_MEMBERSHIP: "IPV6_DROP_MEMBERSHIP", + linux.IPV6_IPSEC_POLICY: "IPV6_IPSEC_POLICY", + linux.IPV6_JOIN_ANYCAST: "IPV6_JOIN_ANYCAST", + linux.IPV6_LEAVE_ANYCAST: "IPV6_LEAVE_ANYCAST", + linux.IPV6_PKTINFO: "IPV6_PKTINFO", + linux.IPV6_ROUTER_ALERT: "IPV6_ROUTER_ALERT", + linux.IPV6_XFRM_POLICY: "IPV6_XFRM_POLICY", + linux.MCAST_BLOCK_SOURCE: "MCAST_BLOCK_SOURCE", + linux.MCAST_JOIN_GROUP: "MCAST_JOIN_GROUP", + linux.MCAST_JOIN_SOURCE_GROUP: "MCAST_JOIN_SOURCE_GROUP", + linux.MCAST_LEAVE_GROUP: "MCAST_LEAVE_GROUP", + linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP", + linux.MCAST_UNBLOCK_SOURCE: "MCAST_UNBLOCK_SOURCE", + linux.IPV6_2292DSTOPTS: "IPV6_2292DSTOPTS", + linux.IPV6_2292HOPLIMIT: "IPV6_2292HOPLIMIT", + linux.IPV6_2292HOPOPTS: "IPV6_2292HOPOPTS", + linux.IPV6_2292PKTINFO: "IPV6_2292PKTINFO", + linux.IPV6_2292PKTOPTIONS: "IPV6_2292PKTOPTIONS", + linux.IPV6_2292RTHDR: "IPV6_2292RTHDR", + linux.IPV6_ADDR_PREFERENCES: "IPV6_ADDR_PREFERENCES", + linux.IPV6_AUTOFLOWLABEL: "IPV6_AUTOFLOWLABEL", + linux.IPV6_DONTFRAG: "IPV6_DONTFRAG", + linux.IPV6_DSTOPTS: "IPV6_DSTOPTS", + linux.IPV6_FLOWINFO: "IPV6_FLOWINFO", + linux.IPV6_FLOWINFO_SEND: "IPV6_FLOWINFO_SEND", + linux.IPV6_FLOWLABEL_MGR: "IPV6_FLOWLABEL_MGR", + linux.IPV6_FREEBIND: "IPV6_FREEBIND", + linux.IPV6_HOPOPTS: "IPV6_HOPOPTS", + linux.IPV6_MINHOPCOUNT: "IPV6_MINHOPCOUNT", + linux.IPV6_MTU: "IPV6_MTU", + linux.IPV6_MTU_DISCOVER: "IPV6_MTU_DISCOVER", + linux.IPV6_MULTICAST_ALL: "IPV6_MULTICAST_ALL", + linux.IPV6_MULTICAST_HOPS: "IPV6_MULTICAST_HOPS", + linux.IPV6_MULTICAST_IF: "IPV6_MULTICAST_IF", + linux.IPV6_MULTICAST_LOOP: "IPV6_MULTICAST_LOOP", + linux.IPV6_RECVDSTOPTS: "IPV6_RECVDSTOPTS", + linux.IPV6_RECVERR: "IPV6_RECVERR", + linux.IPV6_RECVFRAGSIZE: "IPV6_RECVFRAGSIZE", + linux.IPV6_RECVHOPLIMIT: "IPV6_RECVHOPLIMIT", + linux.IPV6_RECVHOPOPTS: "IPV6_RECVHOPOPTS", + linux.IPV6_RECVORIGDSTADDR: "IPV6_RECVORIGDSTADDR", + linux.IPV6_RECVPATHMTU: "IPV6_RECVPATHMTU", + linux.IPV6_RECVPKTINFO: "IPV6_RECVPKTINFO", + linux.IPV6_RECVRTHDR: "IPV6_RECVRTHDR", + linux.IPV6_RECVTCLASS: "IPV6_RECVTCLASS", + linux.IPV6_RTHDR: "IPV6_RTHDR", + linux.IPV6_RTHDRDSTOPTS: "IPV6_RTHDRDSTOPTS", + linux.IPV6_TRANSPARENT: "IPV6_TRANSPARENT", + linux.IPV6_UNICAST_HOPS: "IPV6_UNICAST_HOPS", + linux.IPV6_UNICAST_IF: "IPV6_UNICAST_IF", + linux.MCAST_MSFILTER: "MCAST_MSFILTER", + linux.IPV6_ADDRFORM: "IPV6_ADDRFORM", + }, +} diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 3fc4a47fc..a796b2396 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -55,6 +55,14 @@ var ItimerTypes = abi.ValueSet{ linux.ITIMER_PROF: "ITIMER_PROF", } +func hexNum(num uint64) string { + return "0x" + strconv.FormatUint(num, 16) +} + +func hexArg(arg arch.SyscallArgument) string { + return hexNum(arg.Uint64()) +} + func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr) @@ -389,6 +397,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, path(t, args[arg].Pointer())) case ExecveStringVector: output = append(output, stringVector(t, args[arg].Pointer())) + case SetSockOptVal: + output = append(output, sockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Uint64() /* optLen */, maximumBlobSize)) + case SockOptLevel: + output = append(output, sockOptLevels.Parse(args[arg].Uint64())) + case SockOptName: + output = append(output, sockOptNames[args[arg-1].Uint64() /* level */].Parse(args[arg].Uint64())) case SockAddr: output = append(output, sockAddr(t, args[arg].Pointer(), uint32(args[arg+1].Uint64()))) case SockLen: @@ -446,7 +460,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo case Hex: fallthrough default: - output = append(output, "0x"+strconv.FormatUint(args[arg].Uint64(), 16)) + output = append(output, hexArg(args[arg])) } } @@ -507,6 +521,12 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer()) case PollFDs: output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true) + case GetSockOptVal: + output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval) + case SetSockOptVal: + // No need to print the value again. While it usually + // isn't, the string version of this arg can be long. + output[arg] = hexArg(args[arg]) } } } diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 24e29a2ba..446d1e0f6 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -207,9 +207,27 @@ const ( // array is in the next argument. PollFDs - // SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of - // fds represented must be the first argument. + // SelectFDSet is an fd_set argument in select(2)/pselect(2). The + // number of FDs represented must be the first argument. SelectFDSet + + // GetSockOptVal is the optval argument in getsockopt(2). + // + // Formatted after syscall execution. + GetSockOptVal + + // SetSockOptVal is the optval argument in setsockopt(2). + // + // Contents omitted after syscall execution. + SetSockOptVal + + // SockOptLevel is the level argument in getsockopt(2) and + // setsockopt(2). + SockOptLevel + + // SockOptLevel is the optname argument in getsockopt(2) and + // setsockopt(2). + SockOptName ) // defaultFormat is the syscall argument format to use if the actual format is -- cgit v1.2.3 From 17b9f5e66238bde1e4ed3bd9e5fb67342c8b58ec Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 7 Feb 2020 14:46:24 -0800 Subject: Support listxattr and removexattr syscalls. Note that these are only implemented for tmpfs, and other impls will still return EOPNOTSUPP. PiperOrigin-RevId: 293899385 --- pkg/p9/client_file.go | 33 ++++ pkg/p9/file.go | 16 ++ pkg/p9/handlers.go | 33 ++++ pkg/p9/messages.go | 199 +++++++++++++++---- pkg/p9/p9.go | 4 + pkg/p9/version.go | 8 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 20 +- pkg/sentry/fs/gofer/context_file.go | 14 ++ pkg/sentry/fs/gofer/inode.go | 13 +- pkg/sentry/fs/inode.go | 14 +- pkg/sentry/fs/inode_operations.go | 13 +- pkg/sentry/fs/inode_overlay.go | 18 +- pkg/sentry/fs/tmpfs/tmpfs.go | 9 +- pkg/sentry/syscalls/linux/linux64_amd64.go | 27 ++- pkg/sentry/syscalls/linux/linux64_arm64.go | 37 ++-- pkg/sentry/syscalls/linux/sys_xattr.go | 200 +++++++++++++++++++- runsc/fsgofer/fsgofer.go | 16 +- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/xattr.cc | 294 ++++++++++++++++------------- 20 files changed, 733 insertions(+), 238 deletions(-) (limited to 'pkg') diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 0254e4ccc..2ee07b664 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -194,6 +194,39 @@ func (c *clientFile) SetXattr(name, value string, flags uint32) error { return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{}) } +// ListXattr implements File.ListXattr. +func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) { + if atomic.LoadUint32(&c.closed) != 0 { + return nil, syscall.EBADF + } + if !versionSupportsListRemoveXattr(c.client.version) { + return nil, syscall.EOPNOTSUPP + } + + rlistxattr := Rlistxattr{} + if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil { + return nil, err + } + + xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs)) + for _, x := range rlistxattr.Xattrs { + xattrs[x] = struct{}{} + } + return xattrs, nil +} + +// RemoveXattr implements File.RemoveXattr. +func (c *clientFile) RemoveXattr(name string) error { + if atomic.LoadUint32(&c.closed) != 0 { + return syscall.EBADF + } + if !versionSupportsListRemoveXattr(c.client.version) { + return syscall.EOPNOTSUPP + } + + return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{}) +} + // Allocate implements File.Allocate. func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error { if atomic.LoadUint32(&c.closed) != 0 { diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 4607cfcdf..d4ffbc8e3 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -105,6 +105,22 @@ type File interface { // TODO(b/127675828): Determine concurrency guarantees once implemented. SetXattr(name, value string, flags uint32) error + // ListXattr lists the names of the extended attributes on this node. + // + // Size indicates the size of the buffer that has been allocated to hold the + // attribute list. If the list would be larger than size, implementations may + // return ERANGE to indicate that the buffer is too small, but they are also + // free to ignore the hint entirely (i.e. the value returned may be larger + // than size). All size checking is done independently at the syscall layer. + // + // TODO(b/148303075): Determine concurrency guarantees once implemented. + ListXattr(size uint64) (map[string]struct{}, error) + + // RemoveXattr removes extended attributes on this node. + // + // TODO(b/148303075): Determine concurrency guarantees once implemented. + RemoveXattr(name string) error + // Allocate allows the caller to directly manipulate the allocated disk space // for the file. See fallocate(2) for more details. Allocate(mode AllocateMode, offset, length uint64) error diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 7d6653a07..2ac45eb80 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -941,6 +941,39 @@ func (t *Tsetxattr) handle(cs *connState) message { return &Rsetxattr{} } +// handle implements handler.handle. +func (t *Tlistxattr) handle(cs *connState) message { + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) + } + defer ref.DecRef() + + xattrs, err := ref.file.ListXattr(t.Size) + if err != nil { + return newErr(err) + } + xattrList := make([]string, 0, len(xattrs)) + for x := range xattrs { + xattrList = append(xattrList, x) + } + return &Rlistxattr{Xattrs: xattrList} +} + +// handle implements handler.handle. +func (t *Tremovexattr) handle(cs *connState) message { + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) + } + defer ref.DecRef() + + if err := ref.file.RemoveXattr(t.Name); err != nil { + return newErr(err) + } + return &Rremovexattr{} +} + // handle implements handler.handle. func (t *Treaddir) handle(cs *connState) message { ref, ok := cs.LookupFID(t.Directory) diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index ceb723d86..b1cede5f5 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -174,11 +174,11 @@ type Rflush struct { } // Decode implements encoder.Decode. -func (*Rflush) Decode(b *buffer) { +func (*Rflush) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rflush) Encode(b *buffer) { +func (*Rflush) Encode(*buffer) { } // Type implements message.Type. @@ -188,7 +188,7 @@ func (*Rflush) Type() MsgType { // String implements fmt.Stringer. func (r *Rflush) String() string { - return fmt.Sprintf("RFlush{}") + return "RFlush{}" } // Twalk is a walk request. @@ -300,11 +300,11 @@ type Rclunk struct { } // Decode implements encoder.Decode. -func (*Rclunk) Decode(b *buffer) { +func (*Rclunk) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rclunk) Encode(b *buffer) { +func (*Rclunk) Encode(*buffer) { } // Type implements message.Type. @@ -314,7 +314,7 @@ func (*Rclunk) Type() MsgType { // String implements fmt.Stringer. func (r *Rclunk) String() string { - return fmt.Sprintf("Rclunk{}") + return "Rclunk{}" } // Tremove is a remove request. @@ -350,11 +350,11 @@ type Rremove struct { } // Decode implements encoder.Decode. -func (*Rremove) Decode(b *buffer) { +func (*Rremove) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rremove) Encode(b *buffer) { +func (*Rremove) Encode(*buffer) { } // Type implements message.Type. @@ -364,7 +364,7 @@ func (*Rremove) Type() MsgType { // String implements fmt.Stringer. func (r *Rremove) String() string { - return fmt.Sprintf("Rremove{}") + return "Rremove{}" } // Rlerror is an error response. @@ -745,16 +745,16 @@ func (*Rlink) Type() MsgType { } // Decode implements encoder.Decode. -func (*Rlink) Decode(b *buffer) { +func (*Rlink) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rlink) Encode(b *buffer) { +func (*Rlink) Encode(*buffer) { } // String implements fmt.Stringer. func (r *Rlink) String() string { - return fmt.Sprintf("Rlink{}") + return "Rlink{}" } // Trenameat is a rename request. @@ -803,11 +803,11 @@ type Rrenameat struct { } // Decode implements encoder.Decode. -func (*Rrenameat) Decode(b *buffer) { +func (*Rrenameat) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rrenameat) Encode(b *buffer) { +func (*Rrenameat) Encode(*buffer) { } // Type implements message.Type. @@ -817,7 +817,7 @@ func (*Rrenameat) Type() MsgType { // String implements fmt.Stringer. func (r *Rrenameat) String() string { - return fmt.Sprintf("Rrenameat{}") + return "Rrenameat{}" } // Tunlinkat is an unlink request. @@ -861,11 +861,11 @@ type Runlinkat struct { } // Decode implements encoder.Decode. -func (*Runlinkat) Decode(b *buffer) { +func (*Runlinkat) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Runlinkat) Encode(b *buffer) { +func (*Runlinkat) Encode(*buffer) { } // Type implements message.Type. @@ -875,7 +875,7 @@ func (*Runlinkat) Type() MsgType { // String implements fmt.Stringer. func (r *Runlinkat) String() string { - return fmt.Sprintf("Runlinkat{}") + return "Runlinkat{}" } // Trename is a rename request. @@ -922,11 +922,11 @@ type Rrename struct { } // Decode implements encoder.Decode. -func (*Rrename) Decode(b *buffer) { +func (*Rrename) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rrename) Encode(b *buffer) { +func (*Rrename) Encode(*buffer) { } // Type implements message.Type. @@ -936,7 +936,7 @@ func (*Rrename) Type() MsgType { // String implements fmt.Stringer. func (r *Rrename) String() string { - return fmt.Sprintf("Rrename{}") + return "Rrename{}" } // Treadlink is a readlink request. @@ -1409,11 +1409,11 @@ type Rsetattr struct { } // Decode implements encoder.Decode. -func (*Rsetattr) Decode(b *buffer) { +func (*Rsetattr) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rsetattr) Encode(b *buffer) { +func (*Rsetattr) Encode(*buffer) { } // Type implements message.Type. @@ -1423,7 +1423,7 @@ func (*Rsetattr) Type() MsgType { // String implements fmt.Stringer. func (r *Rsetattr) String() string { - return fmt.Sprintf("Rsetattr{}") + return "Rsetattr{}" } // Tallocate is an allocate request. This is an extension to 9P protocol, not @@ -1466,11 +1466,11 @@ type Rallocate struct { } // Decode implements encoder.Decode. -func (*Rallocate) Decode(b *buffer) { +func (*Rallocate) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rallocate) Encode(b *buffer) { +func (*Rallocate) Encode(*buffer) { } // Type implements message.Type. @@ -1480,7 +1480,71 @@ func (*Rallocate) Type() MsgType { // String implements fmt.Stringer. func (r *Rallocate) String() string { - return fmt.Sprintf("Rallocate{}") + return "Rallocate{}" +} + +// Tlistxattr is a listxattr request. +type Tlistxattr struct { + // FID refers to the file on which to list xattrs. + FID FID + + // Size is the buffer size for the xattr list. + Size uint64 +} + +// Decode implements encoder.Decode. +func (t *Tlistxattr) Decode(b *buffer) { + t.FID = b.ReadFID() + t.Size = b.Read64() +} + +// Encode implements encoder.Encode. +func (t *Tlistxattr) Encode(b *buffer) { + b.WriteFID(t.FID) + b.Write64(t.Size) +} + +// Type implements message.Type. +func (*Tlistxattr) Type() MsgType { + return MsgTlistxattr +} + +// String implements fmt.Stringer. +func (t *Tlistxattr) String() string { + return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size) +} + +// Rlistxattr is a listxattr response. +type Rlistxattr struct { + // Xattrs is a list of extended attribute names. + Xattrs []string +} + +// Decode implements encoder.Decode. +func (r *Rlistxattr) Decode(b *buffer) { + n := b.Read16() + r.Xattrs = r.Xattrs[:0] + for i := 0; i < int(n); i++ { + r.Xattrs = append(r.Xattrs, b.ReadString()) + } +} + +// Encode implements encoder.Encode. +func (r *Rlistxattr) Encode(b *buffer) { + b.Write16(uint16(len(r.Xattrs))) + for _, x := range r.Xattrs { + b.WriteString(x) + } +} + +// Type implements message.Type. +func (*Rlistxattr) Type() MsgType { + return MsgRlistxattr +} + +// String implements fmt.Stringer. +func (r *Rlistxattr) String() string { + return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs) } // Txattrwalk walks extended attributes. @@ -1594,11 +1658,11 @@ type Rxattrcreate struct { } // Decode implements encoder.Decode. -func (r *Rxattrcreate) Decode(b *buffer) { +func (r *Rxattrcreate) Decode(*buffer) { } // Encode implements encoder.Encode. -func (r *Rxattrcreate) Encode(b *buffer) { +func (r *Rxattrcreate) Encode(*buffer) { } // Type implements message.Type. @@ -1608,7 +1672,7 @@ func (*Rxattrcreate) Type() MsgType { // String implements fmt.Stringer. func (r *Rxattrcreate) String() string { - return fmt.Sprintf("Rxattrcreate{}") + return "Rxattrcreate{}" } // Tgetxattr is a getxattr request. @@ -1719,11 +1783,11 @@ type Rsetxattr struct { } // Decode implements encoder.Decode. -func (r *Rsetxattr) Decode(b *buffer) { +func (r *Rsetxattr) Decode(*buffer) { } // Encode implements encoder.Encode. -func (r *Rsetxattr) Encode(b *buffer) { +func (r *Rsetxattr) Encode(*buffer) { } // Type implements message.Type. @@ -1733,7 +1797,60 @@ func (*Rsetxattr) Type() MsgType { // String implements fmt.Stringer. func (r *Rsetxattr) String() string { - return fmt.Sprintf("Rsetxattr{}") + return "Rsetxattr{}" +} + +// Tremovexattr is a removexattr request. +type Tremovexattr struct { + // FID refers to the file on which to set xattrs. + FID FID + + // Name is the attribute name. + Name string +} + +// Decode implements encoder.Decode. +func (t *Tremovexattr) Decode(b *buffer) { + t.FID = b.ReadFID() + t.Name = b.ReadString() +} + +// Encode implements encoder.Encode. +func (t *Tremovexattr) Encode(b *buffer) { + b.WriteFID(t.FID) + b.WriteString(t.Name) +} + +// Type implements message.Type. +func (*Tremovexattr) Type() MsgType { + return MsgTremovexattr +} + +// String implements fmt.Stringer. +func (t *Tremovexattr) String() string { + return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name) +} + +// Rremovexattr is a removexattr response. +type Rremovexattr struct { +} + +// Decode implements encoder.Decode. +func (r *Rremovexattr) Decode(*buffer) { +} + +// Encode implements encoder.Encode. +func (r *Rremovexattr) Encode(*buffer) { +} + +// Type implements message.Type. +func (*Rremovexattr) Type() MsgType { + return MsgRremovexattr +} + +// String implements fmt.Stringer. +func (r *Rremovexattr) String() string { + return "Rremovexattr{}" } // Treaddir is a readdir request. @@ -1880,11 +1997,11 @@ type Rfsync struct { } // Decode implements encoder.Decode. -func (*Rfsync) Decode(b *buffer) { +func (*Rfsync) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rfsync) Encode(b *buffer) { +func (*Rfsync) Encode(*buffer) { } // Type implements message.Type. @@ -1894,7 +2011,7 @@ func (*Rfsync) Type() MsgType { // String implements fmt.Stringer. func (r *Rfsync) String() string { - return fmt.Sprintf("Rfsync{}") + return "Rfsync{}" } // Tstatfs is a stat request. @@ -1980,11 +2097,11 @@ type Rflushf struct { } // Decode implements encoder.Decode. -func (*Rflushf) Decode(b *buffer) { +func (*Rflushf) Decode(*buffer) { } // Encode implements encoder.Encode. -func (*Rflushf) Encode(b *buffer) { +func (*Rflushf) Encode(*buffer) { } // Type implements message.Type. @@ -1994,7 +2111,7 @@ func (*Rflushf) Type() MsgType { // String implements fmt.Stringer. func (*Rflushf) String() string { - return fmt.Sprintf("Rflushf{}") + return "Rflushf{}" } // Twalkgetattr is a walk request. @@ -2484,6 +2601,8 @@ func init() { msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} }) msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} }) msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} }) + msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} }) + msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} }) msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} }) msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} }) msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} }) @@ -2492,6 +2611,8 @@ func init() { msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} }) msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} }) msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} }) + msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} }) + msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} }) msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} }) msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} }) msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} }) diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 5ab00d625..20ab31f7a 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -335,6 +335,8 @@ const ( MsgRgetattr = 25 MsgTsetattr = 26 MsgRsetattr = 27 + MsgTlistxattr = 28 + MsgRlistxattr = 29 MsgTxattrwalk = 30 MsgRxattrwalk = 31 MsgTxattrcreate = 32 @@ -343,6 +345,8 @@ const ( MsgRgetxattr = 35 MsgTsetxattr = 36 MsgRsetxattr = 37 + MsgTremovexattr = 38 + MsgRremovexattr = 39 MsgTreaddir = 40 MsgRreaddir = 41 MsgTfsync = 50 diff --git a/pkg/p9/version.go b/pkg/p9/version.go index 34a15eb55..09cde9f5a 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -26,7 +26,7 @@ const ( // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. - highestSupportedVersion uint32 = 10 + highestSupportedVersion uint32 = 11 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. @@ -167,3 +167,9 @@ func VersionSupportsOpenTruncateFlag(v uint32) bool { func versionSupportsGetSetXattr(v uint32) bool { return v >= 10 } + +// versionSupportsListRemoveXattr returns true if version v supports +// the Tlistxattr and Tremovexattr messages. +func versionSupportsListRemoveXattr(v uint32) bool { + return v >= 11 +} diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index f6c79e51b..b060a12ff 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -401,7 +401,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error if err != nil { return err } - lowerXattr, err := lower.ListXattr(ctx) + lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX) if err != nil && err != syserror.EOPNOTSUPP { return err } diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 252830572..daecc4ffe 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -247,7 +247,7 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode, } // ListXattr implements fs.InodeOperations.ListXattr. -func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { +func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) { i.mu.RLock() names := make(map[string]struct{}, len(i.xattrs)) for name := range i.xattrs { @@ -257,6 +257,17 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (m return names, nil } +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error { + i.mu.RLock() + defer i.mu.RUnlock() + if _, ok := i.xattrs[name]; ok { + delete(i.xattrs, name) + return nil + } + return syserror.ENOATTR +} + // staticFile is a file with static contents. It is returned by // InodeStaticFileGetter.GetFile. // @@ -460,10 +471,15 @@ func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, st } // ListXattr implements fs.InodeOperations.ListXattr. -func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { +func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) { return nil, syserror.EOPNOTSUPP } +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error { + return syserror.EOPNOTSUPP +} + // InodeNoopRelease implements fs.InodeOperations.Release as a noop. type InodeNoopRelease struct{} diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 3da818aed..125907d70 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -73,6 +73,20 @@ func (c *contextFile) setXattr(ctx context.Context, name, value string, flags ui return err } +func (c *contextFile) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := c.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + +func (c *contextFile) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := c.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := c.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index ac28174d2..1c934981b 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -604,18 +604,23 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length } // GetXattr implements fs.InodeOperations.GetXattr. -func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) { +func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) { return i.fileState.file.getXattr(ctx, name, size) } // SetXattr implements fs.InodeOperations.SetXattr. -func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error { +func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error { return i.fileState.file.setXattr(ctx, name, value, flags) } // ListXattr implements fs.InodeOperations.ListXattr. -func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { - return nil, syscall.EOPNOTSUPP +func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) { + return i.fileState.file.listXattr(ctx, size) +} + +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error { + return i.fileState.file.removeXattr(ctx, name) } // Allocate implements fs.InodeOperations.Allocate. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index b66c091ab..55fb71c16 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -278,11 +278,19 @@ func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, fla } // ListXattr calls i.InodeOperations.ListXattr with i as the Inode. -func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) { +func (i *Inode) ListXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { if i.overlay != nil { - return overlayListXattr(ctx, i.overlay) + return overlayListXattr(ctx, i.overlay, size) } - return i.InodeOperations.ListXattr(ctx, i) + return i.InodeOperations.ListXattr(ctx, i, size) +} + +// RemoveXattr calls i.InodeOperations.RemoveXattr with i as the Inode. +func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error { + if i.overlay != nil { + return overlayRemoveXattr(ctx, i.overlay, d, name) + } + return i.InodeOperations.RemoveXattr(ctx, i, name) } // CheckPermission will check if the caller may access this file in the diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 70f2eae96..2bbfb72ef 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -190,7 +190,18 @@ type InodeOperations interface { // ListXattr returns the set of all extended attributes names that // have values. Inodes that do not support extended attributes return // EOPNOTSUPP. - ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error) + // + // If this is called through the listxattr(2) syscall, size indicates the + // size of the buffer that the application has allocated to hold the + // attribute list. If the list would be larger than size, implementations may + // return ERANGE to indicate that the buffer is too small, but they are also + // free to ignore the hint entirely. All size checking is done independently + // at the syscall layer. + ListXattr(ctx context.Context, inode *Inode, size uint64) (map[string]struct{}, error) + + // RemoveXattr removes an extended attribute specified by name. Inodes that + // do not support extended attributes return EOPNOTSUPP. + RemoveXattr(ctx context.Context, inode *Inode, name string) error // Check determines whether an Inode can be accessed with the // requested permission mask using the context (which gives access diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 4729b4aac..5ada33a32 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -564,15 +564,15 @@ func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, valu return o.upper.SetXattr(ctx, d, name, value, flags) } -func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) { +func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[string]struct{}, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() var names map[string]struct{} var err error if o.upper != nil { - names, err = o.upper.ListXattr(ctx) + names, err = o.upper.ListXattr(ctx, size) } else { - names, err = o.lower.ListXattr(ctx) + names, err = o.lower.ListXattr(ctx, size) } for name := range names { // Same as overlayGetXattr, we shouldn't forward along @@ -584,6 +584,18 @@ func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{} return names, err } +func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error { + // Don't allow changes to overlay xattrs through a removexattr syscall. + if strings.HasPrefix(XattrOverlayPrefix, name) { + return syserror.EPERM + } + + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.RemoveXattr(ctx, d, name) +} + func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { o.copyMu.RLock() // Hot path. Avoid defers. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index c00cef0a5..3c2b583ae 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -159,8 +159,13 @@ func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, fla } // ListXattr implements fs.InodeOperations.ListXattr. -func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) { - return d.ramfsDir.ListXattr(ctx, i) +func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode, size uint64) (map[string]struct{}, error) { + return d.ramfsDir.ListXattr(ctx, i, size) +} + +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (d *Dir) RemoveXattr(ctx context.Context, i *fs.Inode, name string) error { + return d.ramfsDir.RemoveXattr(ctx, i, name) } // Lookup implements fs.InodeOperations.Lookup. diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 588f8b087..79066ad2a 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -228,21 +228,18 @@ var AMD64 = &kernel.SyscallTable{ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), - // TODO(b/148303075): Enable set/getxattr (in their various - // forms) once we also have list and removexattr. The JVM - // assumes that if get/set exist, then list and remove do too. - 188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), 200: syscalls.Supported("tkill", Tkill), 201: syscalls.Supported("time", Time), 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index 06e5ee401..7421619de 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -36,26 +36,23 @@ var ARM64 = &kernel.SyscallTable{ }, AuditNumber: linux.AUDIT_ARCH_AARCH64, Table: map[uintptr]kernel.Syscall{ - 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - // TODO(b/148303075): Enable set/getxattr (in their various - // forms) once we also have list and removexattr. The JVM - // assumes that if get/set exist, then list and remove do too. - 5: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 6: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 7: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 8: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 13: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 14: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 15: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), - 16: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}), + 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), 17: syscalls.Supported("getcwd", Getcwd), 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 19: syscalls.Supported("eventfd2", Eventfd2), diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index efb95555c..342337726 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -72,7 +72,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink } valueLen := 0 - err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } @@ -172,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink return 0, nil, err } - return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } @@ -187,12 +187,12 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si return syserror.EINVAL } - if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + name, err := copyInXattrName(t, nameAddr) + if err != nil { return err } - name, err := copyInXattrName(t, nameAddr) - if err != nil { + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { return err } @@ -226,12 +226,18 @@ func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { return name, nil } +// Restrict xattrs to regular files and directories. +// +// TODO(b/148380782): In Linux, this restriction technically only applies to +// xattrs in the "user.*" namespace. Make file type checks specific to the +// namespace once we allow other xattr prefixes. +func xattrFileTypeOk(i *fs.Inode) bool { + return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr) +} + func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error { // Restrict xattrs to regular files and directories. - // - // In Linux, this restriction technically only applies to xattrs in the - // "user.*" namespace, but we don't allow any other xattr prefixes anyway. - if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) { + if !xattrFileTypeOk(i) { if perms.Write { return syserror.EPERM } @@ -240,3 +246,179 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error return i.CheckPermission(t, perms) } + +// ListXattr implements linux syscall listxattr(2). +func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listXattrFromPath(t, args, true) +} + +// LListXattr implements linux syscall llistxattr(2). +func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listXattrFromPath(t, args, false) +} + +// FListXattr implements linux syscall flistxattr(2). +func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := uint64(args[2].SizeT()) + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + n, err := listXattr(t, f.Dirent, listAddr, size) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := uint64(args[2].SizeT()) + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + n := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + n, err = listXattr(t, d, listAddr, size) + return err + }) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) { + if !xattrFileTypeOk(d.Inode) { + return 0, nil + } + + // If listxattr(2) is called with size 0, the buffer size needed to contain + // the xattr list will be returned successfully even if it is nonzero. In + // that case, we need to retrieve the entire list so we can compute and + // return the correct size. + requestedSize := size + if size == 0 || size > linux.XATTR_SIZE_MAX { + requestedSize = linux.XATTR_SIZE_MAX + } + xattrs, err := d.Inode.ListXattr(t, requestedSize) + if err != nil { + return 0, err + } + + // TODO(b/148380782): support namespaces other than "user". + for x := range xattrs { + if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + delete(xattrs, x) + } + } + + listSize := xattrListSize(xattrs) + if listSize > linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + if uint64(listSize) > requestedSize { + return 0, syserror.ERANGE + } + + // Don't copy out the attributes if size is 0. + if size == 0 { + return listSize, nil + } + + buf := make([]byte, 0, listSize) + for x := range xattrs { + buf = append(buf, []byte(x)...) + buf = append(buf, 0) + } + if _, err := t.CopyOutBytes(addr, buf); err != nil { + return 0, err + } + + return len(buf), nil +} + +func xattrListSize(xattrs map[string]struct{}) int { + size := 0 + for x := range xattrs { + size += len(x) + 1 + } + return size +} + +// RemoveXattr implements linux syscall removexattr(2). +func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return removeXattrFromPath(t, args, true) +} + +// LRemoveXattr implements linux syscall lremovexattr(2). +func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return removeXattrFromPath(t, args, false) +} + +// FRemoveXattr implements linux syscall fremovexattr(2). +func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + return 0, nil, removeXattr(t, f.Dirent, nameAddr) +} + +func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + return removeXattr(t, d, nameAddr) + }) +} + +// removeXattr implements removexattr(2) from the given *fs.Dirent. +func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error { + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + return err + } + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + + return d.Inode.RemoveXattr(t, d, name) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 4d84ad999..cadd83273 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -768,12 +768,22 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } // TODO(b/127675828): support getxattr. -func (l *localFile) GetXattr(name string, size uint64) (string, error) { +func (*localFile) GetXattr(string, uint64) (string, error) { return "", syscall.EOPNOTSUPP } // TODO(b/127675828): support setxattr. -func (l *localFile) SetXattr(name, value string, flags uint32) error { +func (*localFile) SetXattr(string, string, uint32) error { + return syscall.EOPNOTSUPP +} + +// TODO(b/148303075): support listxattr. +func (*localFile) ListXattr(uint64) (map[string]struct{}, error) { + return nil, syscall.EOPNOTSUPP +} + +// TODO(b/148303075): support removexattr. +func (*localFile) RemoveXattr(string) error { return syscall.EOPNOTSUPP } @@ -790,7 +800,7 @@ func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error } // Rename implements p9.File; this should never be called. -func (l *localFile) Rename(p9.File, string) error { +func (*localFile) Rename(p9.File, string) error { panic("rename called directly") } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 12d389c3e..ca1af209a 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3782,6 +3782,7 @@ cc_binary( "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:fs_util", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", gtest, "//test/util:posix_error", diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc index 85eb31847..8b00ef44c 100644 --- a/test/syscalls/linux/xattr.cc +++ b/test/syscalls/linux/xattr.cc @@ -24,6 +24,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/container/flat_hash_set.h" #include "test/syscalls/linux/file_base.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" @@ -38,36 +39,36 @@ namespace { class XattrTest : public FileTest {}; -TEST_F(XattrTest, XattrNullName) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); +TEST_F(XattrTest, XattrNonexistentFile) { + const char* path = "/does/not/exist"; + EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0), + SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(getxattr(path, nullptr, nullptr, 0), + SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT)); +} +TEST_F(XattrTest, XattrNullName) { const char* path = test_file_name_.c_str(); EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(EFAULT)); EXPECT_THAT(getxattr(path, nullptr, nullptr, 0), SyscallFailsWithErrno(EFAULT)); + EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(EFAULT)); } TEST_F(XattrTest, XattrEmptyName) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(ERANGE)); EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE)); + EXPECT_THAT(removexattr(path, ""), SyscallFailsWithErrno(ERANGE)); } TEST_F(XattrTest, XattrLargeName) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); std::string name = "user."; name += std::string(XATTR_NAME_MAX - name.length(), 'a'); @@ -86,28 +87,23 @@ TEST_F(XattrTest, XattrLargeName) { SyscallFailsWithErrno(ERANGE)); EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), SyscallFailsWithErrno(ERANGE)); + EXPECT_THAT(removexattr(path, name.c_str()), SyscallFailsWithErrno(ERANGE)); } TEST_F(XattrTest, XattrInvalidPrefix) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); std::string name(XATTR_NAME_MAX, 'a'); EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(EOPNOTSUPP)); EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), SyscallFailsWithErrno(EOPNOTSUPP)); + EXPECT_THAT(removexattr(path, name.c_str()), + SyscallFailsWithErrno(EOPNOTSUPP)); } // Do not allow save/restore cycles after making the test file read-only, as // the restore will fail to open it with r/w permissions. TEST_F(XattrTest, XattrReadOnly_NoRandomSave) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - // Drop capabilities that allow us to override file and directory permissions. ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); @@ -124,19 +120,21 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) { EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EACCES)); char buf = '-'; EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size)); EXPECT_EQ(buf, val); + + char list[sizeof(name)]; + EXPECT_THAT(listxattr(path, list, sizeof(list)), + SyscallSucceedsWithValue(sizeof(name))); + EXPECT_STREQ(list, name); } // Do not allow save/restore cycles after making the test file write-only, as // the restore will fail to open it with r/w permissions. TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - // Drop capabilities that allow us to override file and directory permissions. ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); @@ -152,6 +150,14 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) { EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds()); EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES)); + + // listxattr will succeed even without read permissions. + char list[sizeof(name)]; + EXPECT_THAT(listxattr(path, list, sizeof(list)), + SyscallSucceedsWithValue(sizeof(name))); + EXPECT_STREQ(list, name); + + EXPECT_THAT(removexattr(path, name), SyscallSucceeds()); } TEST_F(XattrTest, XattrTrustedWithNonadmin) { @@ -163,64 +169,66 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) { const char name[] = "trusted.abc"; EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM)); EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); } TEST_F(XattrTest, XattrOnDirectory) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const char name[] = "user.test"; - EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0), + EXPECT_THAT(setxattr(dir.path().c_str(), name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); - EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0), + EXPECT_THAT(getxattr(dir.path().c_str(), name, nullptr, 0), SyscallSucceedsWithValue(0)); + + char list[sizeof(name)]; + EXPECT_THAT(listxattr(dir.path().c_str(), list, sizeof(list)), + SyscallSucceedsWithValue(sizeof(name))); + EXPECT_STREQ(list, name); + + EXPECT_THAT(removexattr(dir.path().c_str(), name), SyscallSucceeds()); } TEST_F(XattrTest, XattrOnSymlink) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath link = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateSymlinkTo(dir.path(), test_file_name_)); const char name[] = "user.test"; - EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0), + EXPECT_THAT(setxattr(link.path().c_str(), name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); - EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0), + EXPECT_THAT(getxattr(link.path().c_str(), name, nullptr, 0), SyscallSucceedsWithValue(0)); + + char list[sizeof(name)]; + EXPECT_THAT(listxattr(link.path().c_str(), list, sizeof(list)), + SyscallSucceedsWithValue(sizeof(name))); + EXPECT_STREQ(list, name); + + EXPECT_THAT(removexattr(link.path().c_str(), name), SyscallSucceeds()); } TEST_F(XattrTest, XattrOnInvalidFileTypes) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char name[] = "user.test"; char char_device[] = "/dev/zero"; - EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0), + EXPECT_THAT(setxattr(char_device, name, nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(EPERM)); - EXPECT_THAT(getxattr(char_device, name, NULL, 0), + EXPECT_THAT(getxattr(char_device, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); + EXPECT_THAT(listxattr(char_device, nullptr, 0), SyscallSucceedsWithValue(0)); // Use tmpfs, where creation of named pipes is supported. const std::string fifo = NewTempAbsPathInDir("/dev/shm"); const char* path = fifo.c_str(); EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds()); - EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0), + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(EPERM)); - EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA)); + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); + EXPECT_THAT(listxattr(path, nullptr, 0), SyscallSucceedsWithValue(0)); + EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM)); } TEST_F(XattrTest, SetxattrSizeSmallerThanValue) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -236,10 +244,6 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) { } TEST_F(XattrTest, SetxattrZeroSize) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -252,10 +256,6 @@ TEST_F(XattrTest, SetxattrZeroSize) { } TEST_F(XattrTest, SetxattrSizeTooLarge) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; @@ -271,10 +271,6 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) { } TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0), @@ -284,10 +280,6 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) { } TEST_F(XattrTest, SetxattrNullValueAndZeroSize) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); @@ -296,10 +288,6 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) { } TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val(XATTR_SIZE_MAX + 1); @@ -316,10 +304,6 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) { } TEST_F(XattrTest, SetxattrReplaceWithSmaller) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -335,10 +319,6 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) { } TEST_F(XattrTest, SetxattrReplaceWithLarger) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -353,10 +333,6 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) { } TEST_F(XattrTest, SetxattrCreateFlag) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE), @@ -368,10 +344,6 @@ TEST_F(XattrTest, SetxattrCreateFlag) { } TEST_F(XattrTest, SetxattrReplaceFlag) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE), @@ -384,10 +356,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) { } TEST_F(XattrTest, SetxattrInvalidFlags) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); int invalid_flags = 0xff; EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags), @@ -395,10 +363,6 @@ TEST_F(XattrTest, SetxattrInvalidFlags) { } TEST_F(XattrTest, Getxattr) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; int val = 1234; @@ -411,10 +375,6 @@ TEST_F(XattrTest, Getxattr) { } TEST_F(XattrTest, GetxattrSizeSmallerThanValue) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; std::vector val = {'a', 'a'}; @@ -427,10 +387,6 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) { } TEST_F(XattrTest, GetxattrSizeLargerThanValue) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -446,10 +402,6 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) { } TEST_F(XattrTest, GetxattrZeroSize) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -463,10 +415,6 @@ TEST_F(XattrTest, GetxattrZeroSize) { } TEST_F(XattrTest, GetxattrSizeTooLarge) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -483,10 +431,6 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) { } TEST_F(XattrTest, GetxattrNullValue) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -498,10 +442,6 @@ TEST_F(XattrTest, GetxattrNullValue) { } TEST_F(XattrTest, GetxattrNullValueAndZeroSize) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - const char* path = test_file_name_.c_str(); const char name[] = "user.test"; char val = 'a'; @@ -518,35 +458,109 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) { } TEST_F(XattrTest, GetxattrNonexistentName) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); + const char* path = test_file_name_.c_str(); + const char name[] = "user.test"; + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); +} + +TEST_F(XattrTest, Listxattr) { + const char* path = test_file_name_.c_str(); + const std::string name = "user.test"; + const std::string name2 = "user.test2"; + const std::string name3 = "user.test3"; + EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name2.c_str(), nullptr, 0, /*flags=*/0), + SyscallSucceeds()); + EXPECT_THAT(setxattr(path, name3.c_str(), nullptr, 0, /*flags=*/0), + SyscallSucceeds()); + std::vector list(name.size() + 1 + name2.size() + 1 + name3.size() + 1); + char* buf = list.data(); + EXPECT_THAT(listxattr(path, buf, XATTR_SIZE_MAX), + SyscallSucceedsWithValue(list.size())); + + absl::flat_hash_set got = {}; + for (char* p = buf; p < buf + list.size(); p += strlen(p) + 1) { + got.insert(std::string{p}); + } + + absl::flat_hash_set expected = {name, name2, name3}; + EXPECT_EQ(got, expected); +} + +TEST_F(XattrTest, ListxattrNoXattrs) { + const char* path = test_file_name_.c_str(); + + std::vector list, expected; + EXPECT_THAT(listxattr(path, list.data(), sizeof(list)), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(list, expected); + + // Listxattr should succeed if there are no attributes, even if the buffer + // passed in is a nullptr. + EXPECT_THAT(listxattr(path, nullptr, sizeof(list)), + SyscallSucceedsWithValue(0)); +} + +TEST_F(XattrTest, ListxattrNullBuffer) { const char* path = test_file_name_.c_str(); const char name[] = "user.test"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); + + EXPECT_THAT(listxattr(path, nullptr, sizeof(name)), + SyscallFailsWithErrno(EFAULT)); +} + +TEST_F(XattrTest, ListxattrSizeTooSmall) { + const char* path = test_file_name_.c_str(); + const char name[] = "user.test"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); + + char list[sizeof(name) - 1]; + EXPECT_THAT(listxattr(path, list, sizeof(list)), + SyscallFailsWithErrno(ERANGE)); +} + +TEST_F(XattrTest, ListxattrZeroSize) { + const char* path = test_file_name_.c_str(); + const char name[] = "user.test"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); + EXPECT_THAT(listxattr(path, nullptr, 0), + SyscallSucceedsWithValue(sizeof(name))); +} + +TEST_F(XattrTest, RemoveXattr) { + const char* path = test_file_name_.c_str(); + const char name[] = "user.test"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds()); + EXPECT_THAT(removexattr(path, name), SyscallSucceeds()); EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); } -TEST_F(XattrTest, LGetSetxattrOnSymlink) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); +TEST_F(XattrTest, RemoveXattrNonexistentName) { + const char* path = test_file_name_.c_str(); + const char name[] = "user.test"; + EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENODATA)); +} +TEST_F(XattrTest, LXattrOnSymlink) { + const char name[] = "user.test"; TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath link = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateSymlinkTo(dir.path(), test_file_name_)); - EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0), + EXPECT_THAT(lsetxattr(link.path().c_str(), name, nullptr, 0, 0), SyscallFailsWithErrno(EPERM)); - EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0), + EXPECT_THAT(lgetxattr(link.path().c_str(), name, nullptr, 0), SyscallFailsWithErrno(ENODATA)); + EXPECT_THAT(llistxattr(link.path().c_str(), nullptr, 0), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(lremovexattr(link.path().c_str(), name), + SyscallFailsWithErrno(EPERM)); } -TEST_F(XattrTest, LGetSetxattrOnNonsymlink) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); - +TEST_F(XattrTest, LXattrOnNonsymlink) { const char* path = test_file_name_.c_str(); const char name[] = "user.test"; int val = 1234; @@ -558,13 +572,16 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) { EXPECT_THAT(lgetxattr(path, name, &buf, size), SyscallSucceedsWithValue(size)); EXPECT_EQ(buf, val); -} -TEST_F(XattrTest, FGetSetxattr) { - // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are - // supported, and get/set have been added pack to the syscall table. - SKIP_IF(IsRunningOnGvisor()); + char list[sizeof(name)]; + EXPECT_THAT(llistxattr(path, list, sizeof(list)), + SyscallSucceedsWithValue(sizeof(name))); + EXPECT_STREQ(list, name); + EXPECT_THAT(lremovexattr(path, name), SyscallSucceeds()); +} + +TEST_F(XattrTest, XattrWithFD) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0)); const char name[] = "user.test"; @@ -577,6 +594,13 @@ TEST_F(XattrTest, FGetSetxattr) { EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size), SyscallSucceedsWithValue(size)); EXPECT_EQ(buf, val); + + char list[sizeof(name)]; + EXPECT_THAT(flistxattr(fd.get(), list, sizeof(list)), + SyscallSucceedsWithValue(sizeof(name))); + EXPECT_STREQ(list, name); + + EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds()); } } // namespace -- cgit v1.2.3 From 9cbf5a3dcc71d85cd857f117d4b7189a101be9c1 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Fri, 25 Oct 2019 03:09:59 +0000 Subject: Enable pkg/cpuid support on arm64. Fixes #1255 Signed-off-by: Haibo Xu Change-Id: I8614e6f3ee321c2989567e4e712aa8f28cc9db14 --- pkg/cpuid/BUILD | 9 +- pkg/cpuid/cpuid.go | 1098 +----------------------------------- pkg/cpuid/cpuid_arm64.go | 461 ++++++++++++++++ pkg/cpuid/cpuid_arm64_test.go | 55 ++ pkg/cpuid/cpuid_parse_test.go | 142 ----- pkg/cpuid/cpuid_parse_x86_test.go | 144 +++++ pkg/cpuid/cpuid_test.go | 241 -------- pkg/cpuid/cpuid_x86.go | 1100 +++++++++++++++++++++++++++++++++++++ pkg/cpuid/cpuid_x86_test.go | 243 ++++++++ 9 files changed, 2018 insertions(+), 1475 deletions(-) create mode 100644 pkg/cpuid/cpuid_arm64.go create mode 100644 pkg/cpuid/cpuid_arm64_test.go delete mode 100644 pkg/cpuid/cpuid_parse_test.go create mode 100644 pkg/cpuid/cpuid_parse_x86_test.go delete mode 100644 pkg/cpuid/cpuid_test.go create mode 100644 pkg/cpuid/cpuid_x86.go create mode 100644 pkg/cpuid/cpuid_x86_test.go (limited to 'pkg') diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index 43a432190..d6cb1a549 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -7,6 +7,8 @@ go_library( srcs = [ "cpu_amd64.s", "cpuid.go", + "cpuid_arm64.go", + "cpuid_x86.go", ], visibility = ["//:sandbox"], deps = ["//pkg/log"], @@ -15,7 +17,10 @@ go_library( go_test( name = "cpuid_test", size = "small", - srcs = ["cpuid_test.go"], + srcs = [ + "cpuid_arm64_test.go", + "cpuid_x86_test.go", + ], library = ":cpuid", ) @@ -23,7 +28,7 @@ go_test( name = "cpuid_parse_test", size = "small", srcs = [ - "cpuid_parse_test.go", + "cpuid_parse_x86_test.go", ], library = ":cpuid", tags = ["manual"], diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index cf50ee53f..f7f9dbf86 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build i386 amd64 - // Package cpuid provides basic functionality for creating and adjusting CPU // feature sets. // @@ -21,1100 +19,20 @@ // known platform, or HostFeatureSet()) and then add, remove, and test for // features as desired. // -// For example: Test for hardware extended state saving, and if we don't have -// it, don't expose AVX, which cannot be saved with fxsave. +// For example: on x86, test for hardware extended state saving, and if +// we don't have it, don't expose AVX, which cannot be saved with fxsave. // // if !HostFeatureSet().HasFeature(X86FeatureXSAVE) { // exposedFeatures.Remove(X86FeatureAVX) // } package cpuid -import ( - "bytes" - "fmt" - "io/ioutil" - "strconv" - "strings" - - "gvisor.dev/gvisor/pkg/log" -) - -// Common references for CPUID leaves and bits: -// -// Intel: -// * Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date) -// * Intel Application Note 485 (more detailed) -// -// AMD: -// * AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..." - // Feature is a unique identifier for a particular cpu feature. We just use an -// int as a feature number on x86. +// int as a feature number on x86 and arm64. // -// Features are numbered according to "blocks". Each block is 32 bits, and -// feature bits from the same source (cpuid leaf/level) are in the same block. -type Feature int - -// block is a collection of 32 Feature bits. -type block int - -const blockSize = 32 - -// Feature bits are numbered according to "blocks". Each block is 32 bits, and +// On x86, features are numbered according to "blocks". Each block is 32 bits, and // feature bits from the same source (cpuid leaf/level) are in the same block. -func featureID(b block, bit int) Feature { - return Feature(32*int(b) + bit) -} - -// Block 0 constants are all of the "basic" feature bits returned by a cpuid in -// ecx with eax=1. -const ( - X86FeatureSSE3 Feature = iota - X86FeaturePCLMULDQ - X86FeatureDTES64 - X86FeatureMONITOR - X86FeatureDSCPL - X86FeatureVMX - X86FeatureSMX - X86FeatureEST - X86FeatureTM2 - X86FeatureSSSE3 // Not a typo, "supplemental" SSE3. - X86FeatureCNXTID - X86FeatureSDBG - X86FeatureFMA - X86FeatureCX16 - X86FeatureXTPR - X86FeaturePDCM - _ // ecx bit 16 is reserved. - X86FeaturePCID - X86FeatureDCA - X86FeatureSSE4_1 - X86FeatureSSE4_2 - X86FeatureX2APIC - X86FeatureMOVBE - X86FeaturePOPCNT - X86FeatureTSCD - X86FeatureAES - X86FeatureXSAVE - X86FeatureOSXSAVE - X86FeatureAVX - X86FeatureF16C - X86FeatureRDRAND - _ // ecx bit 31 is reserved. -) - -// Block 1 constants are all of the "basic" feature bits returned by a cpuid in -// edx with eax=1. -const ( - X86FeatureFPU Feature = 32 + iota - X86FeatureVME - X86FeatureDE - X86FeaturePSE - X86FeatureTSC - X86FeatureMSR - X86FeaturePAE - X86FeatureMCE - X86FeatureCX8 - X86FeatureAPIC - _ // edx bit 10 is reserved. - X86FeatureSEP - X86FeatureMTRR - X86FeaturePGE - X86FeatureMCA - X86FeatureCMOV - X86FeaturePAT - X86FeaturePSE36 - X86FeaturePSN - X86FeatureCLFSH - _ // edx bit 20 is reserved. - X86FeatureDS - X86FeatureACPI - X86FeatureMMX - X86FeatureFXSR - X86FeatureSSE - X86FeatureSSE2 - X86FeatureSS - X86FeatureHTT - X86FeatureTM - X86FeatureIA64 - X86FeaturePBE -) - -// Block 2 bits are the "structured extended" features returned in ebx for -// eax=7, ecx=0. -const ( - X86FeatureFSGSBase Feature = 2*32 + iota - X86FeatureTSC_ADJUST - _ // ebx bit 2 is reserved. - X86FeatureBMI1 - X86FeatureHLE - X86FeatureAVX2 - X86FeatureFDP_EXCPTN_ONLY - X86FeatureSMEP - X86FeatureBMI2 - X86FeatureERMS - X86FeatureINVPCID - X86FeatureRTM - X86FeatureCQM - X86FeatureFPCSDS - X86FeatureMPX - X86FeatureRDT - X86FeatureAVX512F - X86FeatureAVX512DQ - X86FeatureRDSEED - X86FeatureADX - X86FeatureSMAP - X86FeatureAVX512IFMA - X86FeaturePCOMMIT - X86FeatureCLFLUSHOPT - X86FeatureCLWB - X86FeatureIPT // Intel processor trace. - X86FeatureAVX512PF - X86FeatureAVX512ER - X86FeatureAVX512CD - X86FeatureSHA - X86FeatureAVX512BW - X86FeatureAVX512VL -) - -// Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0. -const ( - X86FeaturePREFETCHWT1 Feature = 3*32 + iota - X86FeatureAVX512VBMI - X86FeatureUMIP - X86FeaturePKU - X86FeatureOSPKE - X86FeatureWAITPKG - X86FeatureAVX512_VBMI2 - _ // ecx bit 7 is reserved - X86FeatureGFNI - X86FeatureVAES - X86FeatureVPCLMULQDQ - X86FeatureAVX512_VNNI - X86FeatureAVX512_BITALG - X86FeatureTME - X86FeatureAVX512_VPOPCNTDQ - _ // ecx bit 15 is reserved - X86FeatureLA57 - // ecx bits 17-21 are reserved - _ - _ - _ - _ - _ - X86FeatureRDPID - // ecx bits 23-24 are reserved - _ - _ - X86FeatureCLDEMOTE - _ // ecx bit 26 is reserved - X86FeatureMOVDIRI - X86FeatureMOVDIR64B -) - -// Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX. -// The CPUID leaf is available only if 'X86FeatureXSAVE' is present. -const ( - X86FeatureXSAVEOPT Feature = 4*32 + iota - X86FeatureXSAVEC - X86FeatureXGETBV1 - X86FeatureXSAVES - // EAX[31:4] are reserved. -) - -// Block 5 constants are the extended feature bits in -// CPUID.(EAX=0x80000001):ECX. -const ( - X86FeatureLAHF64 Feature = 5*32 + iota - X86FeatureCMP_LEGACY - X86FeatureSVM - X86FeatureEXTAPIC - X86FeatureCR8_LEGACY - X86FeatureLZCNT - X86FeatureSSE4A - X86FeatureMISALIGNSSE - X86FeaturePREFETCHW - X86FeatureOSVW - X86FeatureIBS - X86FeatureXOP - X86FeatureSKINIT - X86FeatureWDT - _ // ecx bit 14 is reserved. - X86FeatureLWP - X86FeatureFMA4 - X86FeatureTCE - _ // ecx bit 18 is reserved. - _ // ecx bit 19 is reserved. - _ // ecx bit 20 is reserved. - X86FeatureTBM - X86FeatureTOPOLOGY - X86FeaturePERFCTR_CORE - X86FeaturePERFCTR_NB - _ // ecx bit 25 is reserved. - X86FeatureBPEXT - X86FeaturePERFCTR_TSC - X86FeaturePERFCTR_LLC - X86FeatureMWAITX - // ECX[31:30] are reserved. -) - -// Block 6 constants are the extended feature bits in -// CPUID.(EAX=0x80000001):EDX. // -// These are sparse, and so the bit positions are assigned manually. -const ( - // On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features - // also defined in block 1 (in identical bit positions). Those features - // are not listed here. - block6DuplicateMask = 0x183f3ff - - X86FeatureSYSCALL Feature = 6*32 + 11 - X86FeatureNX Feature = 6*32 + 20 - X86FeatureMMXEXT Feature = 6*32 + 22 - X86FeatureFXSR_OPT Feature = 6*32 + 25 - X86FeatureGBPAGES Feature = 6*32 + 26 - X86FeatureRDTSCP Feature = 6*32 + 27 - X86FeatureLM Feature = 6*32 + 29 - X86Feature3DNOWEXT Feature = 6*32 + 30 - X86Feature3DNOW Feature = 6*32 + 31 -) - -// linuxBlockOrder defines the order in which linux organizes the feature -// blocks. Linux also tracks feature bits in 32-bit blocks, but in an order -// which doesn't match well here, so for the /proc/cpuinfo generation we simply -// re-map the blocks to Linux's ordering and then go through the bits in each -// block. -var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3} - -// To make emulation of /proc/cpuinfo easy, these names match the names of the -// basic features in Linux defined in arch/x86/kernel/cpu/capflags.c. -var x86FeatureStrings = map[Feature]string{ - // Block 0. - X86FeatureSSE3: "pni", - X86FeaturePCLMULDQ: "pclmulqdq", - X86FeatureDTES64: "dtes64", - X86FeatureMONITOR: "monitor", - X86FeatureDSCPL: "ds_cpl", - X86FeatureVMX: "vmx", - X86FeatureSMX: "smx", - X86FeatureEST: "est", - X86FeatureTM2: "tm2", - X86FeatureSSSE3: "ssse3", - X86FeatureCNXTID: "cid", - X86FeatureSDBG: "sdbg", - X86FeatureFMA: "fma", - X86FeatureCX16: "cx16", - X86FeatureXTPR: "xtpr", - X86FeaturePDCM: "pdcm", - X86FeaturePCID: "pcid", - X86FeatureDCA: "dca", - X86FeatureSSE4_1: "sse4_1", - X86FeatureSSE4_2: "sse4_2", - X86FeatureX2APIC: "x2apic", - X86FeatureMOVBE: "movbe", - X86FeaturePOPCNT: "popcnt", - X86FeatureTSCD: "tsc_deadline_timer", - X86FeatureAES: "aes", - X86FeatureXSAVE: "xsave", - X86FeatureAVX: "avx", - X86FeatureF16C: "f16c", - X86FeatureRDRAND: "rdrand", - - // Block 1. - X86FeatureFPU: "fpu", - X86FeatureVME: "vme", - X86FeatureDE: "de", - X86FeaturePSE: "pse", - X86FeatureTSC: "tsc", - X86FeatureMSR: "msr", - X86FeaturePAE: "pae", - X86FeatureMCE: "mce", - X86FeatureCX8: "cx8", - X86FeatureAPIC: "apic", - X86FeatureSEP: "sep", - X86FeatureMTRR: "mtrr", - X86FeaturePGE: "pge", - X86FeatureMCA: "mca", - X86FeatureCMOV: "cmov", - X86FeaturePAT: "pat", - X86FeaturePSE36: "pse36", - X86FeaturePSN: "pn", - X86FeatureCLFSH: "clflush", - X86FeatureDS: "dts", - X86FeatureACPI: "acpi", - X86FeatureMMX: "mmx", - X86FeatureFXSR: "fxsr", - X86FeatureSSE: "sse", - X86FeatureSSE2: "sse2", - X86FeatureSS: "ss", - X86FeatureHTT: "ht", - X86FeatureTM: "tm", - X86FeatureIA64: "ia64", - X86FeaturePBE: "pbe", - - // Block 2. - X86FeatureFSGSBase: "fsgsbase", - X86FeatureTSC_ADJUST: "tsc_adjust", - X86FeatureBMI1: "bmi1", - X86FeatureHLE: "hle", - X86FeatureAVX2: "avx2", - X86FeatureSMEP: "smep", - X86FeatureBMI2: "bmi2", - X86FeatureERMS: "erms", - X86FeatureINVPCID: "invpcid", - X86FeatureRTM: "rtm", - X86FeatureCQM: "cqm", - X86FeatureMPX: "mpx", - X86FeatureRDT: "rdt_a", - X86FeatureAVX512F: "avx512f", - X86FeatureAVX512DQ: "avx512dq", - X86FeatureRDSEED: "rdseed", - X86FeatureADX: "adx", - X86FeatureSMAP: "smap", - X86FeatureCLWB: "clwb", - X86FeatureAVX512PF: "avx512pf", - X86FeatureAVX512ER: "avx512er", - X86FeatureAVX512CD: "avx512cd", - X86FeatureSHA: "sha_ni", - X86FeatureAVX512BW: "avx512bw", - X86FeatureAVX512VL: "avx512vl", - - // Block 3. - X86FeatureAVX512VBMI: "avx512vbmi", - X86FeatureUMIP: "umip", - X86FeaturePKU: "pku", - X86FeatureOSPKE: "ospke", - X86FeatureWAITPKG: "waitpkg", - X86FeatureAVX512_VBMI2: "avx512_vbmi2", - X86FeatureGFNI: "gfni", - X86FeatureVAES: "vaes", - X86FeatureVPCLMULQDQ: "vpclmulqdq", - X86FeatureAVX512_VNNI: "avx512_vnni", - X86FeatureAVX512_BITALG: "avx512_bitalg", - X86FeatureTME: "tme", - X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq", - X86FeatureLA57: "la57", - X86FeatureRDPID: "rdpid", - X86FeatureCLDEMOTE: "cldemote", - X86FeatureMOVDIRI: "movdiri", - X86FeatureMOVDIR64B: "movdir64b", - - // Block 4. - X86FeatureXSAVEOPT: "xsaveopt", - X86FeatureXSAVEC: "xsavec", - X86FeatureXGETBV1: "xgetbv1", - X86FeatureXSAVES: "xsaves", - - // Block 5. - X86FeatureLAHF64: "lahf_lm", // LAHF/SAHF in long mode - X86FeatureCMP_LEGACY: "cmp_legacy", - X86FeatureSVM: "svm", - X86FeatureEXTAPIC: "extapic", - X86FeatureCR8_LEGACY: "cr8_legacy", - X86FeatureLZCNT: "abm", // Advanced bit manipulation - X86FeatureSSE4A: "sse4a", - X86FeatureMISALIGNSSE: "misalignsse", - X86FeaturePREFETCHW: "3dnowprefetch", - X86FeatureOSVW: "osvw", - X86FeatureIBS: "ibs", - X86FeatureXOP: "xop", - X86FeatureSKINIT: "skinit", - X86FeatureWDT: "wdt", - X86FeatureLWP: "lwp", - X86FeatureFMA4: "fma4", - X86FeatureTCE: "tce", - X86FeatureTBM: "tbm", - X86FeatureTOPOLOGY: "topoext", - X86FeaturePERFCTR_CORE: "perfctr_core", - X86FeaturePERFCTR_NB: "perfctr_nb", - X86FeatureBPEXT: "bpext", - X86FeaturePERFCTR_TSC: "ptsc", - X86FeaturePERFCTR_LLC: "perfctr_llc", - X86FeatureMWAITX: "mwaitx", - - // Block 6. - X86FeatureSYSCALL: "syscall", - X86FeatureNX: "nx", - X86FeatureMMXEXT: "mmxext", - X86FeatureFXSR_OPT: "fxsr_opt", - X86FeatureGBPAGES: "pdpe1gb", - X86FeatureRDTSCP: "rdtscp", - X86FeatureLM: "lm", - X86Feature3DNOWEXT: "3dnowext", - X86Feature3DNOW: "3dnow", -} - -// These flags are parse only---they can be used for setting / unsetting the -// flags, but will not get printed out in /proc/cpuinfo. -var x86FeatureParseOnlyStrings = map[Feature]string{ - // Block 0. - X86FeatureOSXSAVE: "osxsave", - - // Block 2. - X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only", - X86FeatureFPCSDS: "fpcsds", - X86FeatureIPT: "pt", - X86FeatureCLFLUSHOPT: "clfushopt", - - // Block 3. - X86FeaturePREFETCHWT1: "prefetchwt1", -} - -// intelCacheDescriptors describe the caches and TLBs on the system. They are -// returned in the registers for eax=2. Intel only. -type intelCacheDescriptor uint8 - -// Valid cache/TLB descriptors. All descriptors can be found in Intel SDM Vol. -// 2, Ch. 3.2, "CPUID", Table 3-12 "Encoding of CPUID Leaf 2 Descriptors". -const ( - intelNullDescriptor intelCacheDescriptor = 0 - intelNoTLBDescriptor intelCacheDescriptor = 0xfe - intelNoCacheDescriptor intelCacheDescriptor = 0xff - - // Most descriptors omitted for brevity as they are currently unused. -) - -// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4. -type CacheType uint8 - -const ( - // cacheNull indicates that there are no more entries. - cacheNull CacheType = iota - - // CacheData is a data cache. - CacheData - - // CacheInstruction is an instruction cache. - CacheInstruction - - // CacheUnified is a unified instruction and data cache. - CacheUnified -) - -// Cache describes the parameters of a single cache on the system. -// -// +stateify savable -type Cache struct { - // Level is the hierarchical level of this cache (L1, L2, etc). - Level uint32 - - // Type is the type of cache. - Type CacheType - - // FullyAssociative indicates that entries may be placed in any block. - FullyAssociative bool - - // Partitions is the number of physical partitions in the cache. - Partitions uint32 - - // Ways is the number of ways of associativity in the cache. - Ways uint32 - - // Sets is the number of sets in the cache. - Sets uint32 - - // InvalidateHierarchical indicates that WBINVD/INVD from threads - // sharing this cache acts upon lower level caches for threads sharing - // this cache. - InvalidateHierarchical bool - - // Inclusive indicates that this cache is inclusive of lower cache - // levels. - Inclusive bool - - // DirectMapped indicates that this cache is directly mapped from - // address, rather than using a hash function. - DirectMapped bool -} - -// Just a way to wrap cpuid function numbers. -type cpuidFunction uint32 - -// The constants below are the lower or "standard" cpuid functions, ordered as -// defined by the hardware. -const ( - vendorID cpuidFunction = iota // Returns vendor ID and largest standard function. - featureInfo // Returns basic feature bits and processor signature. - intelCacheDescriptors // Returns list of cache descriptors. Intel only. - intelSerialNumber // Returns processor serial number (obsolete on new hardware). Intel only. - intelDeterministicCacheParams // Returns deterministic cache information. Intel only. - monitorMwaitParams // Returns information about monitor/mwait instructions. - powerParams // Returns information about power management and thermal sensors. - extendedFeatureInfo // Returns extended feature bits. - _ // Function 0x8 is reserved. - intelDCAParams // Returns direct cache access information. Intel only. - intelPMCInfo // Returns information about performance monitoring features. Intel only. - intelX2APICInfo // Returns core/logical processor topology. Intel only. - _ // Function 0xc is reserved. - xSaveInfo // Returns information about extended state management. -) - -// The "extended" functions start at 0x80000000. -const ( - extendedFunctionInfo cpuidFunction = 0x80000000 + iota // Returns highest available extended function in eax. - extendedFeatures // Returns some extended feature bits in edx and ecx. -) - -// These are the extended floating point state features. They are used to -// enumerate floating point features in XCR0, XSTATE_BV, etc. -const ( - XSAVEFeatureX87 = 1 << 0 - XSAVEFeatureSSE = 1 << 1 - XSAVEFeatureAVX = 1 << 2 - XSAVEFeatureBNDREGS = 1 << 3 - XSAVEFeatureBNDCSR = 1 << 4 - XSAVEFeatureAVX512op = 1 << 5 - XSAVEFeatureAVX512zmm0 = 1 << 6 - XSAVEFeatureAVX512zmm16 = 1 << 7 - XSAVEFeaturePKRU = 1 << 9 -) - -var cpuFreqMHz float64 - -// x86FeaturesFromString includes features from x86FeatureStrings and -// x86FeatureParseOnlyStrings. -var x86FeaturesFromString = make(map[string]Feature) - -// FeatureFromString returns the Feature associated with the given feature -// string plus a bool to indicate if it could find the feature. -func FeatureFromString(s string) (Feature, bool) { - f, b := x86FeaturesFromString[s] - return f, b -} - -// String implements fmt.Stringer. -func (f Feature) String() string { - if s := f.flagString(false); s != "" { - return s - } - - block := int(f) / 32 - bit := int(f) % 32 - return fmt.Sprintf("", f, block, bit) -} - -func (f Feature) flagString(cpuinfoOnly bool) string { - if s, ok := x86FeatureStrings[f]; ok { - return s - } - if !cpuinfoOnly { - return x86FeatureParseOnlyStrings[f] - } - return "" -} - -// FeatureSet is a set of Features for a CPU. -// -// +stateify savable -type FeatureSet struct { - // Set is the set of features that are enabled in this FeatureSet. - Set map[Feature]bool - - // VendorID is the 12-char string returned in ebx:edx:ecx for eax=0. - VendorID string - - // ExtendedFamily is part of the processor signature. - ExtendedFamily uint8 - - // ExtendedModel is part of the processor signature. - ExtendedModel uint8 - - // ProcessorType is part of the processor signature. - ProcessorType uint8 - - // Family is part of the processor signature. - Family uint8 - - // Model is part of the processor signature. - Model uint8 - - // SteppingID is part of the processor signature. - SteppingID uint8 - - // Caches describes the caches on the CPU. - Caches []Cache - - // CacheLine is the size of a cache line in bytes. - // - // All caches use the same line size. This is not enforced in the CPUID - // encoding, but is true on all known x86 processors. - CacheLine uint32 -} - -// FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is -// equivalent to the "flags" field in /proc/cpuinfo. -func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string { - var s []string - for _, b := range linuxBlockOrder { - for i := 0; i < blockSize; i++ { - if f := featureID(b, i); fs.Set[f] { - if fstr := f.flagString(cpuinfoOnly); fstr != "" { - s = append(s, fstr) - } - } - } - } - return strings.Join(s, " ") -} - -// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is -// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are -// not always printed in Linux. The bogomips field is simply made up. -func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) { - fmt.Fprintf(b, "processor\t: %d\n", cpu) - fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID) - fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family) - fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model) - fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now. - fmt.Fprintf(b, "stepping\t: %s\n", "unknown") // Unknown for now. - fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz) - fmt.Fprintln(b, "fpu\t\t: yes") - fmt.Fprintln(b, "fpu_exception\t: yes") - fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID. - fmt.Fprintln(b, "wp\t\t: yes") - fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true)) - fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. - fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine) - fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine) - fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48) - fmt.Fprintln(b, "power management:") // This is always here, but can be blank. - fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline. -} - -const ( - amdVendorID = "AuthenticAMD" - intelVendorID = "GenuineIntel" -) - -// AMD returns true if fs describes an AMD CPU. -func (fs *FeatureSet) AMD() bool { - return fs.VendorID == amdVendorID -} - -// Intel returns true if fs describes an Intel CPU. -func (fs *FeatureSet) Intel() bool { - return fs.VendorID == intelVendorID -} - -// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a -// subset of the host feature set. -type ErrIncompatible struct { - message string -} - -// Error implements error. -func (e ErrIncompatible) Error() string { - return e.message -} - -// CheckHostCompatible returns nil if fs is a subset of the host feature set. -func (fs *FeatureSet) CheckHostCompatible() error { - hfs := HostFeatureSet() - - if diff := fs.Subtract(hfs); diff != nil { - return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)} - } - - // The size of a cache line must match, as it is critical to correctly - // utilizing CLFLUSH. Other cache properties are allowed to change, as - // they are not important to correctness. - if fs.CacheLine != hfs.CacheLine { - return ErrIncompatible{fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fs.CacheLine, hfs.CacheLine)} - } - - return nil -} - -// Helper to convert 3 regs into 12-byte vendor ID. -func vendorIDFromRegs(bx, cx, dx uint32) string { - bytes := make([]byte, 0, 12) - for i := uint(0); i < 4; i++ { - b := byte(bx >> (i * 8)) - bytes = append(bytes, b) - } - - for i := uint(0); i < 4; i++ { - b := byte(dx >> (i * 8)) - bytes = append(bytes, b) - } - - for i := uint(0); i < 4; i++ { - b := byte(cx >> (i * 8)) - bytes = append(bytes, b) - } - return string(bytes) -} - -// ExtendedStateSize returns the number of bytes needed to save the "extended -// state" for this processor and the boundary it must be aligned to. Extended -// state includes floating point registers, and other cpu state that's not -// associated with the normal task context. -// -// Note: We can save some space here with an optimization where we use a -// smaller chunk of memory depending on features that are actually enabled. -// Currently we just use the largest possible size for simplicity (which is -// about 2.5K worst case, with avx512). -func (fs *FeatureSet) ExtendedStateSize() (size, align uint) { - if fs.UseXsave() { - // Leaf 0 of xsaveinfo function returns the size for currently - // enabled xsave features in ebx, the maximum size if all valid - // features are saved with xsave in ecx, and valid XCR0 bits in - // edx:eax. - _, _, maxSize, _ := HostID(uint32(xSaveInfo), 0) - return uint(maxSize), 64 - } - - // If we don't support xsave, we fall back to fxsave, which requires - // 512 bytes aligned to 16 bytes. - return 512, 16 -} - -// ValidXCR0Mask returns the bits that may be set to 1 in control register -// XCR0. -func (fs *FeatureSet) ValidXCR0Mask() uint64 { - if !fs.UseXsave() { - return 0 - } - eax, _, _, edx := HostID(uint32(xSaveInfo), 0) - return uint64(edx)<<32 | uint64(eax) -} - -// vendorIDRegs returns the 3 register values used to construct the 12-byte -// vendor ID string for eax=0. -func (fs *FeatureSet) vendorIDRegs() (bx, dx, cx uint32) { - for i := uint(0); i < 4; i++ { - bx |= uint32(fs.VendorID[i]) << (i * 8) - } - - for i := uint(0); i < 4; i++ { - dx |= uint32(fs.VendorID[i+4]) << (i * 8) - } - - for i := uint(0); i < 4; i++ { - cx |= uint32(fs.VendorID[i+8]) << (i * 8) - } - return -} - -// signature returns the signature dword that's returned in eax when eax=1. -func (fs *FeatureSet) signature() uint32 { - var s uint32 - s |= uint32(fs.SteppingID & 0xf) - s |= uint32(fs.Model&0xf) << 4 - s |= uint32(fs.Family&0xf) << 8 - s |= uint32(fs.ProcessorType&0x3) << 12 - s |= uint32(fs.ExtendedModel&0xf) << 16 - s |= uint32(fs.ExtendedFamily&0xff) << 20 - return s -} - -// Helper to deconstruct signature dword. -func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) { - sid = uint8(v & 0xf) - m = uint8(v>>4) & 0xf - f = uint8(v>>8) & 0xf - pt = uint8(v>>12) & 0x3 - em = uint8(v>>16) & 0xf - ef = uint8(v >> 20) - return -} - -// Helper to convert blockwise feature bit masks into a set of features. Masks -// must be provided in order for each block, without skipping them. If a block -// does not matter for this feature set, 0 is specified. -func setFromBlockMasks(blocks ...uint32) map[Feature]bool { - s := make(map[Feature]bool) - for b, blockMask := range blocks { - for i := 0; i < blockSize; i++ { - if blockMask&1 != 0 { - s[featureID(block(b), i)] = true - } - blockMask >>= 1 - } - } - return s -} - -// blockMask returns the 32-bit mask associated with a block of features. -func (fs *FeatureSet) blockMask(b block) uint32 { - var mask uint32 - for i := 0; i < blockSize; i++ { - if fs.Set[featureID(b, i)] { - mask |= 1 << uint(i) - } - } - return mask -} - -// Remove removes a Feature from a FeatureSet. It ignores features -// that are not in the FeatureSet. -func (fs *FeatureSet) Remove(feature Feature) { - delete(fs.Set, feature) -} - -// Add adds a Feature to a FeatureSet. It ignores duplicate features. -func (fs *FeatureSet) Add(feature Feature) { - fs.Set[feature] = true -} - -// HasFeature tests whether or not a feature is in the given feature set. -func (fs *FeatureSet) HasFeature(feature Feature) bool { - return fs.Set[feature] -} - -// Subtract returns the features present in fs that are not present in other. -// If all features in fs are present in other, Subtract returns nil. -func (fs *FeatureSet) Subtract(other *FeatureSet) (diff map[Feature]bool) { - for f := range fs.Set { - if !other.Set[f] { - if diff == nil { - diff = make(map[Feature]bool) - } - diff[f] = true - } - } - - return -} - -// EmulateID emulates a cpuid instruction based on the feature set. -func (fs *FeatureSet) EmulateID(origAx, origCx uint32) (ax, bx, cx, dx uint32) { - switch cpuidFunction(origAx) { - case vendorID: - ax = uint32(xSaveInfo) // 0xd (xSaveInfo) is the highest function we support. - bx, dx, cx = fs.vendorIDRegs() - case featureInfo: - // CLFLUSH line size is encoded in quadwords. Other fields in bx unsupported. - bx = (fs.CacheLine / 8) << 8 - cx = fs.blockMask(block(0)) - dx = fs.blockMask(block(1)) - ax = fs.signature() - case intelCacheDescriptors: - if !fs.Intel() { - // Reserved on non-Intel. - return 0, 0, 0, 0 - } - - // "The least-significant byte in register EAX (register AL) - // will always return 01H. Software should ignore this value - // and not interpret it as an informational descriptor." - SDM - // - // We only support reporting cache parameters via - // intelDeterministicCacheParams; report as much here. - // - // We do not support exposing TLB information at all. - ax = 1 | (uint32(intelNoCacheDescriptor) << 8) - case intelDeterministicCacheParams: - if !fs.Intel() { - // Reserved on non-Intel. - return 0, 0, 0, 0 - } - - // cx is the index of the cache to describe. - if int(origCx) >= len(fs.Caches) { - return uint32(cacheNull), 0, 0, 0 - } - c := fs.Caches[origCx] - - ax = uint32(c.Type) - ax |= c.Level << 5 - ax |= 1 << 8 // Always claim the cache is "self-initializing". - if c.FullyAssociative { - ax |= 1 << 9 - } - // Processor topology not supported. - - bx = fs.CacheLine - 1 - bx |= (c.Partitions - 1) << 12 - bx |= (c.Ways - 1) << 22 - - cx = c.Sets - 1 - - if !c.InvalidateHierarchical { - dx |= 1 - } - if c.Inclusive { - dx |= 1 << 1 - } - if !c.DirectMapped { - dx |= 1 << 2 - } - case xSaveInfo: - if !fs.UseXsave() { - return 0, 0, 0, 0 - } - return HostID(uint32(xSaveInfo), origCx) - case extendedFeatureInfo: - if origCx != 0 { - break // Only leaf 0 is supported. - } - bx = fs.blockMask(block(2)) - cx = fs.blockMask(block(3)) - case extendedFunctionInfo: - // We only support showing the extended features. - ax = uint32(extendedFeatures) - cx = 0 - case extendedFeatures: - cx = fs.blockMask(block(5)) - dx = fs.blockMask(block(6)) - if fs.AMD() { - // AMD duplicates some block 1 features in block 6. - dx |= fs.blockMask(block(1)) & block6DuplicateMask - } - } - - return -} - -// UseXsave returns the choice of fp state saving instruction. -func (fs *FeatureSet) UseXsave() bool { - return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE) -} - -// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction. -func (fs *FeatureSet) UseXsaveopt() bool { - return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT) -} - -// HostID executes a native CPUID instruction. -func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32) - -// HostFeatureSet uses cpuid to get host values and construct a feature set -// that matches that of the host machine. Note that there are several places -// where there appear to be some unnecessary assignments between register names -// (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show -// where the different feature blocks come from, to make the code easier to -// inspect and read. -func HostFeatureSet() *FeatureSet { - // eax=0 gets max supported feature and vendor ID. - _, bx, cx, dx := HostID(0, 0) - vendorID := vendorIDFromRegs(bx, cx, dx) - - // eax=1 gets basic features in ecx:edx. - ax, bx, cx, dx := HostID(1, 0) - featureBlock0 := cx - featureBlock1 := dx - ef, em, pt, f, m, sid := signatureSplit(ax) - cacheLine := 8 * (bx >> 8) & 0xff - - // eax=4, ecx=i gets details about cache index i. Only supported on Intel. - var caches []Cache - if vendorID == intelVendorID { - // ecx selects the cache index until a null type is returned. - for i := uint32(0); ; i++ { - ax, bx, cx, dx := HostID(4, i) - t := CacheType(ax & 0xf) - if t == cacheNull { - break - } - - lineSize := (bx & 0xfff) + 1 - if lineSize != cacheLine { - panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine)) - } - - caches = append(caches, Cache{ - Type: t, - Level: (ax >> 5) & 0x7, - FullyAssociative: ((ax >> 9) & 1) == 1, - Partitions: ((bx >> 12) & 0x3ff) + 1, - Ways: ((bx >> 22) & 0x3ff) + 1, - Sets: cx + 1, - InvalidateHierarchical: (dx & 1) == 0, - Inclusive: ((dx >> 1) & 1) == 1, - DirectMapped: ((dx >> 2) & 1) == 0, - }) - } - } - - // eax=7, ecx=0 gets extended features in ecx:ebx. - _, bx, cx, _ = HostID(7, 0) - featureBlock2 := bx - featureBlock3 := cx - - // Leaf 0xd is supported only if CPUID.1:ECX.XSAVE[bit 26] is set. - var featureBlock4 uint32 - if (featureBlock0 & (1 << 26)) != 0 { - featureBlock4, _, _, _ = HostID(uint32(xSaveInfo), 1) - } - - // eax=0x80000000 gets supported extended levels. We use this to - // determine if there are any non-zero block 4 or block 6 bits to find. - var featureBlock5, featureBlock6 uint32 - if ax, _, _, _ := HostID(uint32(extendedFunctionInfo), 0); ax >= uint32(extendedFeatures) { - // eax=0x80000001 gets AMD added feature bits. - _, _, cx, dx = HostID(uint32(extendedFeatures), 0) - featureBlock5 = cx - // Ignore features duplicated from block 1 on AMD. These bits - // are reserved on Intel. - featureBlock6 = dx &^ block6DuplicateMask - } - - set := setFromBlockMasks(featureBlock0, featureBlock1, featureBlock2, featureBlock3, featureBlock4, featureBlock5, featureBlock6) - return &FeatureSet{ - Set: set, - VendorID: vendorID, - ExtendedFamily: ef, - ExtendedModel: em, - ProcessorType: pt, - Family: f, - Model: m, - SteppingID: sid, - CacheLine: cacheLine, - Caches: caches, - } -} - -// Reads max cpu frequency from host /proc/cpuinfo. Must run before -// whitelisting. This value is used to create the fake /proc/cpuinfo from a -// FeatureSet. -func initCPUFreq() { - cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo") - if err != nil { - // Leave it as 0... The standalone VDSO bails out in the same - // way. - log.Warningf("Could not read /proc/cpuinfo: %v", err) - return - } - cpuinfo := string(cpuinfob) - - // We get the value straight from host /proc/cpuinfo. On machines with - // frequency scaling enabled, this will only get the current value - // which will likely be inaccurate. This is fine on machines with - // frequency scaling disabled. - for _, line := range strings.Split(cpuinfo, "\n") { - if strings.Contains(line, "cpu MHz") { - splitMHz := strings.Split(line, ":") - if len(splitMHz) < 2 { - log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line") - return - } - - // If there was a problem, leave cpuFreqMHz as 0. - var err error - cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64) - if err != nil { - log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err) - cpuFreqMHz = 0 - return - } - return - } - } - log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz") -} - -func initFeaturesFromString() { - for f, s := range x86FeatureStrings { - x86FeaturesFromString[s] = f - } - for f, s := range x86FeatureParseOnlyStrings { - x86FeaturesFromString[s] = f - } -} - -func init() { - // initCpuFreq must be run before whitelists are enabled. - initCPUFreq() - initFeaturesFromString() -} +// On arm64, features are numbered according to the ELF HWCAP definition. +// arch/arm64/include/uapi/asm/hwcap.h +type Feature int diff --git a/pkg/cpuid/cpuid_arm64.go b/pkg/cpuid/cpuid_arm64.go new file mode 100644 index 000000000..6d71290c9 --- /dev/null +++ b/pkg/cpuid/cpuid_arm64.go @@ -0,0 +1,461 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package cpuid + +import ( + "bytes" + "encoding/binary" + "fmt" + "io/ioutil" + "strconv" + "strings" + + "gvisor.dev/gvisor/pkg/log" +) + +// ARM64 doesn't have a 'cpuid' equivalent, which means it have no architected +// discovery mechanism for hardware features available to userspace code at EL0. +// The kernel exposes the presence of these features to userspace through a set +// of flags(HWCAP/HWCAP2) bits, exposed in the auxilliary vector. +// Ref Documentation/arm64/elf_hwcaps.rst for more info. +// +// Currently, only the HWCAP bits are supported. + +const ( + // Single and double precision float point types. + ARM64FeatureFP Feature = iota + + // Advanced SIMD with single and double precision + // float point arithmetic. + ARM64FeatureASIMD + + // The generic timer is configured to generate + // events at a frequency of approximately 100KHz. + ARM64FeatureEVTSTRM + + // AES instructions(AESE/AESD/AESMC/AESIMC). + ARM64FeatureAES + + // AES instructions(PMULL/PMULL2). + ARM64FeaturePMULL + + // SHA1 instructions(SHA1C/SHA1P/SHA1M etc). + ARM64FeatureSHA1 + + // SHA2 instructions(SHA256H/SHA256H2/SHA256SU0 etc). + ARM64FeatureSHA2 + + // CRC32 instructions(CRC32B/CRC32H/CRC32W etc). + ARM64FeatureCRC32 + + // Atomic instructions(LDADD/LDCLR/LDEOR/LDSET etc). + ARM64FeatureATOMICS + + // Half precision float point arithmetic. + ARM64FeatureFPHP + + // ASIMD with half precision float point arithmetic. + ARM64FeatureASIMDHP + + // EL0 access to certain ID registers is available. + ARM64FeatureCPUID + + // SQRDMLAH and SQRDMLSH instructions implemented. + ARM64FeatureASIMDRDM + + // The FJCVTZS instruction is implemented. + ARM64FeatureJSCVT + + // The FCMLA and FCADD instructions are implemented. + ARM64FeatureFCMA + + // The LDAPRB/LDAPRH/LDAPR instructions are implemented. + ARM64FeatureLRCPC + + // DC instruction(DC CVAP) supported. + ARM64FeatureDCPOP + + // SHA3 instructions(EOR3/RAX1/XAR/BCAX) implemented. + ARM64FeatureSHA3 + + // SM3 instructions(SM3SS1/SM3TT1A/SM3TT1B) implemented. + ARM64FeatureSM3 + + // SM4 instructions(SM4E/SM4EKEY) implemented. + ARM64FeatureSM4 + + // Dot Product instructions(UDOT/SDOT) implemented. + ARM64FeatureASIMDDP + + // SHA2 instructions(SHA512H/SHA512H2/SHA512SU0) implemented. + ARM64FeatureSHA512 + + // Scalable Vector Extension implemented. + ARM64FeatureSVE + + // FMLAL and FMLSL instructions are implemented. + ARM64FeatureASIMDFHM +) + +// ELF auxiliary vector tags +const ( + _AT_NULL = 0 // End of vector + _AT_HWCAP = 16 // hardware capability bit vector + _AT_HWCAP2 = 26 // hardware capability bit vector 2 +) + +// These should not be changed after they are initialized. +var hwCap uint + +// To make emulation of /proc/cpuinfo easy, these names match the names of the +// basic features in Linux defined in arch/arm64/kernel/cpuinfo.c. +var arm64FeatureStrings = map[Feature]string{ + ARM64FeatureFP: "fp", + ARM64FeatureASIMD: "asimd", + ARM64FeatureEVTSTRM: "evtstrm", + ARM64FeatureAES: "aes", + ARM64FeaturePMULL: "pmull", + ARM64FeatureSHA1: "sha1", + ARM64FeatureSHA2: "sha2", + ARM64FeatureCRC32: "crc32", + ARM64FeatureATOMICS: "atomics", + ARM64FeatureFPHP: "fphp", + ARM64FeatureASIMDHP: "asimdhp", + ARM64FeatureCPUID: "cpuid", + ARM64FeatureASIMDRDM: "asimdrdm", + ARM64FeatureJSCVT: "jscvt", + ARM64FeatureFCMA: "fcma", + ARM64FeatureLRCPC: "lrcpc", + ARM64FeatureDCPOP: "dcpop", + ARM64FeatureSHA3: "sha3", + ARM64FeatureSM3: "sm3", + ARM64FeatureSM4: "sm4", + ARM64FeatureASIMDDP: "asimddp", + ARM64FeatureSHA512: "sha512", + ARM64FeatureSVE: "sve", + ARM64FeatureASIMDFHM: "asimdfhm", +} + +var ( + cpuFreqMHz float64 + cpuImplHex uint64 + cpuArchDec uint64 + cpuVarHex uint64 + cpuPartHex uint64 + cpuRevDec uint64 +) + +// arm64FeaturesFromString includes features from arm64FeatureStrings. +var arm64FeaturesFromString = make(map[string]Feature) + +// FeatureFromString returns the Feature associated with the given feature +// string plus a bool to indicate if it could find the feature. +func FeatureFromString(s string) (Feature, bool) { + f, b := arm64FeaturesFromString[s] + return f, b +} + +// String implements fmt.Stringer. +func (f Feature) String() string { + if s := f.flagString(); s != "" { + return s + } + + return fmt.Sprintf("", f) +} + +func (f Feature) flagString() string { + if s, ok := arm64FeatureStrings[f]; ok { + return s + } + + return "" +} + +// FeatureSet is a set of Features for a CPU. +// +// +stateify savable +type FeatureSet struct { + // Set is the set of features that are enabled in this FeatureSet. + Set map[Feature]bool + + // CPUImplementer is part of the processor signature. + CPUImplementer uint8 + + // CPUArchitecture is part of the processor signature. + CPUArchitecture uint8 + + // CPUVariant is part of the processor signature. + CPUVariant uint8 + + // CPUPartnum is part of the processor signature. + CPUPartnum uint16 + + // CPURevision is part of the processor signature. + CPURevision uint8 +} + +// CheckHostCompatible returns nil if fs is a subset of the host feature set. +// Noop on arm64. +func (fs *FeatureSet) CheckHostCompatible() error { + return nil +} + +// ExtendedStateSize returns the number of bytes needed to save the "extended +// state" for this processor and the boundary it must be aligned to. Extended +// state includes floating point(NEON) registers, and other cpu state that's not +// associated with the normal task context. +func (fs *FeatureSet) ExtendedStateSize() (size, align uint) { + // ARMv8 provide 32x128bits NEON registers. + // + // Ref arch/arm64/include/uapi/asm/ptrace.h + // struct user_fpsimd_state { + // __uint128_t vregs[32]; + // __u32 fpsr; + // __u32 fpcr; + // __u32 __reserved[2]; + // }; + return 528, 16 +} + +// HasFeature tests whether or not a feature is in the given feature set. +func (fs *FeatureSet) HasFeature(feature Feature) bool { + return fs.Set[feature] +} + +// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction. +// Noop on arm64. +func (fs *FeatureSet) UseXsave() bool { + return false +} + +// FlagsString prints out supported CPU "flags" field in /proc/cpuinfo. +func (fs *FeatureSet) FlagsString() string { + var s []string + for f, _ := range arm64FeatureStrings { + if fs.Set[f] { + if fstr := f.flagString(); fstr != "" { + s = append(s, fstr) + } + } + } + return strings.Join(s, " ") +} + +// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is +// a minimal /proc/cpuinfo, and the bogomips field is simply made up. +func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) { + fmt.Fprintf(b, "processor\t: %d\n", cpu) + fmt.Fprintf(b, "BogoMIPS\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. + fmt.Fprintf(b, "Features\t\t: %s\n", fs.FlagsString()) + fmt.Fprintf(b, "CPU implementer\t: 0x%x\n", cpuImplHex) + fmt.Fprintf(b, "CPU architecture\t: %d\n", cpuArchDec) + fmt.Fprintf(b, "CPU variant\t: 0x%x\n", cpuVarHex) + fmt.Fprintf(b, "CPU part\t: 0x%x\n", cpuPartHex) + fmt.Fprintf(b, "CPU revision\t: %d\n", cpuRevDec) + fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline. +} + +// HostFeatureSet uses hwCap to get host values and construct a feature set +// that matches that of the host machine. +func HostFeatureSet() *FeatureSet { + s := make(map[Feature]bool) + + for f, _ := range arm64FeatureStrings { + if hwCap&(1<