Skip to content

Commit 9d05e90

Browse files
Merge pull request #130 from fasaxc/nodeport-fix
Fix return path of NodePort traffic.
2 parents e51f34e + 2cce7de commit 9d05e90

File tree

4 files changed

+411
-45
lines changed

4 files changed

+411
-45
lines changed

ipamd/introspect.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
log "github.com/cihub/seelog"
2424
"github.com/prometheus/client_golang/prometheus/promhttp"
2525

26+
"github.com/aws/amazon-vpc-cni-k8s/pkg/networkutils"
2627
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils"
2728
)
2829

@@ -68,8 +69,9 @@ func (c *IPAMContext) SetupHTTP() {
6869

6970
func (c *IPAMContext) setupServer() *http.Server {
7071
serverFunctions := map[string]func(w http.ResponseWriter, r *http.Request){
71-
"/v1/enis": eniV1RequestHandler(c),
72-
"/v1/pods": podV1RequestHandler(c),
72+
"/v1/enis": eniV1RequestHandler(c),
73+
"/v1/pods": podV1RequestHandler(c),
74+
"/v1/env-settings": envV1RequestHandler(c),
7375
}
7476
paths := make([]string, 0, len(serverFunctions))
7577
for path := range serverFunctions {
@@ -132,6 +134,18 @@ func podV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Requ
132134
}
133135
}
134136

137+
func envV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
138+
return func(w http.ResponseWriter, r *http.Request) {
139+
responseJSON, err := json.Marshal(networkutils.GetConfigForDebug())
140+
if err != nil {
141+
log.Error("Failed to marshal env var data: %v", err)
142+
http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
143+
return
144+
}
145+
w.Write(responseJSON)
146+
}
147+
}
148+
135149
func metricsHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
136150
return func(w http.ResponseWriter, r *http.Request) {
137151
promhttp.Handler()

pkg/networkutils/network.go

Lines changed: 217 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
package networkutils
1515

1616
import (
17+
"fmt"
18+
"io"
19+
"math"
1720
"net"
1821
"os"
1922
"strconv"
@@ -47,8 +50,26 @@ const (
4750

4851
// This environment is used to specify whether an external NAT gateway will be used to provide SNAT of
4952
// secondary ENI IP addresses. If set to "true", the SNAT iptables rule and off-VPC ip rule will not
50-
// be installed and will be removed if they are already installed.
53+
// be installed and will be removed if they are already installed. Defaults to false.
5154
envExternalSNAT = "AWS_VPC_K8S_CNI_EXTERNALSNAT"
55+
56+
// envNodePortSupport is the name of environment variable that configures whether we implement support for
57+
// NodePorts on the primary ENI. This requires that we add additional iptables rules and loosen the kernel's
58+
// RPF check as described below. Defaults to true.
59+
envNodePortSupport = "AWS_VPC_CNI_NODE_PORT_SUPPORT"
60+
61+
// envConnmark is the name of the environment variable that overrides the default connection mark, used to
62+
// mark traffic coming from the primary ENI so that return traffic can be forced out of the same interface.
63+
// Without using a mark, NodePort DNAT and our source-based routing do not work together if the target pod
64+
// behind the node port is not on the main ENI. In that case, the un-DNAT is done after the source-based
65+
// routing, resulting in the packet being sent out of the pod's ENI, when the NodePort traffic should be
66+
// sent over the main ENI.
67+
envConnmark = "AWS_VPC_K8S_CNI_CONNMARK"
68+
69+
// defaultConnmark is the default value for the connmark described above. Note: the mark space is a little crowded,
70+
// - kube-proxy uses 0x0000c000
71+
// - Calico uses 0xffff0000.
72+
defaultConnmark = 0x80
5273
)
5374

5475
// NetworkAPIs defines the host level and the eni level network related operations
@@ -60,14 +81,45 @@ type NetworkAPIs interface {
6081
}
6182

6283
type linuxNetwork struct {
63-
netLink netlinkwrapper.NetLink
64-
ns nswrapper.NS
84+
useExternalSNAT bool
85+
nodePortSupportEnabled bool
86+
connmark uint32
87+
88+
netLink netlinkwrapper.NetLink
89+
ns nswrapper.NS
90+
newIptables func() (iptablesIface, error)
91+
mainENIMark uint32
92+
openFile func(name string, flag int, perm os.FileMode) (stringWriteCloser, error)
93+
}
94+
95+
type iptablesIface interface {
96+
Exists(table, chain string, rulespec ...string) (bool, error)
97+
Append(table, chain string, rulespec ...string) error
98+
Delete(table, chain string, rulespec ...string) error
6599
}
66100

67101
// New creates a linuxNetwork object
68102
func New() NetworkAPIs {
69-
return &linuxNetwork{netLink: netlinkwrapper.NewNetLink(),
70-
ns: nswrapper.NewNS()}
103+
return &linuxNetwork{
104+
useExternalSNAT: useExternalSNAT(),
105+
nodePortSupportEnabled: nodePortSupportEnabled(),
106+
mainENIMark: getConnmark(),
107+
108+
netLink: netlinkwrapper.NewNetLink(),
109+
ns: nswrapper.NewNS(),
110+
newIptables: func() (iptablesIface, error) {
111+
ipt, err := iptables.New()
112+
return ipt, err
113+
},
114+
openFile: func(name string, flag int, perm os.FileMode) (stringWriteCloser, error) {
115+
return os.OpenFile(name, flag, perm)
116+
},
117+
}
118+
}
119+
120+
type stringWriteCloser interface {
121+
io.Closer
122+
WriteString(s string) (int, error)
71123
}
72124

73125
func isDuplicateRuleAdd(err error) bool {
@@ -76,85 +128,213 @@ func isDuplicateRuleAdd(err error) bool {
76128

77129
// SetupHostNetwork performs node level network configuration
78130
// TODO : implement ip rule not to 10.0.0.0/16(vpc'subnet) table main priority 1024
79-
func (os *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP) error {
80-
81-
externalSNAT := useExternalSNAT()
82-
hostRule := os.netLink.NewRule()
131+
func (n *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP) error {
132+
log.Info("Setting up host network")
133+
hostRule := n.netLink.NewRule()
83134
hostRule.Dst = vpcCIDR
84135
hostRule.Table = mainRoutingTable
85136
hostRule.Priority = hostRulePriority
86137
hostRule.Invert = true
87138

88139
// If this is a restart, cleanup previous rule first
89-
err := os.netLink.RuleDel(hostRule)
140+
err := n.netLink.RuleDel(hostRule)
90141
if err != nil && !containsNoSuchRule(err) {
91142
log.Errorf("Failed to cleanup old host IP rule: %v", err)
92143
return errors.Wrapf(err, "host network setup: failed to delete old host rule")
93144
}
94145

95146
// Only include the rule if SNAT is not being handled by an external NAT gateway and needs to be
96147
// handled on-node.
97-
if !externalSNAT {
98-
err = os.netLink.RuleAdd(hostRule)
148+
if !n.useExternalSNAT {
149+
err = n.netLink.RuleAdd(hostRule)
99150
if err != nil {
100151
log.Errorf("Failed to add host IP rule: %v", err)
101152
return errors.Wrapf(err, "host network setup: failed to add host rule")
102153
}
103154
}
104155

105-
ipt, err := iptables.New()
156+
if n.nodePortSupportEnabled {
157+
// If node port support is enabled, configure the kernel's reverse path filter check on eth0 for "loose"
158+
// filtering. This is required because
159+
// - NodePorts are exposed on eth0
160+
// - The kernel's RPF check happens after incoming packets to NodePorts are DNATted to the pod IP.
161+
// - For pods assigned to secondary ENIs, the routing table includes source-based routing. When the kernel does
162+
// the RPF check, it looks up the route using the pod IP as the source.
163+
// - Thus, it finds the source-based route that leaves via the secondary ENI.
164+
// - In "strict" mode, the RPF check fails because the return path uses a different interface to the incoming
165+
// packet. In "loose" mode, the check passes because some route was found.
166+
const eth0RPFilter = "/proc/sys/net/ipv4/conf/eth0/rp_filter"
167+
const rpFilterLoose = "2"
168+
err := n.setProcSys(eth0RPFilter, rpFilterLoose)
169+
if err != nil {
170+
return errors.Wrapf(err, "failed to configure eth0 RPF check")
171+
}
172+
}
106173

107-
if err != nil {
108-
return errors.Wrap(err, "host network setup: failed to create iptables")
174+
// If node port support is enabled, add a rule that will force force marked traffic out of the main ENI. We then
175+
// add iptables rules below that will mark traffic that needs this special treatment. In particular NodePort
176+
// traffic always comes in via the main ENI but response traffic would go out of the pod's assigned ENI if we
177+
// didn't handle it specially. This is because the routing decision is done before the NodePort's DNAT is
178+
// reversed so, to the routing table, it looks like the traffic is pod traffic instead of NodePort traffic.
179+
mainENIRule := n.netLink.NewRule()
180+
mainENIRule.Mark = int(n.mainENIMark)
181+
mainENIRule.Mask = int(n.mainENIMark)
182+
mainENIRule.Table = mainRoutingTable
183+
mainENIRule.Priority = hostRulePriority
184+
// If this is a restart, cleanup previous rule first
185+
err = n.netLink.RuleDel(mainENIRule)
186+
if err != nil && !containsNoSuchRule(err) {
187+
log.Errorf("Failed to cleanup old main ENI rule: %v", err)
188+
return errors.Wrapf(err, "host network setup: failed to delete old main ENI rule")
109189
}
110190

111-
natCmd := []string{"!", "-d", vpcCIDR.String(), "-m", "comment", "--comment", "AWS, SNAT",
112-
"-m", "addrtype", "!", "--dst-type", "LOCAL", "-j", "SNAT", "--to-source", primaryAddr.String()}
113-
exists, err := ipt.Exists("nat", "POSTROUTING", natCmd...)
191+
if n.nodePortSupportEnabled {
192+
err = n.netLink.RuleAdd(mainENIRule)
193+
if err != nil {
194+
log.Errorf("Failed to add host main ENI rule: %v", err)
195+
return errors.Wrapf(err, "host network setup: failed to add main ENI rule")
196+
}
197+
}
198+
199+
ipt, err := n.newIptables()
114200

115201
if err != nil {
116-
return errors.Wrapf(err, "host network setup: failed to add POSTROUTING rule for primary address %s", primaryAddr)
202+
return errors.Wrap(err, "host network setup: failed to create iptables")
117203
}
118204

119-
if !exists && !externalSNAT {
120-
// We are handling SNAT on-node, so include the iptables SNAT POSTROUTING rule.
121-
err = ipt.Append("nat", "POSTROUTING", natCmd...)
122-
205+
for _, rule := range []iptablesRule{
206+
{
207+
name: "connmark for primary ENI",
208+
shouldExist: n.nodePortSupportEnabled,
209+
table: "mangle",
210+
chain: "PREROUTING",
211+
rule: []string{
212+
"-m", "comment", "--comment", "AWS, primary ENI",
213+
"-i", "eth0",
214+
"-m", "addrtype", "--dst-type", "LOCAL", "--limit-iface-in",
215+
"-j", "CONNMARK", "--set-mark", fmt.Sprintf("%#x/%#x", n.mainENIMark, n.mainENIMark),
216+
},
217+
},
218+
{
219+
name: "connmark restore for primary ENI",
220+
shouldExist: n.nodePortSupportEnabled,
221+
table: "mangle",
222+
chain: "PREROUTING",
223+
rule: []string{
224+
"-m", "comment", "--comment", "AWS, primary ENI",
225+
"-i", "eni+", "-j", "CONNMARK", "--restore-mark", "--mask", fmt.Sprintf("%#x", n.mainENIMark),
226+
},
227+
},
228+
{
229+
name: fmt.Sprintf("rule for primary address %s", primaryAddr),
230+
shouldExist: !n.useExternalSNAT,
231+
table: "nat",
232+
chain: "POSTROUTING",
233+
rule: []string{
234+
"!", "-d", vpcCIDR.String(),
235+
"-m", "comment", "--comment", "AWS, SNAT",
236+
"-m", "addrtype", "!", "--dst-type", "LOCAL",
237+
"-j", "SNAT", "--to-source", primaryAddr.String()},
238+
},
239+
} {
240+
exists, err := ipt.Exists(rule.table, rule.chain, rule.rule...)
123241
if err != nil {
124-
return errors.Wrapf(err, "host network setup: failed to append POSTROUTING rule for primary address %s", primaryAddr)
242+
return errors.Wrapf(err, "host network setup: failed to check existence of %v", rule)
125243
}
126-
} else if exists && externalSNAT {
127-
// We are not handling SNAT on-node, so delete the existing iptables SNAT POSTROUTING rule.
128-
err = ipt.Delete("nat", "POSTROUTING", natCmd...)
129244

130-
if err != nil {
131-
return errors.Wrapf(err, "host network setup: failed to delete POSTROUTING rule for primary address %s", primaryAddr)
245+
if !exists && rule.shouldExist {
246+
err = ipt.Append(rule.table, rule.chain, rule.rule...)
247+
if err != nil {
248+
return errors.Wrapf(err, "host network setup: failed to add %v", rule)
249+
}
250+
} else if exists && !rule.shouldExist {
251+
err = ipt.Delete(rule.table, rule.chain, rule.rule...)
252+
if err != nil {
253+
return errors.Wrapf(err, "host network setup: failed to delete %v", rule)
254+
}
132255
}
133256
}
134257

135258
return nil
136259
}
137260

261+
func (n *linuxNetwork) setProcSys(key, value string) error {
262+
f, err := n.openFile(key, os.O_WRONLY, 0644)
263+
if err != nil {
264+
return err
265+
}
266+
defer f.Close()
267+
_, err = f.WriteString(value)
268+
if err != nil {
269+
return err
270+
}
271+
return nil
272+
}
273+
274+
type iptablesRule struct {
275+
name string
276+
shouldExist bool
277+
table, chain string
278+
rule []string
279+
}
280+
281+
func (r iptablesRule) String() string {
282+
return fmt.Sprintf("%s/%s rule %s", r.table, r.chain, r.name)
283+
}
284+
138285
func containsNoSuchRule(err error) bool {
139286
if errno, ok := err.(syscall.Errno); ok {
140287
return errno == syscall.ENOENT
141288
}
142289
return false
143290
}
144291

292+
// GetConfigForDebug returns the active values of the configuration env vars (for debugging purposes).
293+
func GetConfigForDebug() map[string]interface{} {
294+
return map[string]interface{}{
295+
envExternalSNAT: useExternalSNAT(),
296+
envNodePortSupport: nodePortSupportEnabled(),
297+
envConnmark: getConnmark(),
298+
}
299+
}
300+
145301
// useExternalSNAT returns whether SNAT of secondary ENI IPs should be handled with an external
146302
// NAT gateway rather than on node. Failure to parse the setting will result in a log and the
147303
// setting will be disabled.
148304
func useExternalSNAT() bool {
149-
if externalSNATStr := os.Getenv(envExternalSNAT); externalSNATStr != "" {
150-
externalSNAT, err := strconv.ParseBool(externalSNATStr)
305+
return getBoolEnvVar(envExternalSNAT, false)
306+
}
307+
308+
func nodePortSupportEnabled() bool {
309+
return getBoolEnvVar(envNodePortSupport, true)
310+
}
311+
312+
func getBoolEnvVar(name string, defaultValue bool) bool {
313+
if strValue := os.Getenv(name); strValue != "" {
314+
parsedValue, err := strconv.ParseBool(strValue)
151315
if err != nil {
152-
log.Error("Failed to parse "+envExternalSNAT, err.Error())
153-
return false
316+
log.Error("Failed to parse "+name+"; using default: "+fmt.Sprint(defaultValue), err.Error())
317+
return defaultValue
154318
}
155-
return externalSNAT
319+
return parsedValue
156320
}
157-
return false
321+
return defaultValue
322+
}
323+
324+
func getConnmark() uint32 {
325+
if connmark := os.Getenv(envConnmark); connmark != "" {
326+
mark, err := strconv.ParseInt(connmark, 0, 64)
327+
if err != nil {
328+
log.Error("Failed to parse "+envConnmark+"; will use ", defaultConnmark, err.Error())
329+
return defaultConnmark
330+
}
331+
if mark > math.MaxUint32 || mark <= 0 {
332+
log.Error(""+envConnmark+" out of range; will use ", defaultConnmark)
333+
return defaultConnmark
334+
}
335+
return uint32(mark)
336+
}
337+
return defaultConnmark
158338
}
159339

160340
// LinkByMac returns linux netlink based on interface MAC
@@ -177,8 +357,8 @@ func LinkByMac(mac string, netLink netlinkwrapper.NetLink) (netlink.Link, error)
177357
}
178358

179359
// SetupENINetwork adds default route to route table (eni-<eni_table>)
180-
func (os *linuxNetwork) SetupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string) error {
181-
return setupENINetwork(eniIP, eniMAC, eniTable, eniSubnetCIDR, os.netLink)
360+
func (n *linuxNetwork) SetupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string) error {
361+
return setupENINetwork(eniIP, eniMAC, eniTable, eniSubnetCIDR, n.netLink)
182362
}
183363

184364
func setupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string, netLink netlinkwrapper.NetLink) error {

0 commit comments

Comments
 (0)