14
14
package networkutils
15
15
16
16
import (
17
+ "fmt"
18
+ "io"
19
+ "math"
17
20
"net"
18
21
"os"
19
22
"strconv"
@@ -47,8 +50,26 @@ const (
47
50
48
51
// This environment variable is used to specify whether an external NAT gateway will be used to provide SNAT of
49
52
// secondary ENI IP addresses. If set to "true", the SNAT iptables rule and off-VPC ip rule will not
50
- // be installed and will be removed if they are already installed.
53
+ // be installed and will be removed if they are already installed. Defaults to false.
51
54
envExternalSNAT = "AWS_VPC_K8S_CNI_EXTERNALSNAT"
55
+
56
+ // envNodePortSupport is the name of the environment variable that configures whether we implement support for
57
+ // NodePorts on the primary ENI. This requires that we add additional iptables rules and loosen the kernel's
58
+ // RPF check as described below. Defaults to true.
59
+ envNodePortSupport = "AWS_VPC_CNI_NODE_PORT_SUPPORT"
60
+
61
+ // envConnmark is the name of the environment variable that overrides the default connection mark, used to
62
+ // mark traffic coming from the primary ENI so that return traffic can be forced out of the same interface.
63
+ // Without using a mark, NodePort DNAT and our source-based routing do not work together if the target pod
64
+ // behind the node port is not on the main ENI. In that case, the un-DNAT is done after the source-based
65
+ // routing, resulting in the packet being sent out of the pod's ENI, when the NodePort traffic should be
66
+ // sent over the main ENI.
67
+ envConnmark = "AWS_VPC_K8S_CNI_CONNMARK"
68
+
69
+ // defaultConnmark is the default value for the connmark described above. Note: the mark space is a little crowded,
70
+ // - kube-proxy uses 0x0000c000
71
+ // - Calico uses 0xffff0000.
72
+ defaultConnmark = 0x80
52
73
)
53
74
54
75
// NetworkAPIs defines the host level and the eni level network related operations
@@ -60,14 +81,45 @@ type NetworkAPIs interface {
60
81
}
61
82
62
83
type linuxNetwork struct {
63
- netLink netlinkwrapper.NetLink
64
- ns nswrapper.NS
84
+ useExternalSNAT bool
85
+ nodePortSupportEnabled bool
86
+ connmark uint32
87
+
88
+ netLink netlinkwrapper.NetLink
89
+ ns nswrapper.NS
90
+ newIptables func () (iptablesIface , error )
91
+ mainENIMark uint32
92
+ openFile func (name string , flag int , perm os.FileMode ) (stringWriteCloser , error )
93
+ }
94
+
95
// iptablesIface is the subset of iptables operations that SetupHostNetwork
// relies on. Each method addresses one rule, identified by its table, chain
// and rule specification.
type iptablesIface interface {
	// Exists reports whether the rule is present in the given table and chain.
	Exists(table, chain string, rulespec ...string) (bool, error)
	// Append adds the rule to the given table and chain.
	Append(table, chain string, rulespec ...string) error
	// Delete removes the rule from the given table and chain.
	Delete(table, chain string, rulespec ...string) error
}
66
100
67
101
// New creates a linuxNetwork object
68
102
func New () NetworkAPIs {
69
- return & linuxNetwork {netLink : netlinkwrapper .NewNetLink (),
70
- ns : nswrapper .NewNS ()}
103
+ return & linuxNetwork {
104
+ useExternalSNAT : useExternalSNAT (),
105
+ nodePortSupportEnabled : nodePortSupportEnabled (),
106
+ mainENIMark : getConnmark (),
107
+
108
+ netLink : netlinkwrapper .NewNetLink (),
109
+ ns : nswrapper .NewNS (),
110
+ newIptables : func () (iptablesIface , error ) {
111
+ ipt , err := iptables .New ()
112
+ return ipt , err
113
+ },
114
+ openFile : func (name string , flag int , perm os.FileMode ) (stringWriteCloser , error ) {
115
+ return os .OpenFile (name , flag , perm )
116
+ },
117
+ }
118
+ }
119
+
120
// stringWriteCloser is the minimal file-like interface needed by setProcSys:
// something a string can be written to and that can then be closed. *os.File
// satisfies it.
type stringWriteCloser interface {
	io.Closer
	WriteString(s string) (int, error)
}
72
124
73
125
func isDuplicateRuleAdd (err error ) bool {
@@ -76,85 +128,213 @@ func isDuplicateRuleAdd(err error) bool {
76
128
77
129
// SetupHostNetwork performs node level network configuration
78
130
// TODO : implement ip rule not to 10.0.0.0/16(vpc'subnet) table main priority 1024
79
- func (os * linuxNetwork ) SetupHostNetwork (vpcCIDR * net.IPNet , primaryAddr * net.IP ) error {
80
-
81
- externalSNAT := useExternalSNAT ()
82
- hostRule := os .netLink .NewRule ()
131
+ func (n * linuxNetwork ) SetupHostNetwork (vpcCIDR * net.IPNet , primaryAddr * net.IP ) error {
132
+ log .Info ("Setting up host network" )
133
+ hostRule := n .netLink .NewRule ()
83
134
hostRule .Dst = vpcCIDR
84
135
hostRule .Table = mainRoutingTable
85
136
hostRule .Priority = hostRulePriority
86
137
hostRule .Invert = true
87
138
88
139
// If this is a restart, cleanup previous rule first
89
- err := os .netLink .RuleDel (hostRule )
140
+ err := n .netLink .RuleDel (hostRule )
90
141
if err != nil && ! containsNoSuchRule (err ) {
91
142
log .Errorf ("Failed to cleanup old host IP rule: %v" , err )
92
143
return errors .Wrapf (err , "host network setup: failed to delete old host rule" )
93
144
}
94
145
95
146
// Only include the rule if SNAT is not being handled by an external NAT gateway and needs to be
96
147
// handled on-node.
97
- if ! externalSNAT {
98
- err = os .netLink .RuleAdd (hostRule )
148
+ if ! n . useExternalSNAT {
149
+ err = n .netLink .RuleAdd (hostRule )
99
150
if err != nil {
100
151
log .Errorf ("Failed to add host IP rule: %v" , err )
101
152
return errors .Wrapf (err , "host network setup: failed to add host rule" )
102
153
}
103
154
}
104
155
105
- ipt , err := iptables .New ()
156
+ if n .nodePortSupportEnabled {
157
+ // If node port support is enabled, configure the kernel's reverse path filter check on eth0 for "loose"
158
+ // filtering. This is required because
159
+ // - NodePorts are exposed on eth0
160
+ // - The kernel's RPF check happens after incoming packets to NodePorts are DNATted to the pod IP.
161
+ // - For pods assigned to secondary ENIs, the routing table includes source-based routing. When the kernel does
162
+ // the RPF check, it looks up the route using the pod IP as the source.
163
+ // - Thus, it finds the source-based route that leaves via the secondary ENI.
164
+ // - In "strict" mode, the RPF check fails because the return path uses a different interface to the incoming
165
+ // packet. In "loose" mode, the check passes because some route was found.
166
+ const eth0RPFilter = "/proc/sys/net/ipv4/conf/eth0/rp_filter"
167
+ const rpFilterLoose = "2"
168
+ err := n .setProcSys (eth0RPFilter , rpFilterLoose )
169
+ if err != nil {
170
+ return errors .Wrapf (err , "failed to configure eth0 RPF check" )
171
+ }
172
+ }
106
173
107
- if err != nil {
108
- return errors .Wrap (err , "host network setup: failed to create iptables" )
174
+ // If node port support is enabled, add a rule that will force force marked traffic out of the main ENI. We then
175
+ // add iptables rules below that will mark traffic that needs this special treatment. In particular NodePort
176
+ // traffic always comes in via the main ENI but response traffic would go out of the pod's assigned ENI if we
177
+ // didn't handle it specially. This is because the routing decision is done before the NodePort's DNAT is
178
+ // reversed so, to the routing table, it looks like the traffic is pod traffic instead of NodePort traffic.
179
+ mainENIRule := n .netLink .NewRule ()
180
+ mainENIRule .Mark = int (n .mainENIMark )
181
+ mainENIRule .Mask = int (n .mainENIMark )
182
+ mainENIRule .Table = mainRoutingTable
183
+ mainENIRule .Priority = hostRulePriority
184
+ // If this is a restart, cleanup previous rule first
185
+ err = n .netLink .RuleDel (mainENIRule )
186
+ if err != nil && ! containsNoSuchRule (err ) {
187
+ log .Errorf ("Failed to cleanup old main ENI rule: %v" , err )
188
+ return errors .Wrapf (err , "host network setup: failed to delete old main ENI rule" )
109
189
}
110
190
111
- natCmd := []string {"!" , "-d" , vpcCIDR .String (), "-m" , "comment" , "--comment" , "AWS, SNAT" ,
112
- "-m" , "addrtype" , "!" , "--dst-type" , "LOCAL" , "-j" , "SNAT" , "--to-source" , primaryAddr .String ()}
113
- exists , err := ipt .Exists ("nat" , "POSTROUTING" , natCmd ... )
191
+ if n .nodePortSupportEnabled {
192
+ err = n .netLink .RuleAdd (mainENIRule )
193
+ if err != nil {
194
+ log .Errorf ("Failed to add host main ENI rule: %v" , err )
195
+ return errors .Wrapf (err , "host network setup: failed to add main ENI rule" )
196
+ }
197
+ }
198
+
199
+ ipt , err := n .newIptables ()
114
200
115
201
if err != nil {
116
- return errors .Wrapf (err , "host network setup: failed to add POSTROUTING rule for primary address %s" , primaryAddr )
202
+ return errors .Wrap (err , "host network setup: failed to create iptables" )
117
203
}
118
204
119
- if ! exists && ! externalSNAT {
120
- // We are handling SNAT on-node, so include the iptables SNAT POSTROUTING rule.
121
- err = ipt .Append ("nat" , "POSTROUTING" , natCmd ... )
122
-
205
+ for _ , rule := range []iptablesRule {
206
+ {
207
+ name : "connmark for primary ENI" ,
208
+ shouldExist : n .nodePortSupportEnabled ,
209
+ table : "mangle" ,
210
+ chain : "PREROUTING" ,
211
+ rule : []string {
212
+ "-m" , "comment" , "--comment" , "AWS, primary ENI" ,
213
+ "-i" , "eth0" ,
214
+ "-m" , "addrtype" , "--dst-type" , "LOCAL" , "--limit-iface-in" ,
215
+ "-j" , "CONNMARK" , "--set-mark" , fmt .Sprintf ("%#x/%#x" , n .mainENIMark , n .mainENIMark ),
216
+ },
217
+ },
218
+ {
219
+ name : "connmark restore for primary ENI" ,
220
+ shouldExist : n .nodePortSupportEnabled ,
221
+ table : "mangle" ,
222
+ chain : "PREROUTING" ,
223
+ rule : []string {
224
+ "-m" , "comment" , "--comment" , "AWS, primary ENI" ,
225
+ "-i" , "eni+" , "-j" , "CONNMARK" , "--restore-mark" , "--mask" , fmt .Sprintf ("%#x" , n .mainENIMark ),
226
+ },
227
+ },
228
+ {
229
+ name : fmt .Sprintf ("rule for primary address %s" , primaryAddr ),
230
+ shouldExist : ! n .useExternalSNAT ,
231
+ table : "nat" ,
232
+ chain : "POSTROUTING" ,
233
+ rule : []string {
234
+ "!" , "-d" , vpcCIDR .String (),
235
+ "-m" , "comment" , "--comment" , "AWS, SNAT" ,
236
+ "-m" , "addrtype" , "!" , "--dst-type" , "LOCAL" ,
237
+ "-j" , "SNAT" , "--to-source" , primaryAddr .String ()},
238
+ },
239
+ } {
240
+ exists , err := ipt .Exists (rule .table , rule .chain , rule .rule ... )
123
241
if err != nil {
124
- return errors .Wrapf (err , "host network setup: failed to append POSTROUTING rule for primary address %s " , primaryAddr )
242
+ return errors .Wrapf (err , "host network setup: failed to check existence of %v " , rule )
125
243
}
126
- } else if exists && externalSNAT {
127
- // We are not handling SNAT on-node, so delete the existing iptables SNAT POSTROUTING rule.
128
- err = ipt .Delete ("nat" , "POSTROUTING" , natCmd ... )
129
244
130
- if err != nil {
131
- return errors .Wrapf (err , "host network setup: failed to delete POSTROUTING rule for primary address %s" , primaryAddr )
245
+ if ! exists && rule .shouldExist {
246
+ err = ipt .Append (rule .table , rule .chain , rule .rule ... )
247
+ if err != nil {
248
+ return errors .Wrapf (err , "host network setup: failed to add %v" , rule )
249
+ }
250
+ } else if exists && ! rule .shouldExist {
251
+ err = ipt .Delete (rule .table , rule .chain , rule .rule ... )
252
+ if err != nil {
253
+ return errors .Wrapf (err , "host network setup: failed to delete %v" , rule )
254
+ }
132
255
}
133
256
}
134
257
135
258
return nil
136
259
}
137
260
261
+ func (n * linuxNetwork ) setProcSys (key , value string ) error {
262
+ f , err := n .openFile (key , os .O_WRONLY , 0644 )
263
+ if err != nil {
264
+ return err
265
+ }
266
+ defer f .Close ()
267
+ _ , err = f .WriteString (value )
268
+ if err != nil {
269
+ return err
270
+ }
271
+ return nil
272
+ }
273
+
274
// iptablesRule describes one iptables rule managed by SetupHostNetwork together
// with its desired state.
type iptablesRule struct {
	// name is a human-readable label used in error messages.
	name string
	// shouldExist is the desired state: true if the rule must be present,
	// false if it must be absent.
	shouldExist bool
	// table and chain locate the rule, e.g. "nat" and "POSTROUTING".
	table, chain string
	// rule is the rule specification passed to iptables.
	rule []string
}

// String renders the rule as "<table>/<chain> rule <name>" for use in log and
// error messages.
func (r iptablesRule) String() string {
	return r.table + "/" + r.chain + " rule " + r.name
}
284
+
138
285
// containsNoSuchRule reports whether err is the ENOENT errno. SetupHostNetwork
// uses it to ignore a rule deletion that failed with ENOENT. Only a bare
// syscall.Errno is recognised; any other error type, including nil, yields false.
func containsNoSuchRule(err error) bool {
	errno, ok := err.(syscall.Errno)
	return ok && errno == syscall.ENOENT
}
144
291
292
+ // GetConfigForDebug returns the active values of the configuration env vars (for debugging purposes).
293
+ func GetConfigForDebug () map [string ]interface {} {
294
+ return map [string ]interface {}{
295
+ envExternalSNAT : useExternalSNAT (),
296
+ envNodePortSupport : nodePortSupportEnabled (),
297
+ envConnmark : getConnmark (),
298
+ }
299
+ }
300
+
145
301
// useExternalSNAT returns whether SNAT of secondary ENI IPs should be handled with an external
146
302
// NAT gateway rather than on node. Failure to parse the setting will result in a log and the
147
303
// setting will be disabled.
148
304
func useExternalSNAT () bool {
149
- if externalSNATStr := os .Getenv (envExternalSNAT ); externalSNATStr != "" {
150
- externalSNAT , err := strconv .ParseBool (externalSNATStr )
305
+ return getBoolEnvVar (envExternalSNAT , false )
306
+ }
307
+
308
+ func nodePortSupportEnabled () bool {
309
+ return getBoolEnvVar (envNodePortSupport , true )
310
+ }
311
+
312
+ func getBoolEnvVar (name string , defaultValue bool ) bool {
313
+ if strValue := os .Getenv (name ); strValue != "" {
314
+ parsedValue , err := strconv .ParseBool (strValue )
151
315
if err != nil {
152
- log .Error ("Failed to parse " + envExternalSNAT , err .Error ())
153
- return false
316
+ log .Error ("Failed to parse " + name + "; using default: " + fmt . Sprint ( defaultValue ) , err .Error ())
317
+ return defaultValue
154
318
}
155
- return externalSNAT
319
+ return parsedValue
156
320
}
157
- return false
321
+ return defaultValue
322
+ }
323
+
324
+ func getConnmark () uint32 {
325
+ if connmark := os .Getenv (envConnmark ); connmark != "" {
326
+ mark , err := strconv .ParseInt (connmark , 0 , 64 )
327
+ if err != nil {
328
+ log .Error ("Failed to parse " + envConnmark + "; will use " , defaultConnmark , err .Error ())
329
+ return defaultConnmark
330
+ }
331
+ if mark > math .MaxUint32 || mark <= 0 {
332
+ log .Error ("" + envConnmark + " out of range; will use " , defaultConnmark )
333
+ return defaultConnmark
334
+ }
335
+ return uint32 (mark )
336
+ }
337
+ return defaultConnmark
158
338
}
159
339
160
340
// LinkByMac returns linux netlink based on interface MAC
@@ -177,8 +357,8 @@ func LinkByMac(mac string, netLink netlinkwrapper.NetLink) (netlink.Link, error)
177
357
}
178
358
179
359
// SetupENINetwork adds default route to route table (eni-<eni_table>)
180
- func (os * linuxNetwork ) SetupENINetwork (eniIP string , eniMAC string , eniTable int , eniSubnetCIDR string ) error {
181
- return setupENINetwork (eniIP , eniMAC , eniTable , eniSubnetCIDR , os .netLink )
360
+ func (n * linuxNetwork ) SetupENINetwork (eniIP string , eniMAC string , eniTable int , eniSubnetCIDR string ) error {
361
+ return setupENINetwork (eniIP , eniMAC , eniTable , eniSubnetCIDR , n .netLink )
182
362
}
183
363
184
364
func setupENINetwork (eniIP string , eniMAC string , eniTable int , eniSubnetCIDR string , netLink netlinkwrapper.NetLink ) error {
0 commit comments