Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@ annotation. The annotation value selects the ingress mode:
In both managed modes, egress traffic from the backend pod is SNATed to the
LoadBalancer IP for source-IP preservation.

The optional `networking.cozystack.io/allowICMP: "true"` annotation, only
meaningful in port-filter mode (`wholeIP: "false"`), accepts ICMP traffic
toward the backend pod IP that would otherwise be dropped by the per-port
filter. Without it, all ICMP to a port-filtered pod is dropped — which also
blocks `ping`, **PMTU discovery** (ICMP "fragmentation needed"), and ICMP
unreachable signalling. Recommended for any service where path-MTU mismatches
or observability matter.

## Datapath

The nftables ruleset placed in table `ip cozy_proxy` consists of:
Expand All @@ -55,7 +63,9 @@ The nftables ruleset placed in table `ip cozy_proxy` consists of:
`(daddr, l4proto, dport)` is not in `allowed_ports`. The chain accepts
packets in conntrack states `established` or `related` first, so reply
packets of egress flows bypass the filter even when their dport is the VM's
ephemeral source port.
ephemeral source port. ICMP is dropped by default; if the
`allowICMP: "true"` annotation is set, the pod IP is added to
`icmp_allowed_pods` and ICMP toward it is accepted before the drop rule.

## Installation

Expand Down
133 changes: 79 additions & 54 deletions pkg/controllers/services_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,16 +235,7 @@ func (c *ServicesController) addServiceFunc(obj interface{}) {
podIP := ep.Subsets[0].Addresses[0].IP
// Ensure NAT mapping rules are set.
c.Proxy.EnsureRules(svcIP, podIP)
// Apply (or remove) port filtering depending on annotation value.
if wholeIPPassthrough(svc) {
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter", "svcIP", svcIP, "podIP", podIP)
}
} else {
if err := c.Proxy.EnsurePortFilter(svcIP, podIP, svc.Spec.Ports); err != nil {
log.Error(err, "failed to ensure port filter", "svcIP", svcIP, "podIP", podIP)
}
}
c.reconcilePortFilter(svc, svcIP, podIP, "on svc add")
}
}

Expand All @@ -267,9 +258,7 @@ func (c *ServicesController) deleteServiceFunc(obj interface{}) {

svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
podIP := se.Endpoint.Subsets[0].Addresses[0].IP
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter on svc deletion", "svcIP", svcIP, "podIP", podIP)
}
c.clearPortFilter(svcIP, podIP, "on svc deletion")
c.Proxy.DeleteRules(svcIP, podIP)
c.Services.Delete(svc.Namespace, svc.Name)
}
Expand All @@ -289,9 +278,7 @@ func (c *ServicesController) updateServiceFunc(oldObj, newObj interface{}) {
if hasValidServiceIP(se.Service) && hasValidEndpointIP(se.Endpoint) {
svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
podIP := se.Endpoint.Subsets[0].Addresses[0].IP
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter", "svcIP", svcIP, "podIP", podIP)
}
c.clearPortFilter(svcIP, podIP, "on annotation removal")
c.Proxy.DeleteRules(svcIP, podIP)
}
c.Services.Delete(svc.Namespace, svc.Name)
Expand All @@ -305,9 +292,7 @@ func (c *ServicesController) updateServiceFunc(oldObj, newObj interface{}) {
if hasValidServiceIP(se.Service) && hasValidEndpointIP(se.Endpoint) {
svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
podIP := se.Endpoint.Subsets[0].Addresses[0].IP
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter", "svcIP", svcIP, "podIP", podIP)
}
c.clearPortFilter(svcIP, podIP, "on svc IP loss")
c.Proxy.DeleteRules(svcIP, podIP)
}
c.Services.Delete(svc.Namespace, svc.Name)
Expand Down Expand Up @@ -348,15 +333,7 @@ func (c *ServicesController) updateServiceFunc(oldObj, newObj interface{}) {
svcIP := svc.Status.LoadBalancer.Ingress[0].IP
podIP := ep.Subsets[0].Addresses[0].IP
c.Proxy.EnsureRules(svcIP, podIP)
if wholeIPPassthrough(svc) {
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter", "svcIP", svcIP, "podIP", podIP)
}
} else {
if err := c.Proxy.EnsurePortFilter(svcIP, podIP, svc.Spec.Ports); err != nil {
log.Error(err, "failed to ensure port filter", "svcIP", svcIP, "podIP", podIP)
}
}
c.reconcilePortFilter(svc, svcIP, podIP, "on svc update")

// Update or add the service mapping with the new endpoint.
c.Services.Set(svc.Namespace, svc.Name, &ServiceEndpoints{Service: svc, Endpoint: ep})
Expand Down Expand Up @@ -386,16 +363,7 @@ func (c *ServicesController) addEndpointFunc(obj interface{}) {
svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
podIP := ep.Subsets[0].Addresses[0].IP
c.Proxy.EnsureRules(svcIP, podIP)
// Mirror the port-filter state to the new endpoint.
if wholeIPPassthrough(se.Service) {
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter on endpoint add", "svcIP", svcIP, "podIP", podIP)
}
} else {
if err := c.Proxy.EnsurePortFilter(svcIP, podIP, se.Service.Spec.Ports); err != nil {
log.Error(err, "failed to ensure port filter on endpoint add", "svcIP", svcIP, "podIP", podIP)
}
}
c.reconcilePortFilter(se.Service, svcIP, podIP, "on endpoint add")
}
}

Expand All @@ -417,9 +385,7 @@ func (c *ServicesController) deleteEndpointFunc(obj interface{}) {
}
svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
podIP := se.Endpoint.Subsets[0].Addresses[0].IP
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter on endpoint delete", "svcIP", svcIP, "podIP", podIP)
}
c.clearPortFilter(svcIP, podIP, "on endpoint delete")
c.Proxy.DeleteRules(svcIP, podIP)
// Set the endpoint to nil.
c.Services.SetEndpoint(ep.Namespace, ep.Name, nil)
Expand All @@ -442,9 +408,7 @@ func (c *ServicesController) updateEndpointFunc(oldObj, newObj interface{}) {
if hasValidServiceIP(se.Service) && hasValidEndpointIP(se.Endpoint) {
svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
oldPodIP := se.Endpoint.Subsets[0].Addresses[0].IP
if err := c.Proxy.DeletePortFilter(svcIP, oldPodIP); err != nil {
log.Error(err, "failed to delete port filter on endpoint invalidation", "svcIP", svcIP, "podIP", oldPodIP)
}
c.clearPortFilter(svcIP, oldPodIP, "on endpoint invalidation")
c.Proxy.DeleteRules(svcIP, oldPodIP)
}
c.Services.SetEndpoint(ep.Namespace, ep.Name, ep)
Expand All @@ -459,15 +423,7 @@ func (c *ServicesController) updateEndpointFunc(oldObj, newObj interface{}) {
svcIP := se.Service.Status.LoadBalancer.Ingress[0].IP
podIP := ep.Subsets[0].Addresses[0].IP
c.Proxy.EnsureRules(svcIP, podIP)
if wholeIPPassthrough(se.Service) {
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter on endpoint update", "svcIP", svcIP, "podIP", podIP)
}
} else {
if err := c.Proxy.EnsurePortFilter(svcIP, podIP, se.Service.Spec.Ports); err != nil {
log.Error(err, "failed to ensure port filter on endpoint update", "svcIP", svcIP, "podIP", podIP)
}
}
c.reconcilePortFilter(se.Service, svcIP, podIP, "on endpoint update")
c.Services.SetEndpoint(ep.Namespace, ep.Name, ep)
}

Expand Down Expand Up @@ -506,7 +462,10 @@ func hasValidEndpointIP(ep *v1.Endpoints) bool {
return ep.Subsets[0].Addresses[0].IP != ""
}

const wholeIPAnnotation = "networking.cozystack.io/wholeIP"
const (
wholeIPAnnotation = "networking.cozystack.io/wholeIP"
allowICMPAnnotation = "networking.cozystack.io/allowICMP"
)

// hasWholeIPAnnotation reports whether the service is opted into cozy-proxy
// management via the wholeIP annotation. Any value triggers management — the
Expand All @@ -530,6 +489,54 @@ func wholeIPPassthrough(svc *v1.Service) bool {
return svc.Annotations[wholeIPAnnotation] != "false"
}

// allowICMP reports whether ICMP traffic should bypass the port_filter drop
// rule for this service. Only meaningful in port-filter mode (wholeIP=false);
// in passthrough mode the port_filter drop rule does not apply at all.
func allowICMP(svc *v1.Service) bool {
if svc == nil || svc.Annotations == nil {
return false
}
return svc.Annotations[allowICMPAnnotation] == "true"
}

// reconcilePortFilter applies the port-filter and ICMP-allow state implied by
// the service's annotations. Call sites pass the resolved svcIP/podIP and a
// short context string that ends up in error logs.
func (c *ServicesController) reconcilePortFilter(svc *v1.Service, svcIP, podIP, ctx string) {
if wholeIPPassthrough(svc) {
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter "+ctx, "svcIP", svcIP, "podIP", podIP)
}
if err := c.Proxy.DeleteICMPAllow(svcIP, podIP); err != nil {
log.Error(err, "failed to delete ICMP allow "+ctx, "svcIP", svcIP, "podIP", podIP)
}
return
}
if err := c.Proxy.EnsurePortFilter(svcIP, podIP, svc.Spec.Ports); err != nil {
log.Error(err, "failed to ensure port filter "+ctx, "svcIP", svcIP, "podIP", podIP)
}
if allowICMP(svc) {
if err := c.Proxy.EnsureICMPAllow(svcIP, podIP); err != nil {
log.Error(err, "failed to ensure ICMP allow "+ctx, "svcIP", svcIP, "podIP", podIP)
}
} else {
if err := c.Proxy.DeleteICMPAllow(svcIP, podIP); err != nil {
log.Error(err, "failed to delete ICMP allow "+ctx, "svcIP", svcIP, "podIP", podIP)
}
}
}

// clearPortFilter unconditionally removes both port-filter and ICMP-allow
// state for (svcIP, podIP). Used by delete paths.
func (c *ServicesController) clearPortFilter(svcIP, podIP, ctx string) {
if err := c.Proxy.DeletePortFilter(svcIP, podIP); err != nil {
log.Error(err, "failed to delete port filter "+ctx, "svcIP", svcIP, "podIP", podIP)
}
if err := c.Proxy.DeleteICMPAllow(svcIP, podIP); err != nil {
log.Error(err, "failed to delete ICMP allow "+ctx, "svcIP", svcIP, "podIP", podIP)
}
}

// cleanupRemovedServices performs an initial cleanup for removed services.
func (c *ServicesController) cleanupRemovedServices() error {
keepMap := make(map[string]string)
Expand Down Expand Up @@ -577,5 +584,23 @@ func (c *ServicesController) cleanupRemovedServices() error {
if err := c.Proxy.CleanupPortFilters(keepFilters); err != nil {
return fmt.Errorf("failed to perform port-filter cleanup: %w", err)
}
// Build ICMP-allow snapshot: services in port-filter mode that opt into
// ICMP via the allowICMP annotation.
keepICMP := make(map[string]string)
for _, se := range allServices {
if se.Service == nil || se.Endpoint == nil {
continue
}
if !hasValidServiceIP(se.Service) || !hasValidEndpointIP(se.Endpoint) {
continue
}
if wholeIPPassthrough(se.Service) || !allowICMP(se.Service) {
continue
}
keepICMP[se.Service.Status.LoadBalancer.Ingress[0].IP] = se.Endpoint.Subsets[0].Addresses[0].IP
}
if err := c.Proxy.CleanupICMPAllow(keepICMP); err != nil {
return fmt.Errorf("failed to perform ICMP-allow cleanup: %w", err)
}
return nil
}
22 changes: 22 additions & 0 deletions pkg/controllers/services_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,25 @@ func TestWholeIPPassthrough(t *testing.T) {
})
}
}

func TestAllowICMP(t *testing.T) {
cases := []struct {
name string
svc *v1.Service
expect bool
}{
{"explicit true", svcWith(map[string]string{"networking.cozystack.io/allowICMP": "true"}), true},
{"explicit false", svcWith(map[string]string{"networking.cozystack.io/allowICMP": "false"}), false},
{"empty value", svcWith(map[string]string{"networking.cozystack.io/allowICMP": ""}), false},
{"absent annotation defaults to false", svcWith(map[string]string{}), false},
{"nil annotations defaults to false", &v1.Service{}, false},
{"unrelated annotation", svcWith(map[string]string{"networking.cozystack.io/wholeIP": "false"}), false},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
if got := allowICMP(c.svc); got != c.expect {
t.Errorf("allowICMP = %v, want %v", got, c.expect)
}
})
}
}
15 changes: 15 additions & 0 deletions pkg/proxy/dummy.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,20 @@ func (d *DummyProxyProcessor) CleanupPortFilters(keep map[string]PortFilterEntry
return nil
}

func (d *DummyProxyProcessor) EnsureICMPAllow(SvcIP, PodIP string) error {
fmt.Printf("EnsureICMPAllow called with SvcIP: %s, PodIP: %s\n", SvcIP, PodIP)
return nil
}

func (d *DummyProxyProcessor) DeleteICMPAllow(SvcIP, PodIP string) error {
fmt.Printf("DeleteICMPAllow called with SvcIP: %s, PodIP: %s\n", SvcIP, PodIP)
return nil
}

func (d *DummyProxyProcessor) CleanupICMPAllow(keep map[string]string) error {
fmt.Printf("CleanupICMPAllow called with %d entries\n", len(keep))
return nil
}

// Compile-time assertion that DummyProxyProcessor satisfies ProxyProcessor.
var _ ProxyProcessor = (*DummyProxyProcessor)(nil)
14 changes: 14 additions & 0 deletions pkg/proxy/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,20 @@ type ProxyProcessor interface {
// keepFilters. Any stale entries are removed. The PortFilterEntry struct
// carries both the pod IP (used as the actual nft key) and the ports.
CleanupPortFilters(keepFilters map[string]PortFilterEntry) error

// EnsureICMPAllow adds the pod IP to the ICMP allowlist consulted by the
// port_filter chain. With this in place, ICMP traffic to a pod that is
// otherwise port-filtered is accepted instead of dropped (preserves ping,
// PMTU discovery, ICMP unreachable signalling). Idempotent.
EnsureICMPAllow(SvcIP, PodIP string) error

// DeleteICMPAllow removes the pod IP from the ICMP allowlist. No-op if
// not present.
DeleteICMPAllow(SvcIP, PodIP string) error

// CleanupICMPAllow keeps only the entries listed in keepICMP (svcIP →
// podIP) in the ICMP allowlist; everything else is removed.
CleanupICMPAllow(keepICMP map[string]string) error
}

// PortFilterEntry describes a port-filter desired state in the controller's
Expand Down
Loading