From 7b977a09c038fe4ae1db3fbe07c672ecfb709c66 Mon Sep 17 00:00:00 2001 From: Peter Sprygada Date: Fri, 1 May 2026 12:15:06 -0400 Subject: [PATCH 1/2] feat: BGP control plan architecture This commit provides the first draft of the proposed BGP control plane for Galactic VPC. It includes the recommendation for a tiered route reflector hierarchy and defines how worker nodes will connect to the infrastructure. --- .../networking/bgp-control-plane/README.md | 315 ++++++++++++++++++ .../bgp-control-plane/bgp-context.puml | 41 +++ .../bgp-control-plane/bgp-context.svg | 82 +++++ .../bgp-full-architecture.puml | 101 ++++++ .../bgp-full-architecture.svg | 159 +++++++++ .../bgp-worker-component.puml | 25 ++ .../bgp-worker-component.svg | 57 ++++ 7 files changed, 780 insertions(+) create mode 100644 enhancements/networking/bgp-control-plane/README.md create mode 100644 enhancements/networking/bgp-control-plane/bgp-context.puml create mode 100644 enhancements/networking/bgp-control-plane/bgp-context.svg create mode 100644 enhancements/networking/bgp-control-plane/bgp-full-architecture.puml create mode 100644 enhancements/networking/bgp-control-plane/bgp-full-architecture.svg create mode 100644 enhancements/networking/bgp-control-plane/bgp-worker-component.puml create mode 100644 enhancements/networking/bgp-control-plane/bgp-worker-component.svg diff --git a/enhancements/networking/bgp-control-plane/README.md b/enhancements/networking/bgp-control-plane/README.md new file mode 100644 index 0000000..dcd44e7 --- /dev/null +++ b/enhancements/networking/bgp-control-plane/README.md @@ -0,0 +1,315 @@ +# BGP control plane design — Galactic VPC + +## Overview + +This document describes the BGP control plane architecture for Datum's Galactic VPC +fabric. The fabric spans 16 points of presence (PoPs) across three geographic regions +and uses a two-tier hierarchical route reflector (RR) model to distribute routing +information at scale. 
+ +BGP is the single control plane protocol — no IGP runs in the underlay. SRv6 +(RFC 8986) is the committed data plane. Every design decision in this document is +made with that constraint as a hard given. + +--- + +## Design goals + +- Full control plane reachability across all 16 PoPs via a single protocol (BGP) +- Regional forwarding survives total loss of the global RR tier +- No PoP carries more than 2 RR client sessions +- No single point of failure at any tier +- Clean separation of intra-region and inter-region route reflection +- galactic-operator owns GoBGP lifecycle end-to-end; no out-of-band config + +**Non-goals:** + +- IGP in the underlay — BGP handles locator advertisement and underlay reachability +- MPLS fallback — SRv6 is the commitment, not a preference +- Stretched L2 between PoPs + +--- + +## PoP inventory + +| Region | POP ID | City | Notes | +|--------------|--------------|---------------|--------------------------| +| Americas | us-east-1 | Ashburn | Migrating to servers.com | +| Americas | us-east-2 | New York City | Global RR site | +| Americas | us-central-1 | Dallas | Migrating to servers.com | +| Americas | us-west-1 | San Jose | Migrating to servers.com | +| Americas | ca-east-1 | Toronto | | +| Americas | br-east-1 | São Paulo | | +| Americas | cl-central-1 | Santiago | | +| EMEA | de-central-1 | Frankfurt | Global RR site | +| EMEA | gb-south-1 | London | | +| EMEA | nl-west-1 | Amsterdam | | +| EMEA | ae-north-1 | Dubai | | +| EMEA | za-central-1 | Johannesburg | | +| Asia-Pacific | sg-central-1 | Singapore | Regional RR site | +| Asia-Pacific | jp-east-1 | Tokyo | | +| Asia-Pacific | au-east-1 | Sydney | | +| Asia-Pacific | in-west-1 | Mumbai | | + +--- + +## Architecture + +### Two-tier route reflector hierarchy + +The control plane uses a two-tier hierarchy. This eliminates the O(n²) iBGP full-mesh +problem while ensuring regional forwarding is fully independent of the global tier. 
+ +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Tier 0 — Global RRs │ +│ │ +│ us-east-2 (NYC) ◄────── iBGP full mesh ──────► de-central-1 (FRA) │ +└─────────────────────────────────────────────────────────────────────┘ + ▲ ▲ ▲ ▲ ▲ ▲ + │ │ │ │ │ │ +┌─────────┴──┴──┐ ┌────────┴──┴──┐ ┌───────┴──┴───┐ +│ Regional RR │ │ Regional RR │ │ Regional RR │ +│ Americas │ │ EMEA │ │ APAC │ +│ (NYC anchor) │ │ (FRA anchor) │ │ (SIN anchor) │ +└───────┬───────┘ └──────┬───────┘ └──────┬───────┘ + │ iBGP │ iBGP │ iBGP + ┌────┴────┐ ┌────┴────┐ ┌────┴────┐ + │ Workers │ │ Workers │ │ Workers │ + │ GoBGP │ │ GoBGP │ │ GoBGP │ + └─────────┘ └─────────┘ └─────────┘ +``` + +**Tier 0 — Global RRs (2 nodes)** + +Two global RRs deployed as an iBGP full-mesh pair: + +| Node | Location | Rationale | +|--------------|---------------|----------------------------------------------------| +| us-east-2 | New York City | Existing PoP; best latency to Americas and EMEA | +| de-central-1 | Frankfurt | Existing PoP; best latency spread to EMEA and APAC | + +The global RRs reflect inter-regional reachability between the three regional +clusters. They do not carry intra-region routes; those are handled entirely within +each regional cluster. No new sites are required — both are existing deployed PoPs. + +**Tier 1 — Regional RR clusters (3 pairs)** + +| Cluster | RR anchor | PoPs served | +|--------------|--------------------------|-----------------------------------------------------------------------------------| +| Americas | us-east-2 (NYC) | us-east-1, us-east-2, us-central-1, us-west-1, ca-east-1, br-east-1, cl-central-1 | +| EMEA | de-central-1 (Frankfurt) | de-central-1, gb-south-1, nl-west-1, ae-north-1, za-central-1 | +| Asia-Pacific | sg-central-1 (Singapore) | sg-central-1, jp-east-1, au-east-1, in-west-1 | + +Each regional cluster is a **pair** of RRs operating active/active. 
Each worker node +peers with both RRs in its regional cluster — two sessions per node, no more. +Redundancy is built in: loss of one RR in a pair causes no service impact. + +Singapore was chosen as the APAC anchor over Tokyo or Sydney because it minimises +the average RTT across the four APAC PoPs (Singapore, Tokyo, Sydney, Mumbai). Tokyo +would penalise Mumbai and Sydney; Sydney would penalise Tokyo and Mumbai. + +Dallas was explicitly rejected as an Americas anchor despite proximity to LATAM: +the RR is a control plane function and RTT has no material operational impact on +BGP session management. Dallas is also mid-migration to servers.com, making it +unsuitable for load-bearing infrastructure. + +--- + +## BGP Control Plane System Context + +![BGP control plane — system context](bgp-context.svg) + +> Source: [`diagrams/bgp-context.puml`](bgp-context.puml) + +--- + +## Session topology + +### Session counts + +| Node type | Sessions | Peers | +|---------------------|-----------------|--------------------------------------------------------------------| +| Worker node (GoBGP) | 2 | Both RRs in regional pair | +| Regional RR node | 2 + (N clients) | Both global RRs + all regional clients | +| Global RR node | 2 + 6 | Other global RR + both nodes of each regional pair (3 regions × 2) | + +At 16 PoPs today, with an average of 3 workers per PoP, this is approximately +96 worker-to-RR sessions globally — entirely manageable. The design scales linearly: +adding a PoP adds exactly 2 RR sessions. + +### BGP session establishment sequence + +The following shows how a worker node establishes its control plane sessions on boot: + +```mermaid +sequenceDiagram + participant W as Worker node
(GoBGP) + participant GO as galactic-operator + participant RR1 as Regional RR 1 + participant RR2 as Regional RR 2 + participant GRR as Global RR + + GO->>W: Render GoBGP config (peers, RD/RT, VRFs) + GO->>W: SIGHUP GoBGP + W->>RR1: TCP SYN → port 179 + W->>RR2: TCP SYN → port 179 + RR1-->>W: BGP OPEN (capabilities: IPv6-unicast, VPNv4, VPNv6, EVPN, BGP-LS) + RR2-->>W: BGP OPEN (capabilities: IPv6-unicast, VPNv4, VPNv6, EVPN, BGP-LS) + W-->>RR1: BGP OPEN + KEEPALIVE + W-->>RR2: BGP OPEN + KEEPALIVE + RR1->>W: BGP UPDATE — full intra-region RIB + RR2->>W: BGP UPDATE — full intra-region RIB + Note over RR1,GRR: Regional RRs already peered
<br/>with global tier at startup + RR1->>W: BGP UPDATE — inter-region routes (via global RR reflection) + W->>RR1: BGP UPDATE — local VPNv4/VPNv6 prefixes (RFC 9252 SRv6 TLV) + RR1->>GRR: BGP UPDATE — reflects worker prefixes to global tier + GRR->>RR2: BGP UPDATE — reflects to other regional clusters +``` + +### Route propagation — inter-region example + +This shows how a prefix originating on a worker in Tokyo reaches a worker in London: + +```mermaid +sequenceDiagram + participant W_JP as Worker<br/>jp-east-1 + participant RR_AP as APAC RR<br/>sg-central-1 + participant GRR as Global RR<br/>us-east-2 + participant RR_EU as EMEA RR<br/>de-central-1 + participant W_GB as Worker<br/>gb-south-1 + + W_JP->>RR_AP: BGP UPDATE — tenant prefix + SRv6 TLV<br/>(End.DT46 SID for jp-east-1) + Note over RR_AP: Reflects to regional clients<br/>AND to global RRs + RR_AP->>GRR: BGP UPDATE — reflects prefix + GRR->>RR_EU: BGP UPDATE — reflects to EMEA + RR_EU->>W_GB: BGP UPDATE — installs in tenant VRF + Note over W_GB: Traffic to Tokyo now<br/>SRv6-encapsulated with<br/>
jp-east-1 End.DT46 SID +``` + +--- + +## SAFIs + +All RR sessions negotiate the following address families: + +| Address family | Purpose | +|-------------------------------------------------|---------------------------------------------------| +| IPv6 Unicast | Underlay reachability, SRv6 locator advertisement | +| VPNv4 (RFC 4364) + SRv6 Services TLV (RFC 9252) | Tenant L3VPN overlay — IPv4 prefixes | +| VPNv6 (RFC 4659) + SRv6 Services TLV (RFC 9252) | Tenant L3VPN overlay — IPv6 prefixes | +| EVPN + SRv6 Services TLV (RFC 9252) | Tenant L2/L3 overlay | +| BGP-LS | Topology export to controller / PCE | + +RFC 9252 SRv6 Services TLV carries the `End.DT4` (IPv4) or `End.DT6` (IPv6) SID for +each tenant VRF alongside the VPN prefix. This is the glue between BGP VPN signalling +and SRv6 forwarding — a remote PE receiving a VPN prefix uses the TLV to determine +which SRv6 SID to use for encapsulation. VPNv4 and VPNv6 use the same SID structure; +the difference is the BGP NLRI encoding and the SRv6 endpoint behavior (`End.DT4` vs +`End.DT46` vs `End.DT6` depending on whether the VRF is IPv4-only, dual-stack, or +IPv6-only). + +> **Note on GoBGP RFC 9252 support:** GoBGP's SRv6 Services TLV implementation has +> known gaps in the `SRv6 SID Structure` sub-TLV (specifically the +> `TranspositionLength`/`TranspositionOffset` fields used for uSID compression). This +> must be validated against the regional RR implementation before production rollout. +> See the RFC 9252 validation spike in the backlog. + +--- + +## Node-level architecture + +Each Kubernetes worker node runs three daemonset processes that together form the +per-node control and data plane: + +![Worker node — component view](bgp-worker-component.svg) + +> Source: [`bgp-worker-component.puml`](bgp-worker-component.puml) + +**galactic-operator reconciliation loop (per tenant pod):** + +``` +1. Watch: pod CREATE with datum.net/tenant= on this node +2. Ensure: vrf-tenant- exists (Netlink) — idempotent +3. 
Ensure: GoBGP VRF configured with RD/RT (gRPC AddVrf) +4. Wait: Cilium creates lxcXXXX (watch netlink NEWLINK) +5. Act: ip link set lxcXXXX master vrf-tenant- (Netlink) +6. Act: move endpoint route from table 0 → tenant table (Netlink) +7. Act: AddPath to GoBGP VRF for pod /128 (gRPC) + +On pod DELETE: +1. Watch: pod DELETE +2. Act: DeletePath from GoBGP for pod /128 (gRPC) +3. Act: release lxc from VRF (Netlink) +4. Cleanup: remove VRF if no remaining pods in tenant on this node +``` + +galactic-operator is the single source of truth. GoBGP holds no persistent state — +on restart, the operator re-drives all VRF and path state from Kubernetes CRDs. + +--- + +## Failure modes + +### Global RR loss + +Loss of one global RR degrades inter-region route propagation but does not cause an +outage. The remaining global RR continues reflecting between regional clusters. + +Loss of **both** global RRs: inter-region routes are no longer updated but existing +routes remain in the regional RIBs. Intra-region forwarding is completely unaffected. +New prefixes originating in one region will not reach other regions until the global +tier recovers. + +> **This must be tested explicitly in staging.** Do not assume it works. The test is: +> withdraw both global RRs, originate a new prefix in Americas, confirm it does NOT +> appear in EMEA or APAC RIBs, confirm all *existing* inter-region routes remain +> installed and forwarding. + +### Regional RR node loss (one of pair) + +No service impact. Workers continue peering with the surviving RR node. The operator +should alert within 60 seconds; the failed node should be replaced within the SLO +window before the pair degrades to a single point of failure. + +### Worker GoBGP crash + +BGP sessions drop. galactic-operator detects the restart and re-drives VRF and path +state via gRPC. Session re-establishment uses configured keepalive/hold timers +(recommended: 10s keepalive / 30s hold). 
In-flight traffic to the affected worker +black-holes until sessions re-establish — typically under 30 seconds with aggressive +timers. + +### servers.com migration (us-east-1, us-central-1, us-west-1) + +The three US PoPs migrating to servers.com will experience BGP session bounces during +cutover. Locators and ASN remain unchanged. The regional RR pair should treat these as +normal client reconvergence events. Drain each PoP before migration to avoid in-flight +tenant traffic loss. Do not migrate all three simultaneously. + +--- + +## Timer recommendations + +| Timer | Recommended value | Rationale | +|---------------------|-------------------|---------------------------------------------| +| BGP keepalive | 10s | Faster detection without excessive overhead | +| BGP hold time | 30s | 3× keepalive; aggressive but stable | +| BFD (if enabled) | 300ms × 3 | Sub-second PE failure detection | +| RR client reconnect | 5s | Fast reconnect after transient loss | + +BFD is not currently implemented. Without BFD, PE failure detection relies on BGP +hold timer expiry — up to 30s with the above settings. Implementing BFD on worker +nodes is a tracked backlog item; until then, hold timers are the sole failure +detection mechanism. + +--- + +## Full architecture + +The diagram below shows the complete BGP control plane from tenant pod through to +the global RR tier, including the per-node component relationships. 
+ +![Galactic VPC — BGP control plane full architecture](bgp-full-architecture.svg) + +> Source: [`bgp-full-architecture.puml`](bgp-full-architecture.puml) diff --git a/enhancements/networking/bgp-control-plane/bgp-context.puml b/enhancements/networking/bgp-control-plane/bgp-context.puml new file mode 100644 index 0000000..0d2df56 --- /dev/null +++ b/enhancements/networking/bgp-control-plane/bgp-context.puml @@ -0,0 +1,41 @@ +@startuml bgp-context +!$NEW_C4_STYLE = 1 +!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Context.puml +!include https://raw.githubusercontent.com/datum-cloud/enhancements/refs/heads/main/enhancements/datum-theme.puml + +title BGP control plane — system context + +Boundary(tier0, "Tier 0 — Global RRs") { + System(rr_nyc, "us-east-2", "Global RR\nNew York City") + System(rr_fra, "de-central-1", "Global RR\nFrankfurt") +} + +Boundary(tier1_am, "Tier 1 — Americas") { + System(rr_am, "Americas RR pair", "Anchored at NYC\nactive/active") + System(pop_am, "7 × worker nodes", "GoBGP\nus-east-1/2, us-central-1, us-west-1, ca-east-1, br-east-1, cl-central-1") +} + +Boundary(tier1_eu, "Tier 1 — EMEA") { + System(rr_eu, "EMEA RR pair", "Anchored at Frankfurt\nactive/active") + System(pop_eu, "5 × worker nodes", "GoBGP\nde-central-1, gb-south-1, nl-west-1, ae-north-1, za-central-1") +} + +Boundary(tier1_ap, "Tier 1 — Asia-Pacific") { + System(rr_ap, "APAC RR pair", "Anchored at Singapore\nactive/active") + System(pop_ap, "4 × worker nodes", "GoBGP\nsg-central-1, jp-east-1, au-east-1, in-west-1") +} + +Rel(rr_nyc, rr_fra, "iBGP full mesh") + +Rel(rr_am, rr_nyc, "iBGP", "inter-region routes") +Rel(rr_am, rr_fra, "iBGP", "inter-region routes") +Rel(rr_eu, rr_nyc, "iBGP", "inter-region routes") +Rel(rr_eu, rr_fra, "iBGP", "inter-region routes") +Rel(rr_ap, rr_nyc, "iBGP", "inter-region routes") +Rel(rr_ap, rr_fra, "iBGP", "inter-region routes") + +Rel(pop_am, rr_am, "iBGP", "2 sessions per node") +Rel(pop_eu, rr_eu, "iBGP", 
"2 sessions per node") +Rel(pop_ap, rr_ap, "iBGP", "2 sessions per node") + +@enduml diff --git a/enhancements/networking/bgp-control-plane/bgp-context.svg b/enhancements/networking/bgp-control-plane/bgp-context.svg new file mode 100644 index 0000000..b2d1614 --- /dev/null +++ b/enhancements/networking/bgp-control-plane/bgp-context.svg @@ -0,0 +1,82 @@ +BGP control plane — system contextTier 0 — Global RRsTier 1 — AmericasTier 1 — EMEATier 1 — Asia-Pacificus-east-2Global RR (FRR)New York Cityde-central-1Global RR (FRR)FrankfurtAmericas RR pairFRR — Anchored at NYCactive/active7 × worker nodesGoBGPus-east-1/2, us-central-1, us-west-1ca-east-1, br-east-1, cl-central-1EMEA RR pairFRR — Anchored at Frankfurtactive/active5 × worker nodesGoBGPde-central-1, gb-south-1, nl-west-1ae-north-1, za-central-1APAC RR pairFRR — Anchored at Singaporeactive/active4 × worker nodesGoBGPsg-central-1, jp-east-1au-east-1, in-west-1iBGP full meshiBGPiBGPiBGPiBGPiBGPiBGPiBGP (2 sessions/node)iBGP (2 sessions/node)iBGP (2 sessions/node) \ No newline at end of file diff --git a/enhancements/networking/bgp-control-plane/bgp-full-architecture.puml b/enhancements/networking/bgp-control-plane/bgp-full-architecture.puml new file mode 100644 index 0000000..8fcc275 --- /dev/null +++ b/enhancements/networking/bgp-control-plane/bgp-full-architecture.puml @@ -0,0 +1,101 @@ +@startuml bgp-full-architecture +!$NEW_C4_STYLE = 1 +!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Deployment.puml +!include https://raw.githubusercontent.com/datum-cloud/enhancements/refs/heads/main/enhancements/datum-theme.puml + +title Galactic VPC — BGP control plane full architecture + +' ─── Tier 0: Global RRs ─────────────────────────────────────────────── +Boundary(tier0, "Tier 0 — Global RRs") { + Deployment_Node(nyc_site, "us-east-2 · New York City") { + Container(grr1, "Global RR 1", "FRR", "iBGP cluster A\nreflects inter-region routes") + } + Deployment_Node(fra_site, "de-central-1 · 
Frankfurt") { + Container(grr2, "Global RR 2", "FRR", "iBGP cluster B\nreflects inter-region routes") + } +} + +' ─── Tier 1: Regional RRs ───────────────────────────────────────────── +Boundary(tier1, "Tier 1 — Regional RRs") { + Deployment_Node(am_region, "Americas") { + Container(rrA1, "Americas RR — primary", "FRR", "Anchor: us-east-2") + Container(rrA2, "Americas RR — secondary", "FRR", "active/active pair") + } + Deployment_Node(eu_region, "EMEA") { + Container(rrE1, "EMEA RR — primary", "FRR", "Anchor: de-central-1") + Container(rrE2, "EMEA RR — secondary", "FRR", "active/active pair") + } + Deployment_Node(ap_region, "Asia-Pacific") { + Container(rrP1, "APAC RR — primary", "FRR", "Anchor: sg-central-1") + Container(rrP2, "APAC RR — secondary", "FRR", "active/active pair") + } +} + +' ─── Tier 2: Worker nodes (representative PoP per region) ───────────── +Boundary(tier2, "Tier 2 — Worker nodes (representative PoPs)") { + Deployment_Node(pop_am, "us-east-1 · Ashburn (representative)") { + Deployment_Node(k8s_am, "K8s worker node") { + Container(gobgp_am, "GoBGP", "DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\nRFC 9252 SRv6 Services TLV") + Container(op_am, "galactic-operator", "DaemonSet (privileged)", "VRF lifecycle\nlxc enslavement\ngRPC → GoBGP") + Container(cilium_am, "Cilium", "CNI DaemonSet", "veth/lxc lifecycle\neBPF policy") + } + } + Deployment_Node(pop_eu, "gb-south-1 · London (representative)") { + Deployment_Node(k8s_eu, "K8s worker node") { + Container(gobgp_eu, "GoBGP", "DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\nRFC 9252 SRv6 Services TLV") + Container(op_eu, "galactic-operator", "DaemonSet (privileged)", "VRF lifecycle\nlxc enslavement\ngRPC → GoBGP") + Container(cilium_eu, "Cilium", "CNI DaemonSet", "veth/lxc lifecycle\neBPF policy") + } + } + Deployment_Node(pop_ap, "sg-central-1 · Singapore (representative)") { + Deployment_Node(k8s_ap, "K8s worker node") { + Container(gobgp_ap, "GoBGP", "DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\nRFC 9252 
SRv6 Services TLV") + Container(op_ap, "galactic-operator", "DaemonSet (privileged)", "VRF lifecycle\nlxc enslavement\ngRPC → GoBGP") + Container(cilium_ap, "Cilium", "CNI DaemonSet", "veth/lxc lifecycle\neBPF policy") + } + } +} + +' ─── Tier 0 iBGP full mesh ──────────────────────────────────────────── +Rel(grr1, grr2, "iBGP full mesh") + +' ─── Regional → Global (both RRs in each pair peer both globals) ─────── +Rel(rrA1, grr1, "iBGP") +Rel(rrA1, grr2, "iBGP") +Rel(rrA2, grr1, "iBGP") +Rel(rrA2, grr2, "iBGP") + +Rel(rrE1, grr1, "iBGP") +Rel(rrE1, grr2, "iBGP") +Rel(rrE2, grr1, "iBGP") +Rel(rrE2, grr2, "iBGP") + +Rel(rrP1, grr1, "iBGP") +Rel(rrP1, grr2, "iBGP") +Rel(rrP2, grr1, "iBGP") +Rel(rrP2, grr2, "iBGP") + +' ─── Workers → Regional (2 sessions per worker) ─────────────────────── +Rel(gobgp_am, rrA1, "iBGP") +Rel(gobgp_am, rrA2, "iBGP") + +Rel(gobgp_eu, rrE1, "iBGP") +Rel(gobgp_eu, rrE2, "iBGP") + +Rel(gobgp_ap, rrP1, "iBGP") +Rel(gobgp_ap, rrP2, "iBGP") + +' ─── Intra-node relationships ───────────────────────────────────────── +Rel(op_am, gobgp_am, "gRPC — VRF/path config") +Rel(op_am, cilium_am, "Netlink — lxc NEWLINK watch") +Rel(cilium_am, k8s_am, "Netlink — veth pairs") + +Rel(op_eu, gobgp_eu, "gRPC — VRF/path config") +Rel(op_eu, cilium_eu, "Netlink — lxc NEWLINK watch") +Rel(cilium_eu, k8s_eu, "Netlink — veth pairs") + +Rel(op_ap, gobgp_ap, "gRPC — VRF/path config") +Rel(op_ap, cilium_ap, "Netlink — lxc NEWLINK watch") +Rel(cilium_ap, k8s_ap, "Netlink — veth pairs") + +@enduml diff --git a/enhancements/networking/bgp-control-plane/bgp-full-architecture.svg b/enhancements/networking/bgp-control-plane/bgp-full-architecture.svg new file mode 100644 index 0000000..0316bae --- /dev/null +++ b/enhancements/networking/bgp-control-plane/bgp-full-architecture.svg @@ -0,0 +1,159 @@ +Galactic VPC — BGP control plane full architectureTier 0 — Global RRsus-east-2 · New York Cityde-central-1 · FrankfurtTier 1 — Regional RRsAmericasEMEAAsia-PacificTier 2 — Worker 
nodes (representative PoPs)us-east-1 · AshburnK8s worker nodegb-south-1 · LondonK8s worker nodesg-central-1 · SingaporeK8s worker nodeGlobal RR 1FRRiBGP cluster Areflects inter-region routesGlobal RR 2FRRiBGP cluster Breflects inter-region routesAmericas RR — primaryFRR · Anchor: us-east-2Americas RR — secondaryFRR · active/active pairEMEA RR — primaryFRR · Anchor: de-central-1EMEA RR — secondaryFRR · active/active pairAPAC RR — primaryFRR · Anchor: sg-central-1APAC RR — secondaryFRR · active/active pairGoBGPDaemonSet (hostNetwork)VPNv4/VPNv6 PE · RFC 9252galactic-operatorDaemonSet (privileged)VRF lifecycle · gRPC→GoBGPCiliumCNI DaemonSetveth/lxc lifecycle · eBPF policyGoBGPDaemonSet (hostNetwork)VPNv4/VPNv6 PE · RFC 9252galactic-operatorDaemonSet (privileged)VRF lifecycle · gRPC→GoBGPCiliumCNI DaemonSetveth/lxc lifecycle · eBPF policyGoBGPDaemonSet (hostNetwork)VPNv4/VPNv6 PE · RFC 9252galactic-operatorDaemonSet (privileged)VRF lifecycle · gRPC→GoBGPCiliumCNI DaemonSetveth/lxc lifecycle · eBPF policyiBGP full meshiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPgRPCNetlinkgRPCNetlinkgRPCNetlink \ No newline at end of file diff --git a/enhancements/networking/bgp-control-plane/bgp-worker-component.puml b/enhancements/networking/bgp-control-plane/bgp-worker-component.puml new file mode 100644 index 0000000..ec9de05 --- /dev/null +++ b/enhancements/networking/bgp-control-plane/bgp-worker-component.puml @@ -0,0 +1,25 @@ +@startuml bgp-worker-component +!$NEW_C4_STYLE = 1 +!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Component.puml +!include https://raw.githubusercontent.com/datum-cloud/enhancements/refs/heads/main/enhancements/datum-theme.puml + +title Worker node — component view + +Boundary(node, "K8s worker node") { + Component(cilium, "Cilium", "CNI DaemonSet", "veth lifecycle, eBPF policy, pod /128 routes") + Component(gobgp, "GoBGP", "BGP DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\none VRF per tenant\nRFC 
9252 SRv6 Services TLV") + Component(operator, "galactic-operator", "Controller DaemonSet (hostNetwork, privileged)", "VRF lifecycle, lxc enslavement\nGoBGP config via gRPC") +} + +Boundary(kernel, "Linux kernel") { + Component(vrf, "VRF devices", "vrf-tenant-{name}", "One per tenant\nseparate routing table") + Component(seg6, "seg6local", "SRv6 End.DT4 / End.DT6 / End.DT46", "Decap + VRF lookup\nfor inbound tenant traffic") +} + +Rel(cilium, kernel, "Creates lxcXXXX veth pairs", "Netlink") +Rel(operator, vrf, "Creates/destroys", "Netlink") +Rel(operator, cilium, "Watches lxc NEWLINK events", "Netlink") +Rel(operator, gobgp, "Configures VRFs and paths", "gRPC") +Rel(gobgp, seg6, "Programs End.DT4/DT6/DT46 SIDs", "Netlink / kernel route") + +@enduml diff --git a/enhancements/networking/bgp-control-plane/bgp-worker-component.svg b/enhancements/networking/bgp-control-plane/bgp-worker-component.svg new file mode 100644 index 0000000..abae4f5 --- /dev/null +++ b/enhancements/networking/bgp-control-plane/bgp-worker-component.svg @@ -0,0 +1,57 @@ +Worker node — component viewK8s worker nodeLinux kernelCiliumCNI DaemonSetveth lifecycle, eBPF policypod /128 routesGoBGPBGP DaemonSet (hostNetwork)VPNv4/VPNv6 PEone VRF per tenant · RFC 9252galactic-operatorController DaemonSet (privileged)VRF lifecycle, lxc enslavementGoBGP config via gRPCVRF devicesvrf-tenant-{name}One per tenantseparate routing tableseg6localSRv6 End.DT4 / End.DT6 / End.DT46Decap + VRF lookupfor inbound tenant trafficCreates lxcXXXX veth pairs [Netlink]Creates/destroys [Netlink]Watches lxc NEWLINK events [Netlink]Configures VRFs and paths [gRPC]Programs End.DT4/DT6/DT46 SIDs [Netlink] \ No newline at end of file From 27544db1ed6b9482c5fc801ed896fe6644be468f Mon Sep 17 00:00:00 2001 From: Peter Sprygada Date: Mon, 4 May 2026 17:25:28 -0400 Subject: [PATCH 2/2] updates the bgp control document to provide more design details The initial PR munged together design and implementation details. 
This patch focuses on removing the implementation details and expanding on the design details for the gVPC BGP control plane. --- .../networking/bgp-control-plane/README.md | 429 ++++++++---------- .../bgp-control-plane/bgp-context.puml | 41 -- .../bgp-control-plane/bgp-context.svg | 82 ---- .../bgp-full-architecture.puml | 101 ----- .../bgp-full-architecture.svg | 159 ------- .../bgp-worker-component.puml | 25 - .../bgp-worker-component.svg | 57 --- 7 files changed, 200 insertions(+), 694 deletions(-) delete mode 100644 enhancements/networking/bgp-control-plane/bgp-context.puml delete mode 100644 enhancements/networking/bgp-control-plane/bgp-context.svg delete mode 100644 enhancements/networking/bgp-control-plane/bgp-full-architecture.puml delete mode 100644 enhancements/networking/bgp-control-plane/bgp-full-architecture.svg delete mode 100644 enhancements/networking/bgp-control-plane/bgp-worker-component.puml delete mode 100644 enhancements/networking/bgp-control-plane/bgp-worker-component.svg diff --git a/enhancements/networking/bgp-control-plane/README.md b/enhancements/networking/bgp-control-plane/README.md index dcd44e7..f860751 100644 --- a/enhancements/networking/bgp-control-plane/README.md +++ b/enhancements/networking/bgp-control-plane/README.md @@ -2,314 +2,285 @@ ## Overview -This document describes the BGP control plane architecture for Datum's Galactic VPC -fabric. The fabric spans 16 points of presence (PoPs) across three geographic regions -and uses a two-tier hierarchical route reflector (RR) model to distribute routing -information at scale. +This document describes the BGP control plane architecture for Datum's Galactic VPC fabric. The fabric spans points of presence (PoPs) across three geographic regions. -BGP is the single control plane protocol — no IGP runs in the underlay. SRv6 -(RFC 8986) is the committed data plane. Every design decision in this document is -made with that constraint as a hard given. 
+The underlay provides IPv6 transport and reachability between PoPs. This document covers only the overlay control plane built on top of it. --- ## Design goals -- Full control plane reachability across all 16 PoPs via a single protocol (BGP) +- Full control plane reachability across all PoPs via BGP - Regional forwarding survives total loss of the global RR tier -- No PoP carries more than 2 RR client sessions +- No worker node carries more than 2 RR client sessions - No single point of failure at any tier - Clean separation of intra-region and inter-region route reflection -- galactic-operator owns GoBGP lifecycle end-to-end; no out-of-band config **Non-goals:** -- IGP in the underlay — BGP handles locator advertisement and underlay reachability -- MPLS fallback — SRv6 is the commitment, not a preference +- MPLS data plane — SRv6 is the commitment, not a preference - Stretched L2 between PoPs --- -## PoP inventory - -| Region | POP ID | City | Notes | -|--------------|--------------|---------------|--------------------------| -| Americas | us-east-1 | Ashburn | Migrating to servers.com | -| Americas | us-east-2 | New York City | Global RR site | -| Americas | us-central-1 | Dallas | Migrating to servers.com | -| Americas | us-west-1 | San Jose | Migrating to servers.com | -| Americas | ca-east-1 | Toronto | | -| Americas | br-east-1 | São Paulo | | -| Americas | cl-central-1 | Santiago | | -| EMEA | de-central-1 | Frankfurt | Global RR site | -| EMEA | gb-south-1 | London | | -| EMEA | nl-west-1 | Amsterdam | | -| EMEA | ae-north-1 | Dubai | | -| EMEA | za-central-1 | Johannesburg | | -| Asia-Pacific | sg-central-1 | Singapore | Regional RR site | -| Asia-Pacific | jp-east-1 | Tokyo | | -| Asia-Pacific | au-east-1 | Sydney | | -| Asia-Pacific | in-west-1 | Mumbai | | - ---- - ## Architecture -### Two-tier route reflector hierarchy +The control plane uses a two-tier route reflector hierarchy. 
This eliminates the O(n²) iBGP full-mesh problem while keeping regional forwarding fully independent of the global tier. -The control plane uses a two-tier hierarchy. This eliminates the O(n²) iBGP full-mesh -problem while ensuring regional forwarding is fully independent of the global tier. - -``` -┌─────────────────────────────────────────────────────────────────────┐ -│ Tier 0 — Global RRs │ -│ │ -│ us-east-2 (NYC) ◄────── iBGP full mesh ──────► de-central-1 (FRA) │ -└─────────────────────────────────────────────────────────────────────┘ - ▲ ▲ ▲ ▲ ▲ ▲ - │ │ │ │ │ │ -┌─────────┴──┴──┐ ┌────────┴──┴──┐ ┌───────┴──┴───┐ -│ Regional RR │ │ Regional RR │ │ Regional RR │ -│ Americas │ │ EMEA │ │ APAC │ -│ (NYC anchor) │ │ (FRA anchor) │ │ (SIN anchor) │ -└───────┬───────┘ └──────┬───────┘ └──────┬───────┘ - │ iBGP │ iBGP │ iBGP - ┌────┴────┐ ┌────┴────┐ ┌────┴────┐ - │ Workers │ │ Workers │ │ Workers │ - │ GoBGP │ │ GoBGP │ │ GoBGP │ - └─────────┘ └─────────┘ └─────────┘ +```mermaid +graph TD + GRR_A["Global RR — Americas"] + GRR_E["Global RR — EMEA"] + GRR_A <-->|iBGP| GRR_E + + RR_AM["Regional RR pair — Americas"] + RR_EM["Regional RR pair — EMEA"] + RR_AP["Regional RR pair — APAC"] + + GRR_A -->|reflects| RR_AM + GRR_A -->|reflects| RR_EM + GRR_A -->|reflects| RR_AP + GRR_E -->|reflects| RR_AM + GRR_E -->|reflects| RR_EM + GRR_E -->|reflects| RR_AP + + W_AM["Workers — Americas"] + W_EM["Workers — EMEA"] + W_AP["Workers — APAC"] + + RR_AM -->|iBGP| W_AM + RR_EM -->|iBGP| W_EM + RR_AP -->|iBGP| W_AP ``` **Tier 0 — Global RRs (2 nodes)** -Two global RRs deployed as an iBGP full-mesh pair: +Two global RRs form an iBGP full-mesh pair, one anchored in Americas and one in EMEA. They reflect inter-regional reachability between the three regional clusters. They carry no intra-region routes — those stay entirely within each regional cluster. Both are co-located at existing PoPs; no new sites are required. 
+ +**Tier 1 — Regional RR clusters (3 pairs)** -| Node | Location | Rationale | -|--------------|---------------|----------------------------------------------------| -| us-east-2 | New York City | Existing PoP; best latency to Americas and EMEA | -| de-central-1 | Frankfurt | Existing PoP; best latency spread to EMEA and APAC | +| Cluster | Scope | +|--------------|-------------------| +| Americas | All Americas PoPs | +| EMEA | All EMEA PoPs | +| Asia-Pacific | All APAC PoPs | -The global RRs reflect inter-regional reachability between the three regional -clusters. They do not carry intra-region routes; those are handled entirely within -each regional cluster. No new sites are required — both are existing deployed PoPs. +Each regional cluster is a pair of RRs operating active/active. Each worker node peers with both RRs in its regional cluster — two sessions per worker, no more. Loss of one RR in a pair causes no service impact. -**Tier 1 — Regional RR clusters (3 pairs)** +**Anchor selection criteria** -| Cluster | RR anchor | PoPs served | -|--------------|--------------------------|-----------------------------------------------------------------------------------| -| Americas | us-east-2 (NYC) | us-east-1, us-east-2, us-central-1, us-west-1, ca-east-1, br-east-1, cl-central-1 | -| EMEA | de-central-1 (Frankfurt) | de-central-1, gb-south-1, nl-west-1, ae-north-1, za-central-1 | -| Asia-Pacific | sg-central-1 (Singapore) | sg-central-1, jp-east-1, au-east-1, in-west-1 | +The anchor PoP for a regional RR pair should minimise average RTT across all PoPs in that region. This is a tiebreaker, not the primary criterion — BGP session management is not latency-sensitive. The hard constraint is that the anchor must not be a PoP undergoing active infrastructure migration or elevated operational risk. -Each regional cluster is a **pair** of RRs operating active/active. Each worker node -peers with both RRs in its regional cluster — two sessions per node, no more. 
-Redundancy is built in: loss of one RR in a pair causes no service impact. +--- + +## Session topology -Singapore was chosen as the APAC anchor over Tokyo or Sydney because it minimises -the average RTT across the four APAC PoPs (Singapore, Tokyo, Sydney, Mumbai). Tokyo -would penalise Mumbai and Sydney; Sydney would penalise Tokyo and Mumbai. +| Node type | Session count | Peers | +|------------------|-----------------|--------------------------------------------------------------------| +| Worker node | 2 | Both RRs in regional pair | +| Regional RR node | 2 + N | Both global RRs + all worker nodes in the regional cluster | +| Global RR node | 1 + 6 | Other global RR + both nodes of each regional pair (3 regions × 2) | -Dallas was explicitly rejected as an Americas anchor despite proximity to LATAM: -the RR is a control plane function and RTT has no material operational impact on -BGP session management. Dallas is also mid-migration to servers.com, making it -unsuitable for load-bearing infrastructure. +The design scales linearly: adding a worker node adds exactly 2 RR sessions (one to each RR in its regional pair). There is no fan-out at the global tier. --- -## BGP Control Plane System Context +## Route propagation -![BGP control plane — system context](bgp-context.svg) +Routes flow up from worker → regional RR → global RR, then back down to peer regional RRs → workers in the destination region. Intra-region propagation terminates at the regional RR; the global tier is not involved.
-> Source: [`diagrams/bgp-context.puml`](bgp-context.puml) +```mermaid +sequenceDiagram + participant W_SRC as Worker (origin) + participant RR_SRC as Regional RR (origin) + participant GRR as Global RR + participant RR_DST as Regional RR (destination) + participant W_DST as Worker (destination) + + W_SRC ->> RR_SRC: BGP UPDATE — tenant prefix + SRv6 TLV + RR_SRC ->> RR_SRC: Reflects to regional clients and global RRs + RR_SRC ->> GRR: BGP UPDATE — reflected prefix + GRR ->> RR_DST: BGP UPDATE — reflected to destination region + RR_DST ->> W_DST: BGP UPDATE — installs in tenant VRF +``` --- -## Session topology +## Address families (SAFIs) -### Session counts +All RR sessions negotiate the following address families: -| Node type | Sessions | Peers | -|---------------------|-----------------|--------------------------------------------------------------------| -| Worker node (GoBGP) | 2 | Both RRs in regional pair | -| Regional RR node | 2 + (N clients) | Both global RRs + all regional clients | -| Global RR node | 2 + 6 | Other global RR + both nodes of each regional pair (3 regions × 2) | +| Address family | Purpose | +|-------------------------------------------------|--------------------------------------| +| VPNv4 (RFC 4364) + SRv6 Services TLV (RFC 9252) | Tenant L3VPN overlay — IPv4 prefixes | +| VPNv6 (RFC 4659) + SRv6 Services TLV (RFC 9252) | Tenant L3VPN overlay — IPv6 prefixes | +| EVPN + SRv6 Services TLV (RFC 9252) | Tenant L2/L3 overlay | +| BGP-LS | Topology export to controller / PCE | -At 16 PoPs today, with an average of 3 workers per PoP, this is approximately -96 worker-to-RR sessions globally — entirely manageable. The design scales linearly: -adding a PoP adds exactly 2 RR sessions. +The RFC 9252 SRv6 Services TLV carries the SRv6 SID for each tenant VRF alongside the VPN prefix. A remote PE receiving a VPN prefix uses the TLV to determine which SRv6 SID to use for encapsulation. 
The SID endpoint behaviour (`End.DT4`, `End.DT46`, or `End.DT6`) is determined by the VRF address family — IPv4-only, dual-stack, or IPv6-only respectively. -### BGP session establishment sequence +--- -The following shows how a worker node establishes its control plane sessions on boot: +## Tier 0 — Global RRs -```mermaid -sequenceDiagram - participant W as Worker node
(GoBGP) - participant GO as galactic-operator - participant RR1 as Regional RR 1 - participant RR2 as Regional RR 2 - participant GRR as Global RR - - GO->>W: Render GoBGP config (peers, RD/RT, VRFs) - GO->>W: SIGHUP GoBGP - W->>RR1: TCP SYN → port 179 - W->>RR2: TCP SYN → port 179 - RR1-->>W: BGP OPEN (capabilities: IPv6-unicast, VPNv4, VPNv6, EVPN, BGP-LS) - RR2-->>W: BGP OPEN (capabilities: IPv6-unicast, VPNv4, VPNv6, EVPN, BGP-LS) - W-->>RR1: BGP OPEN + KEEPALIVE - W-->>RR2: BGP OPEN + KEEPALIVE - RR1->>W: BGP UPDATE — full intra-region RIB - RR2->>W: BGP UPDATE — full intra-region RIB - Note over RR1,GRR: Regional RRs already peered
with global tier at startup - RR1->>W: BGP UPDATE — inter-region routes (via global RR reflection) - W->>RR1: BGP UPDATE — local VPNv4/VPNv6 prefixes (RFC 9252 SRv6 TLV) - RR1->>GRR: BGP UPDATE — reflects worker prefixes to global tier - GRR->>RR2: BGP UPDATE — reflects to other regional clusters -``` +### Role and scope -### Route propagation — inter-region example +The global RRs exist for one purpose: carrying inter-regional reachability. They do not reflect intra-region routes — regional clusters handle that themselves and the global tier never sees it. The global RRs reflect VPN prefixes, EVPN NLRIs, and BGP-LS topology between the three regional clusters. -This shows how a prefix originating on a worker in Tokyo reaches a worker in London: +Two nodes. Full-mesh iBGP between them. Each global RR is a client of the other — they reflect to each other and both reflect outbound to the regional RR pairs. This means either global RR can independently reflect the full inter-region table to all regional clients. -```mermaid -sequenceDiagram - participant W_JP as Worker
jp-east-1 - participant RR_AP as APAC RR
sg-central-1 - participant GRR as Global RR
us-east-2 - participant RR_EU as EMEA RR
de-central-1 - participant W_GB as Worker
gb-south-1 - - W_JP->>RR_AP: BGP UPDATE — tenant prefix + SRv6 TLV
(End.DT46 SID for jp-east-1) - Note over RR_AP: Reflects to regional clients
AND to global RRs - RR_AP->>GRR: BGP UPDATE — reflects prefix - GRR->>RR_EU: BGP UPDATE — reflects to EMEA - RR_EU->>W_GB: BGP UPDATE — installs in tenant VRF - Note over W_GB: Traffic to Tokyo now
SRv6-encapsulated with
jp-east-1 End.DT46 SID -``` +**Placement:** One anchor per region is sufficient at current scale; not every region requires a global RR anchor. Regional RRs in unanchored regions peer with both existing global RRs via the underlay. Latency on those sessions is irrelevant for correctness — BGP session management is not latency-sensitive. If a region grows to warrant its own global RR anchor, the criteria are: PoP stability, infrastructure maturity, and avoiding any site currently undergoing active infrastructure migration. ---- +### What the global RRs carry -## SAFIs +| SAFI | Scope | +|------|-------| +| VPNv4 + SRv6 Services TLV | Cross-region tenant IPv4 prefixes | +| VPNv6 + SRv6 Services TLV | Cross-region tenant IPv6 prefixes | +| EVPN + SRv6 Services TLV | Cross-region tenant L2/L3 | +| BGP-LS | Full inter-region topology for PCE/controller | -All RR sessions negotiate the following address families: +The global RRs do **not** participate in intra-region VRF distribution. A prefix originating within a region stays within that region's cluster unless it needs to be reachable from workers in other regions. -| Address family | Purpose | -|-------------------------------------------------|---------------------------------------------------| -| IPv6 Unicast | Underlay reachability, SRv6 locator advertisement | -| VPNv4 (RFC 4364) + SRv6 Services TLV (RFC 9252) | Tenant L3VPN overlay — IPv4 prefixes | -| VPNv6 (RFC 4659) + SRv6 Services TLV (RFC 9252) | Tenant L3VPN overlay — IPv6 prefixes | -| EVPN + SRv6 Services TLV (RFC 9252) | Tenant L2/L3 overlay | -| BGP-LS | Topology export to controller / PCE | - -RFC 9252 SRv6 Services TLV carries the `End.DT4` (IPv4) or `End.DT6` (IPv6) SID for -each tenant VRF alongside the VPN prefix. This is the glue between BGP VPN signalling -and SRv6 forwarding — a remote PE receiving a VPN prefix uses the TLV to determine -which SRv6 SID to use for encapsulation. 
VPNv4 and VPNv6 use the same SID structure; -the difference is the BGP NLRI encoding and the SRv6 endpoint behavior (`End.DT4` vs -`End.DT46` vs `End.DT6` depending on whether the VRF is IPv4-only, dual-stack, or -IPv6-only). - -> **Note on GoBGP RFC 9252 support:** GoBGP's SRv6 Services TLV implementation has -> known gaps in the `SRv6 SID Structure` sub-TLV (specifically the -> `TranspositionLength`/`TranspositionOffset` fields used for uSID compression). This -> must be validated against the regional RR implementation before production rollout. -> See the RFC 9252 validation spike in the backlog. +### Session model ---- +Each global RR maintains: +- 1 iBGP session to the other global RR (full-mesh peer, also a route-reflector client) +- 2 sessions per regional pair × 3 regions = 6 client sessions -## Node-level architecture +Total: 7 sessions per global RR. This is deliberately small. If sessions are being added to the global tier for anything other than a new regional pair, the design should be questioned. -Each Kubernetes worker node runs three daemonset processes that together form the -per-node control and data plane: +### Route-reflector cluster IDs -![Worker node — component view](bgp-worker-component.svg) +Each global RR must have a unique `cluster-id`, so each global RR is its own single-node RR cluster. The cluster ID prevents routing loops: an NLRI reflected by Global RR A carries A's cluster ID in its CLUSTER_LIST, so A discards it if it is ever reflected back. Without distinct cluster IDs, you get silent route suppression or reflection loops depending on implementation. -> Source: [`bgp-worker-component.puml`](bgp-worker-component.puml) +Assign cluster IDs from a reserved block, documented and stable. Do not reuse cluster IDs from the regional tier. -**galactic-operator reconciliation loop (per tenant pod):** +### Failure behaviour -``` -1. Watch: pod CREATE with datum.net/tenant= on this node -2. Ensure: vrf-tenant- exists (Netlink) — idempotent -3.
Ensure: GoBGP VRF configured with RD/RT (gRPC AddVrf) -4. Wait: Cilium creates lxcXXXX (watch netlink NEWLINK) -5. Act: ip link set lxcXXXX master vrf-tenant- (Netlink) -6. Act: move endpoint route from table 0 → tenant table (Netlink) -7. Act: AddPath to GoBGP VRF for pod /128 (gRPC) - -On pod DELETE: -1. Watch: pod DELETE -2. Act: DeletePath from GoBGP for pod /128 (gRPC) -3. Act: release lxc from VRF (Netlink) -4. Cleanup: remove VRF if no remaining pods in tenant on this node -``` +**One global RR down:** The surviving node continues reflecting between all regional clusters. No routes are lost. Inter-region convergence for new prefixes continues uninterrupted. The only impact is loss of redundancy — one failure away from inter-region blackout for new prefixes. Treat as an incident; restore within the SLO window. -galactic-operator is the single source of truth. GoBGP holds no persistent state — -on restart, the operator re-drives all VRF and path state from Kubernetes CRDs. +**Both global RRs down:** Existing inter-region routes stay installed in regional RIBs — no immediate forwarding impact. New prefixes originating in one region do not reach other regions. This state must be explicitly tested in staging (see Failure Modes section). Do not assume regional RIBs hold state gracefully without a test confirming it. --- -## Failure modes +## Tier 1 — Regional RR clusters -### Global RR loss +### Role and scope -Loss of one global RR degrades inter-region route propagation but does not cause an -outage. The remaining global RR continues reflecting between regional clusters. +Each regional cluster is an active/active RR pair responsible for full intra-region route distribution. Every worker in the region peers with both RRs. The regional RRs also upstream-peer with both global RRs, carrying inter-region routes back down to regional workers. -Loss of **both** global RRs: inter-region routes are no longer updated but existing -routes remain in the regional RIBs. 
Intra-region forwarding is completely unaffected. -New prefixes originating in one region will not reach other regions until the global -tier recovers. +The regional RR is the only BGP peer a worker node ever talks to. Workers do not peer with global RRs, with workers in other regions, or with anything outside their regional pair. This is a hard constraint — it's what keeps the session count on workers bounded at 2. -> **This must be tested explicitly in staging.** Do not assume it works. The test is: -> withdraw both global RRs, originate a new prefix in Americas, confirm it does NOT -> appear in EMEA or APAC RIBs, confirm all *existing* inter-region routes remain -> installed and forwarding. +### Cluster assignment -### Regional RR node loss (one of pair) +PoPs are assigned to regional clusters based on geography. Each PoP belongs to exactly one regional cluster. The cluster boundaries are operationally significant: they define RR peering scope, failure domain, and the extent of intra-region route distribution. -No service impact. Workers continue peering with the surviving RR node. The operator -should alert within 60 seconds; the failed node should be replaced within the SLO -window before the pair degrades to a single point of failure. +### Anchor PoP selection -### Worker GoBGP crash +The anchor PoP hosts both nodes of the regional RR pair. Selection criteria in priority order: -BGP sessions drop. galactic-operator detects the restart and re-drives VRF and path -state via gRPC. Session re-establishment uses configured keepalive/hold timers -(recommended: 10s keepalive / 30s hold). In-flight traffic to the affected worker -black-holes until sessions re-establish — typically under 30 seconds with aggressive -timers. +1. **No active infrastructure migration.** Any PoP undergoing active infrastructure migration or with elevated operational risk is excluded as an anchor candidate. This is a hard exclusion — not a tiebreaker. +2. 
**Operational maturity.** The anchor PoP should have stable infrastructure, proven hardware, and no open reliability incidents. +3. **RTT minimisation.** Among qualified PoPs, prefer the one with lowest average RTT to all other PoPs in the region. -### servers.com migration (us-east-1, us-central-1, us-west-1) +Do not co-locate both RR nodes in the same physical rack or on shared power. The pair is active/active — hardware failure at the rack level should take at most one node. -The three US PoPs migrating to servers.com will experience BGP session bounces during -cutover. Locators and ASN remain unchanged. The regional RR pair should treat these as -normal client reconvergence events. Drain each PoP before migration to avoid in-flight -tenant traffic loss. Do not migrate all three simultaneously. +### Session model per regional RR node ---- +Each regional RR node maintains: +- 2 sessions to global RRs (one each) +- N sessions to worker nodes in the region (N = number of workers in the regional cluster) -## Timer recommendations +Each worker node in the region peers with both RR nodes — so each RR node carries the full worker session load for the region. Size the RR nodes accordingly; at large regional worker counts this is where memory and RIB capacity matter. + +### Route-reflector cluster IDs — regional tier + +Each regional pair operates as a single RR cluster. Both nodes in a pair share the same `cluster-id`. This is intentional: each node discards any route whose CLUSTER_LIST already contains the shared cluster ID (RFC 4456), so a copy already reflected by its partner is dropped instead of being stored and re-reflected. Workers are unaffected because each one peers directly with both nodes.
-| Timer | Recommended value | Rationale | -|---------------------|-------------------|---------------------------------------------| -| BGP keepalive | 10s | Faster detection without excessive overhead | -| BGP hold time | 30s | 3× keepalive; aggressive but stable | -| BFD (if enabled) | 300ms × 3 | Sub-second PE failure detection | -| RR client reconnect | 5s | Fast reconnect after transient loss | +The implication: both nodes in a regional pair are authoritative reflectors for the same cluster. A route reflected by node A and a route reflected by node B for the same prefix look identical from a cluster-loop-prevention standpoint. Workers will accept the reflected route from whichever RR they receive it from first. -BFD is not currently implemented. Without BFD, PE failure detection relies on BGP -hold timer expiry — up to 30s with the above settings. Implementing BFD on worker -nodes is a tracked backlog item; until then, hold timers are the sole failure -detection mechanism. +Each regional cluster must have a distinct cluster ID from every other cluster including the global tier. Five cluster IDs total: one per regional pair (three) and one per global RR (two). + +### Route reflection flow — intra-region + +A worker in region X originates a tenant VPN prefix: + +1. Worker advertises the prefix (VPNv4 + SRv6 TLV) to both regional RR nodes. +2. Each regional RR reflects the prefix to all other workers in the region and upstream to both global RRs. +3. Other regional workers install the prefix. The originating worker's SRv6 SID (from the TLV) tells them how to encapsulate. +4. Global RRs reflect the prefix to the other regional RR pairs, which distribute it to their workers. + +The global tier is not in the intra-region path. A worker receiving a prefix from another worker in the same region never touches the global tier. This is what makes regional forwarding independent of global RR availability.
+ +### Route reflection flow — inter-region + +A worker in region X originates a tenant VPN prefix: + +1. Worker → both regional RR nodes in region X. +2. Regional RRs reflect to all workers in region X (intra-region done) and upstream to both global RRs. +3. Global RRs reflect to all other regional RR pairs. +4. Remote regional RRs distribute to their respective workers. + +The SRv6 SID in the TLV is set by the originating worker. Remote workers install the prefix and use that SID for encapsulation — they steer traffic toward the origin's locator, which the underlay resolves via the SRv6 locator advertisement. + +### ADD-PATH + +Regional RRs should advertise multiple paths (BGP ADD-PATH, RFC 7911) to workers where multiple equal-cost paths exist across the fabric. Without ADD-PATH, a worker receives only the best path the RR selected — you lose visibility into alternate paths and make ECMP harder to exploit correctly. Enable ADD-PATH on all regional RR sessions; configure workers to consume it. + +### Next-hop handling + +Regional RRs must **not** modify the NEXT_HOP attribute on reflected routes. The next-hop for a VPN route is the originating PE's loopback (or SRv6 locator address) — the RR is a reflector, not a transit node. If next-hop rewrite is enabled by mistake, workers will try to reach the RR as next-hop and the data plane breaks. + +Explicitly configure `next-hop-unchanged` (or equivalent) on all RR client peering groups. Verify this in staging before bringing up the first regional cluster. + +### Graceful restart + +Configure BGP Graceful Restart (RFC 4724) on regional RR nodes. During a planned RR restart (software upgrade, config push), workers should not withdraw all routes immediately. GR gives the RR time to re-establish sessions and re-reflect routes before workers flush their RIB state. Set the GR restart timer conservatively — 120s is reasonable; enough time for an RR to come back without triggering unnecessary reconvergence. 
+ +Workers should be GR-aware (Helper mode). The RR is the restarting speaker; workers are helpers that hold state during the restart window. + +### Scaling limits + +At the regional tier, the binding constraint is the number of active BGP sessions and the VRF/prefix table size the RR must hold. The RR must hold the full regional table plus the inter-region table reflected from the global tier. At current PoP counts this is well within commodity server capacity, but instrument it: + +- Monitor BGP session count and RIB size per RR node via gNMI. +- Alert if either RR in a pair loses sessions that its partner is still holding — that's a split you need to see immediately. +- Alert on RIB size divergence between the two nodes in a pair — a significant delta indicates a reflection or session issue. + +The design is linear: adding a worker node to a region adds 2 sessions to that region's RR pair (one per RR node). There is no fan-out, no O(n²) growth. This holds as long as workers peer only with their regional pair. --- -## Full architecture +## Failure modes + +### Global RR loss (one node) + +No outage. The surviving global RR continues reflecting between all regional clusters. Inter-region convergence degrades slightly until the failed node is restored. + +### Global RR loss (both nodes) + +Inter-region routes are no longer updated. Existing routes remain in the regional RIBs; intra-region forwarding is completely unaffected. New prefixes originating in one region will not reach other regions until the global tier recovers. + +> **This must be tested explicitly in staging.** The test: withdraw both global RRs, originate a new prefix in one region, confirm it does not appear in other regions' RIBs, confirm all existing inter-region routes remain installed and forwarding. -The diagram below shows the complete BGP control plane from tenant pod through to -the global RR tier, including the per-node component relationships.
+### Regional RR node loss (one of pair) + +No service impact. Workers continue peering with the surviving RR. The failed node should be replaced within the SLO window — a degraded pair is a single point of failure. + +### Worker node BGP session loss + +Routes originated by the affected worker are withdrawn from the fabric. The control plane reconverges once sessions re-establish. In-flight traffic to the affected worker black-holes until sessions are restored. + +--- + +## Timer recommendations -![Galactic VPC — BGP control plane full architecture](bgp-full-architecture.svg) +| Timer | Value | Rationale | +|---------------|-------|---------------------------------------------| +| BGP keepalive | 10s | Faster detection without excessive overhead | +| BGP hold time | 30s | 3× keepalive; aggressive but stable | +| RR reconnect | 5s | Fast reconnect after transient loss | +| GR restart | 120s | Sufficient for planned RR restarts | -> Source: [`bgp-full-architecture.puml`](bgp-full-architecture.puml) +BGP hold timer expiry is the failure detection mechanism at this layer — up to 30s with the above settings. Sub-second failure detection is an underlay concern: the underlay should withdraw reachability fast enough that BGP sessions drop and reconverge without waiting for hold timer expiry. The control plane relies on that signal; it does not attempt to replicate it. 
diff --git a/enhancements/networking/bgp-control-plane/bgp-context.puml b/enhancements/networking/bgp-control-plane/bgp-context.puml deleted file mode 100644 index 0d2df56..0000000 --- a/enhancements/networking/bgp-control-plane/bgp-context.puml +++ /dev/null @@ -1,41 +0,0 @@ -@startuml bgp-context -!$NEW_C4_STYLE = 1 -!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Context.puml -!include https://raw.githubusercontent.com/datum-cloud/enhancements/refs/heads/main/enhancements/datum-theme.puml - -title BGP control plane — system context - -Boundary(tier0, "Tier 0 — Global RRs") { - System(rr_nyc, "us-east-2", "Global RR\nNew York City") - System(rr_fra, "de-central-1", "Global RR\nFrankfurt") -} - -Boundary(tier1_am, "Tier 1 — Americas") { - System(rr_am, "Americas RR pair", "Anchored at NYC\nactive/active") - System(pop_am, "7 × worker nodes", "GoBGP\nus-east-1/2, us-central-1, us-west-1, ca-east-1, br-east-1, cl-central-1") -} - -Boundary(tier1_eu, "Tier 1 — EMEA") { - System(rr_eu, "EMEA RR pair", "Anchored at Frankfurt\nactive/active") - System(pop_eu, "5 × worker nodes", "GoBGP\nde-central-1, gb-south-1, nl-west-1, ae-north-1, za-central-1") -} - -Boundary(tier1_ap, "Tier 1 — Asia-Pacific") { - System(rr_ap, "APAC RR pair", "Anchored at Singapore\nactive/active") - System(pop_ap, "4 × worker nodes", "GoBGP\nsg-central-1, jp-east-1, au-east-1, in-west-1") -} - -Rel(rr_nyc, rr_fra, "iBGP full mesh") - -Rel(rr_am, rr_nyc, "iBGP", "inter-region routes") -Rel(rr_am, rr_fra, "iBGP", "inter-region routes") -Rel(rr_eu, rr_nyc, "iBGP", "inter-region routes") -Rel(rr_eu, rr_fra, "iBGP", "inter-region routes") -Rel(rr_ap, rr_nyc, "iBGP", "inter-region routes") -Rel(rr_ap, rr_fra, "iBGP", "inter-region routes") - -Rel(pop_am, rr_am, "iBGP", "2 sessions per node") -Rel(pop_eu, rr_eu, "iBGP", "2 sessions per node") -Rel(pop_ap, rr_ap, "iBGP", "2 sessions per node") - -@enduml diff --git 
a/enhancements/networking/bgp-control-plane/bgp-context.svg b/enhancements/networking/bgp-control-plane/bgp-context.svg deleted file mode 100644 index b2d1614..0000000 --- a/enhancements/networking/bgp-control-plane/bgp-context.svg +++ /dev/null @@ -1,82 +0,0 @@ -BGP control plane — system contextTier 0 — Global RRsTier 1 — AmericasTier 1 — EMEATier 1 — Asia-Pacificus-east-2Global RR (FRR)New York Cityde-central-1Global RR (FRR)FrankfurtAmericas RR pairFRR — Anchored at NYCactive/active7 × worker nodesGoBGPus-east-1/2, us-central-1, us-west-1ca-east-1, br-east-1, cl-central-1EMEA RR pairFRR — Anchored at Frankfurtactive/active5 × worker nodesGoBGPde-central-1, gb-south-1, nl-west-1ae-north-1, za-central-1APAC RR pairFRR — Anchored at Singaporeactive/active4 × worker nodesGoBGPsg-central-1, jp-east-1au-east-1, in-west-1iBGP full meshiBGPiBGPiBGPiBGPiBGPiBGPiBGP (2 sessions/node)iBGP (2 sessions/node)iBGP (2 sessions/node) \ No newline at end of file diff --git a/enhancements/networking/bgp-control-plane/bgp-full-architecture.puml b/enhancements/networking/bgp-control-plane/bgp-full-architecture.puml deleted file mode 100644 index 8fcc275..0000000 --- a/enhancements/networking/bgp-control-plane/bgp-full-architecture.puml +++ /dev/null @@ -1,101 +0,0 @@ -@startuml bgp-full-architecture -!$NEW_C4_STYLE = 1 -!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Deployment.puml -!include https://raw.githubusercontent.com/datum-cloud/enhancements/refs/heads/main/enhancements/datum-theme.puml - -title Galactic VPC — BGP control plane full architecture - -' ─── Tier 0: Global RRs ─────────────────────────────────────────────── -Boundary(tier0, "Tier 0 — Global RRs") { - Deployment_Node(nyc_site, "us-east-2 · New York City") { - Container(grr1, "Global RR 1", "FRR", "iBGP cluster A\nreflects inter-region routes") - } - Deployment_Node(fra_site, "de-central-1 · Frankfurt") { - Container(grr2, "Global RR 2", "FRR", "iBGP cluster B\nreflects 
inter-region routes") - } -} - -' ─── Tier 1: Regional RRs ───────────────────────────────────────────── -Boundary(tier1, "Tier 1 — Regional RRs") { - Deployment_Node(am_region, "Americas") { - Container(rrA1, "Americas RR — primary", "FRR", "Anchor: us-east-2") - Container(rrA2, "Americas RR — secondary", "FRR", "active/active pair") - } - Deployment_Node(eu_region, "EMEA") { - Container(rrE1, "EMEA RR — primary", "FRR", "Anchor: de-central-1") - Container(rrE2, "EMEA RR — secondary", "FRR", "active/active pair") - } - Deployment_Node(ap_region, "Asia-Pacific") { - Container(rrP1, "APAC RR — primary", "FRR", "Anchor: sg-central-1") - Container(rrP2, "APAC RR — secondary", "FRR", "active/active pair") - } -} - -' ─── Tier 2: Worker nodes (representative PoP per region) ───────────── -Boundary(tier2, "Tier 2 — Worker nodes (representative PoPs)") { - Deployment_Node(pop_am, "us-east-1 · Ashburn (representative)") { - Deployment_Node(k8s_am, "K8s worker node") { - Container(gobgp_am, "GoBGP", "DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\nRFC 9252 SRv6 Services TLV") - Container(op_am, "galactic-operator", "DaemonSet (privileged)", "VRF lifecycle\nlxc enslavement\ngRPC → GoBGP") - Container(cilium_am, "Cilium", "CNI DaemonSet", "veth/lxc lifecycle\neBPF policy") - } - } - Deployment_Node(pop_eu, "gb-south-1 · London (representative)") { - Deployment_Node(k8s_eu, "K8s worker node") { - Container(gobgp_eu, "GoBGP", "DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\nRFC 9252 SRv6 Services TLV") - Container(op_eu, "galactic-operator", "DaemonSet (privileged)", "VRF lifecycle\nlxc enslavement\ngRPC → GoBGP") - Container(cilium_eu, "Cilium", "CNI DaemonSet", "veth/lxc lifecycle\neBPF policy") - } - } - Deployment_Node(pop_ap, "sg-central-1 · Singapore (representative)") { - Deployment_Node(k8s_ap, "K8s worker node") { - Container(gobgp_ap, "GoBGP", "DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\nRFC 9252 SRv6 Services TLV") - Container(op_ap, "galactic-operator", "DaemonSet 
(privileged)", "VRF lifecycle\nlxc enslavement\ngRPC → GoBGP") - Container(cilium_ap, "Cilium", "CNI DaemonSet", "veth/lxc lifecycle\neBPF policy") - } - } -} - -' ─── Tier 0 iBGP full mesh ──────────────────────────────────────────── -Rel(grr1, grr2, "iBGP full mesh") - -' ─── Regional → Global (both RRs in each pair peer both globals) ─────── -Rel(rrA1, grr1, "iBGP") -Rel(rrA1, grr2, "iBGP") -Rel(rrA2, grr1, "iBGP") -Rel(rrA2, grr2, "iBGP") - -Rel(rrE1, grr1, "iBGP") -Rel(rrE1, grr2, "iBGP") -Rel(rrE2, grr1, "iBGP") -Rel(rrE2, grr2, "iBGP") - -Rel(rrP1, grr1, "iBGP") -Rel(rrP1, grr2, "iBGP") -Rel(rrP2, grr1, "iBGP") -Rel(rrP2, grr2, "iBGP") - -' ─── Workers → Regional (2 sessions per worker) ─────────────────────── -Rel(gobgp_am, rrA1, "iBGP") -Rel(gobgp_am, rrA2, "iBGP") - -Rel(gobgp_eu, rrE1, "iBGP") -Rel(gobgp_eu, rrE2, "iBGP") - -Rel(gobgp_ap, rrP1, "iBGP") -Rel(gobgp_ap, rrP2, "iBGP") - -' ─── Intra-node relationships ───────────────────────────────────────── -Rel(op_am, gobgp_am, "gRPC — VRF/path config") -Rel(op_am, cilium_am, "Netlink — lxc NEWLINK watch") -Rel(cilium_am, k8s_am, "Netlink — veth pairs") - -Rel(op_eu, gobgp_eu, "gRPC — VRF/path config") -Rel(op_eu, cilium_eu, "Netlink — lxc NEWLINK watch") -Rel(cilium_eu, k8s_eu, "Netlink — veth pairs") - -Rel(op_ap, gobgp_ap, "gRPC — VRF/path config") -Rel(op_ap, cilium_ap, "Netlink — lxc NEWLINK watch") -Rel(cilium_ap, k8s_ap, "Netlink — veth pairs") - -@enduml diff --git a/enhancements/networking/bgp-control-plane/bgp-full-architecture.svg b/enhancements/networking/bgp-control-plane/bgp-full-architecture.svg deleted file mode 100644 index 0316bae..0000000 --- a/enhancements/networking/bgp-control-plane/bgp-full-architecture.svg +++ /dev/null @@ -1,159 +0,0 @@ -Galactic VPC — BGP control plane full architectureTier 0 — Global RRsus-east-2 · New York Cityde-central-1 · FrankfurtTier 1 — Regional RRsAmericasEMEAAsia-PacificTier 2 — Worker nodes (representative PoPs)us-east-1 · AshburnK8s worker 
nodegb-south-1 · LondonK8s worker nodesg-central-1 · SingaporeK8s worker nodeGlobal RR 1FRRiBGP cluster Areflects inter-region routesGlobal RR 2FRRiBGP cluster Breflects inter-region routesAmericas RR — primaryFRR · Anchor: us-east-2Americas RR — secondaryFRR · active/active pairEMEA RR — primaryFRR · Anchor: de-central-1EMEA RR — secondaryFRR · active/active pairAPAC RR — primaryFRR · Anchor: sg-central-1APAC RR — secondaryFRR · active/active pairGoBGPDaemonSet (hostNetwork)VPNv4/VPNv6 PE · RFC 9252galactic-operatorDaemonSet (privileged)VRF lifecycle · gRPC→GoBGPCiliumCNI DaemonSetveth/lxc lifecycle · eBPF policyGoBGPDaemonSet (hostNetwork)VPNv4/VPNv6 PE · RFC 9252galactic-operatorDaemonSet (privileged)VRF lifecycle · gRPC→GoBGPCiliumCNI DaemonSetveth/lxc lifecycle · eBPF policyGoBGPDaemonSet (hostNetwork)VPNv4/VPNv6 PE · RFC 9252galactic-operatorDaemonSet (privileged)VRF lifecycle · gRPC→GoBGPCiliumCNI DaemonSetveth/lxc lifecycle · eBPF policyiBGP full meshiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPiBGPgRPCNetlinkgRPCNetlinkgRPCNetlink \ No newline at end of file diff --git a/enhancements/networking/bgp-control-plane/bgp-worker-component.puml b/enhancements/networking/bgp-control-plane/bgp-worker-component.puml deleted file mode 100644 index ec9de05..0000000 --- a/enhancements/networking/bgp-control-plane/bgp-worker-component.puml +++ /dev/null @@ -1,25 +0,0 @@ -@startuml bgp-worker-component -!$NEW_C4_STYLE = 1 -!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Component.puml -!include https://raw.githubusercontent.com/datum-cloud/enhancements/refs/heads/main/enhancements/datum-theme.puml - -title Worker node — component view - -Boundary(node, "K8s worker node") { - Component(cilium, "Cilium", "CNI DaemonSet", "veth lifecycle, eBPF policy, pod /128 routes") - Component(gobgp, "GoBGP", "BGP DaemonSet (hostNetwork)", "VPNv4/VPNv6 PE\none VRF per tenant\nRFC 9252 SRv6 Services TLV") - Component(operator, 
"galactic-operator", "Controller DaemonSet (hostNetwork, privileged)", "VRF lifecycle, lxc enslavement\nGoBGP config via gRPC") -} - -Boundary(kernel, "Linux kernel") { - Component(vrf, "VRF devices", "vrf-tenant-{name}", "One per tenant\nseparate routing table") - Component(seg6, "seg6local", "SRv6 End.DT4 / End.DT6 / End.DT46", "Decap + VRF lookup\nfor inbound tenant traffic") -} - -Rel(cilium, kernel, "Creates lxcXXXX veth pairs", "Netlink") -Rel(operator, vrf, "Creates/destroys", "Netlink") -Rel(operator, cilium, "Watches lxc NEWLINK events", "Netlink") -Rel(operator, gobgp, "Configures VRFs and paths", "gRPC") -Rel(gobgp, seg6, "Programs End.DT4/DT6/DT46 SIDs", "Netlink / kernel route") - -@enduml diff --git a/enhancements/networking/bgp-control-plane/bgp-worker-component.svg b/enhancements/networking/bgp-control-plane/bgp-worker-component.svg deleted file mode 100644 index abae4f5..0000000 --- a/enhancements/networking/bgp-control-plane/bgp-worker-component.svg +++ /dev/null @@ -1,57 +0,0 @@ -Worker node — component viewK8s worker nodeLinux kernelCiliumCNI DaemonSetveth lifecycle, eBPF policypod /128 routesGoBGPBGP DaemonSet (hostNetwork)VPNv4/VPNv6 PEone VRF per tenant · RFC 9252galactic-operatorController DaemonSet (privileged)VRF lifecycle, lxc enslavementGoBGP config via gRPCVRF devicesvrf-tenant-{name}One per tenantseparate routing tableseg6localSRv6 End.DT4 / End.DT6 / End.DT46Decap + VRF lookupfor inbound tenant trafficCreates lxcXXXX veth pairs [Netlink]Creates/destroys [Netlink]Watches lxc NEWLINK events [Netlink]Configures VRFs and paths [gRPC]Programs End.DT4/DT6/DT46 SIDs [Netlink] \ No newline at end of file