diff --git a/ansible/files/wiab_server_nftables.conf.j2 b/ansible/files/wiab_server_nftables.conf.j2 index 709c0e6c9..bf36bb6b2 100644 --- a/ansible/files/wiab_server_nftables.conf.j2 +++ b/ansible/files/wiab_server_nftables.conf.j2 @@ -67,7 +67,9 @@ table ip nat { chain POSTROUTING { type nat hook postrouting priority 100; oifname != docker0 ip saddr 172.17.0.0/16 counter masquerade +{% if not (private_deployment | default(true) | bool) %} oifname $INF_WAN counter masquerade comment "{{ wire_comment }} masquerade outgoing traffic" +{% endif %} } chain DOCKER { iifname docker0 counter return diff --git a/ansible/inventory/demo/wiab-staging.yml b/ansible/inventory/demo/wiab-staging.yml index a2f35b678..a3fda05b7 100644 --- a/ansible/inventory/demo/wiab-staging.yml +++ b/ansible/inventory/demo/wiab-staging.yml @@ -6,4 +6,6 @@ wiab-staging: ansible_user: 'demo' ansible_ssh_private_key_file: "~/.ssh/id_ed25519" vars: - artifact_hash: deed80d356cbbc2274de3b125313dfa506a1034e + artifact_hash: 8cd7cf27c149f990a9bca54f196e21fc326cde04 + # when enabled, disable WAN SNAT/masquerading for VMs on the private network + private_deployment: true diff --git a/ansible/wiab-staging-provision.yml b/ansible/wiab-staging-provision.yml index f0bff10c4..5f75e9656 100644 --- a/ansible/wiab-staging-provision.yml +++ b/ansible/wiab-staging-provision.yml @@ -297,9 +297,8 @@ kubenode2_ip: "{{ kubenode_ip_result.results[1].stdout }}" kubenode3_ip: "{{ kubenode_ip_result.results[2].stdout }}" wire_comment: "wiab-stag" - tags: always - name: Configure nftables import_playbook: ./wiab-staging-nftables.yaml - tags: nftables + tags: [never, nftables] diff --git a/bin/debug_logs.sh b/bin/debug_logs.sh index 8a40701b3..3138e025f 100755 --- a/bin/debug_logs.sh +++ b/bin/debug_logs.sh @@ -4,14 +4,14 @@ set -euo pipefail echo "Printing all pods status" kubectl get pods --all-namespaces echo "------------------------------------" -namespaces=$(kubectl get ns -o=jsonpath='{.items[*].metadata.name}') 
+namespaces="cert-manager-ns default" echo "Namespaces = $namespaces" for ns in $namespaces; do - pods=$(kubectl get pods --all-namespaces -o=jsonpath='{.items[*].metadata.name}') + pods=$(kubectl get pods -n "$ns" -o=jsonpath='{.items[*].metadata.name}') echo "Pods in namespace: $ns = $pods" for pod in $pods; do echo "Logs for pod: $pod" - kubectl logs --all-containers -n "$ns" "$pod" || true + kubectl logs --tail 30 --all-containers -n "$ns" "$pod" || true echo "Description for pod: $pod" kubectl describe pod -n "$ns" "$pod" || true echo "------------------------------------" diff --git a/bin/helm-operations.sh b/bin/helm-operations.sh index 1298ed065..d60a1805a 100755 --- a/bin/helm-operations.sh +++ b/bin/helm-operations.sh @@ -3,17 +3,35 @@ set -Eeo pipefail # Read values from environment variables with defaults -BASE_DIR="/wire-server-deploy" -TARGET_SYSTEM="example.dev" -CERT_MASTER_EMAIL="certmaster@${TARGET_SYSTEM}" +BASE_DIR="${BASE_DIR:-/wire-server-deploy}" +TARGET_SYSTEM="${TARGET_SYSTEM:-example.com}" +CERT_MASTER_EMAIL="${CERT_MASTER_EMAIL:-certmaster@example.com}" + +# DEPLOY_CERT_MANAGER env variable is used to decide if cert_manager and nginx-ingress-services charts should get deployed +# default is set to TRUE to deploy it unless changed +DEPLOY_CERT_MANAGER="${DEPLOY_CERT_MANAGER:-TRUE}" + +# DUMP_LOGS_ON_FAIL to dump logs on failure +# it is false by default +DUMP_LOGS_ON_FAIL="${DUMP_LOGS_ON_FAIL:-FALSE}" # this IP should match the DNS A record value for TARGET_SYSTEM # assuming it to be the public address used by clients to reach public Address -HOST_IP="" +HOST_IP="${HOST_IP:-}" + if [ -z "$HOST_IP" ]; then HOST_IP=$(wget -qO- https://api.ipify.org) fi +function dump_debug_logs { + local exit_code=$? 
+ if [[ "$DUMP_LOGS_ON_FAIL" == "TRUE" ]]; then + "$BASE_DIR"/bin/debug_logs.sh + fi + return $exit_code +} +trap dump_debug_logs ERR + # picking a node for calling traffic (3rd kube worker node) CALLING_NODE=$(kubectl get nodes --no-headers | tail -n 1 | awk '{print $1}') if [[ -z "$CALLING_NODE" ]]; then @@ -21,13 +39,28 @@ if [[ -z "$CALLING_NODE" ]]; then exit 1 fi +sync_pg_secrets() { + echo "Retrieving PostgreSQL password from databases-ephemeral for wire-server deployment..." + if kubectl get secret wire-postgresql-external-secret &>/dev/null; then + # Usage: sync-k8s-secret-to-wire-secrets.sh + "$BASE_DIR/bin/sync-k8s-secret-to-wire-secrets.sh" \ + wire-postgresql-external-secret password \ + "$BASE_DIR/values/wire-server/secrets.yaml" \ + .brig.secrets.pgPassword .galley.secrets.pgPassword .background-worker.secrets.pgPassword + else + echo "⚠️ Warning: PostgreSQL secret 'wire-postgresql-secret' not found, skipping secret sync" + echo " Make sure databases-ephemeral chart is deployed before wire-server" + fi + return $? +} + # Creates values.yaml from prod-values.example.yaml and secrets.yaml from prod-secrets.example.yaml # Works on all chart directories in $BASE_DIR/values/ process_values() { ENV=$1 TYPE=$2 - charts=(fake-aws demo-smtp databases-ephemeral reaper wire-server webapp account-pages team-settings smallstep-accomp ingress-nginx-controller nginx-ingress-services coturn sftd cert-manager) + charts=(fake-aws demo-smtp rabbitmq databases-ephemeral reaper wire-server webapp account-pages team-settings ingress-nginx-controller nginx-ingress-services coturn sftd cert-manager) if [[ "$ENV" != "prod" ]] || [[ -z "$TYPE" ]] ; then echo "Error: This function only supports prod deployments with TYPE as values or secrets. 
ENV must be 'prod', got: '$ENV' and '$TYPE'" @@ -147,7 +180,7 @@ deploy_charts() { deploy_cert_manager() { kubectl get namespace cert-manager-ns || kubectl create namespace cert-manager-ns - helm upgrade --install -n cert-manager-ns cert-manager "$BASE_DIR/charts/cert-manager" --values "$BASE_DIR/values/cert-manager/values.yaml" + helm upgrade --install --wait --timeout=5m0s -n cert-manager-ns cert-manager "$BASE_DIR/charts/cert-manager" --values "$BASE_DIR/values/cert-manager/values.yaml" # display running pods kubectl get pods --sort-by=.metadata.creationTimestamp -n cert-manager-ns @@ -158,36 +191,45 @@ deploy_calling_services() { echo "Deploying sftd and coturn" # select the node to deploy sftd kubectl annotate node "$CALLING_NODE" wire.com/external-ip="$HOST_IP" --overwrite - helm upgrade --install sftd "$BASE_DIR/charts/sftd" --set "nodeSelector.kubernetes\\.io/hostname=$CALLING_NODE" --values "$BASE_DIR/values/sftd/values.yaml" + helm upgrade --install --wait --timeout=5m0s sftd "$BASE_DIR/charts/sftd" --set "nodeSelector.kubernetes\\.io/hostname=$CALLING_NODE" --values "$BASE_DIR/values/sftd/values.yaml" kubectl annotate node "$CALLING_NODE" wire.com/external-ip="$HOST_IP" --overwrite - helm upgrade --install coturn "$BASE_DIR/charts/coturn" --set "nodeSelector.kubernetes\\.io/hostname=$CALLING_NODE" --values "$BASE_DIR/values/coturn/values.yaml" --values "$BASE_DIR/values/coturn/secrets.yaml" + helm upgrade --install --wait --timeout=5m0s coturn "$BASE_DIR/charts/coturn" --set "nodeSelector.kubernetes\\.io/hostname=$CALLING_NODE" --values "$BASE_DIR/values/coturn/values.yaml" --values "$BASE_DIR/values/coturn/secrets.yaml" + + # display running pods post deploying all helm charts in default namespace + kubectl get pods --sort-by=.metadata.creationTimestamp } main() { + # Create prod-values.example.yaml to values.yaml and take backup process_values "prod" "values" # Create prod-secrets.example.yaml to secrets.yaml and take backup process_values "prod" 
"secrets" +# Sync postgresql secret +# sync_pg_secrets + # configure chart specific variables for each chart in values.yaml file configure_values # deploying with external datastores, useful for prod setup -deploy_charts cassandra-external elasticsearch-external minio-external rabbitmq-external fake-aws demo-smtp databases-ephemeral reaper wire-server webapp account-pages team-settings smallstep-accomp ingress-nginx-controller +deploy_charts cassandra-external elasticsearch-external minio-external fake-aws demo-smtp rabbitmq-external databases-ephemeral reaper wire-server webapp account-pages team-settings ingress-nginx-controller + +# deploying cert-manager only when the env var DEPLOY_CERT_MANAGER is set to TRUE +if [[ "$DEPLOY_CERT_MANAGER" == "TRUE" ]]; then + # deploying cert manager to issue certs, by default letsencrypt-http01 issuer is configured + deploy_cert_manager -# deploying cert manager to issue certs, by default letsencrypt-http01 issuer is configured -deploy_cert_manager + # nginx-ingress-services chart needs cert-manager to be deployed + deploy_charts nginx-ingress-services -# nginx-ingress-services chart needs cert-manager to be deployed -deploy_charts nginx-ingress-services + # print status of certs + kubectl get certificate +fi # deploying sft and coturn services -# not implemented yet deploy_calling_services - -# print status of certs -kubectl get certificate } -main \ No newline at end of file +main diff --git a/bin/offline-deploy.sh b/bin/offline-deploy.sh index 61c7d3dfa..3bede967a 100755 --- a/bin/offline-deploy.sh +++ b/bin/offline-deploy.sh @@ -41,4 +41,4 @@ fi $DOCKER_RUN_BASE $SSH_MOUNT $WSD_CONTAINER ./bin/offline-cluster.sh -sudo docker run --network=host -v $PWD:/wire-server-deploy $WSD_CONTAINER ./bin/helm-operations.sh +sudo docker run --network=host -v $PWD:/wire-server-deploy $WSD_CONTAINER sh -c 'TARGET_SYSTEM="example.dev" CERT_MASTER_EMAIL="certmaster@example.dev" DEPLOY_CERT_MANAGER=TRUE DUMP_LOGS_ON_FAIL=TRUE 
./bin/helm-operations.sh' diff --git a/offline/architecture-wiab-stag.png b/offline/architecture-wiab-stag.png new file mode 100644 index 000000000..9c5c5fd0c Binary files /dev/null and b/offline/architecture-wiab-stag.png differ diff --git a/offline/wiab-staging.md b/offline/wiab-staging.md index c460d87fb..8c3d68486 100644 --- a/offline/wiab-staging.md +++ b/offline/wiab-staging.md @@ -1,35 +1,36 @@ # Scope -**Wire in a Box (WIAB) Staging** is a demo installation of Wire running on a single physical machine using KVM-based virtual machines. This setup replicates the multi-node production Wire architecture in a consolidated environment suitable for testing, evaluation, and learning about Wire's infrastructure—but **not for production use**. +**Wire in a Box (WIAB) Staging** is an installation of Wire running on a single physical machine using KVM-based virtual machines. This setup replicates the multi-node production Wire architecture in a consolidated environment suitable for testing, evaluation, and learning about Wire's infrastructure—but **not for production use**. The main use of this package is to verify that automation inside and outside of the wire product functions in the fashion you expect, before you run said automation in production. This will not test your network environment, load based behaviors, or the interface between wire and it's calling services when using a DMZ'd network configuration. **Important:** This is a sandbox environment. Data from a staging installation cannot be migrated to production. WIAB Staging is designed for experimentation, validation, and understanding Wire's deployment model. 
+![Wire in a Box Staging Architecture](architecture-wiab-stag.png) + ## Requirements **Architecture Overview:** - Multiple VMs (7) are deployed to simulate production infrastructure with separate roles (Kubernetes, data services, asset storage) - All VMs share the same physical node and storage, creating a single failure domain -- [Calling services](https://docs.wire.com/latest/understand/overview.html#calling) will share the same k8s cluster as Wire services hence, all infrastructure will be DMZ (De-militarized zone). +- [Calling services](https://docs.wire.com/latest/understand/overview.html#calling) are deployed in the same Kubernetes cluster as Wire services. This setup does not implement a separate DMZ, and all components share the same network boundary, reducing the level of isolation compared to a production deployment. - This solution helps developers understand Wire's infrastructure requirements and test deployment processes **Resource Requirements:** -- One physical machine with hypervisor support: +- One physical machine (aka `adminhost`) with hypervisor support: - **Memory:** 55 GiB RAM - **Compute:** 29 vCPUs - **Storage:** 850 GB disk space (thin-provisioned) - - 7 VMs with [Ubuntu 22](https://releases.ubuntu.com/jammy/) as per (#VM-Provisioning) - **DNS Records**: - - a way to create DNS records for your domain name (e.g. wire.example.com) + - A method to create DNS records for your domain name (e.g. 
wire.example.com) - Find a detailed explanation at [How to set up DNS records](https://docs.wire.com/latest/how-to/install/demo-wiab.html#dns-requirements) - **SSL/TLS certificates**: - - a way to create SSL/TLS certificates for your domain name (to allow connecting via https://) + - A method to create SSL/TLS certificates for your domain name (to allow connecting via https://) - To ease out the process of managing certs, we recommend using [Let's Encrypt](https://letsencrypt.org/getting-started/) & [cert-manager](https://cert-manager.io/docs/tutorials/acme/http-validation/) - **Network**: No interference from UFW or other system specific firewalls, and IP forwarding enabled between network cards. An IP address reachable for ssh and which can act as entry point for Wire traffic. - **Wire-server-deploy artifact**: A tar bundle containing all the required bash scripts, deb packages, ansible playbooks, helm charts and docker images to help with the installation. Reach out to [Wire support](https://support.wire.com/) to get access to the latest stable Wire artifact. ## VM Provisioning -We would require 7 VMs as per the following details, you can choose to use your own hypervisor to manage the VMs or use our [Wiab staging ansible playbook](https://github.com/wireapp/wire-server-deploy/blob/master/ansible/wiab-staging-provision.yml) against your physical node to setup the VMs. +Our deployment will be into 7 VMs with [Ubuntu 22](https://releases.ubuntu.com/jammy/), shown in the below VM Architecture and Resource Allocation table. You can choose to use your own hypervisor to manage the VMs or use our [Wiab staging ansible playbook](https://github.com/wireapp/wire-server-deploy/blob/master/ansible/wiab-staging-provision.yml) against your physical node to setup the VMs. 
**VM Architecture and Resource Allocation:** @@ -50,20 +51,32 @@ We would require 7 VMs as per the following details, you can choose to use your - **kubenodes (kubenode1, kubenode2, kubenode3):** Run the Kubernetes cluster and host Wire backend services - **datanodes (datanode1, datanode2, datanode3):** Run distributed data services: - - Cassandra (distributed database) - - PostgreSQL (operational database) - - Elasticsearch (search engine) - - Minio (S3-compatible object storage) - - RabbitMQ (message broker) + - Cassandra + - PostgreSQL + - Elasticsearch + - Minio + - RabbitMQ - **assethost:** Hosts static assets to be used by kubenodes and datanodes +### Internet access for VMs: + +In most cases, Wire Server components do not require internet access, except in the following situations: +- **External email services** – If your users’ email providers are hosted on the public internet (for example, `user@gmail.com`). If outbound internet access is not allowed and no internal email service is available on your local network, email-based flows such as verification codes, invitations, and some login emails will not be delivered. In that case, you must retrieve the required codes from the logs instead. Read more at [I deployed demo-smtp and I want to skip email configuration and retrieve verification codes directly](https://docs.wire.com/latest/how-to/install/troubleshooting.html?h=smtp#i-deployed-demo-smtp-and-i-want-to-skip-email-configuration-and-retrieve-verification-codes-directly). +- **Mobile push notifications (FCM/APNS)** – Required to enable notifications for Android and Apple mobile devices. Wire uses [AWS services](https://docs.wire.com/latest/how-to/install/infrastructure-configuration.html#enable-push-notifications-using-the-public-appstore-playstore-mobile-wire-clients) to relay notifications to Firebase Cloud Messaging (FCM) and Apple Push Notification Service (APNS). 
+- **Third-party content previews** – If you want clients to display previews for services such as Giphy, Google, Spotify, or SoundCloud. Wire provides a proxy service for third-party content so clients do not communicate directly with these services, preventing exposure of IP addresses, cookies, or other metadata. +- **Federation with other Wire servers** – Required if your deployment needs to federate with another Wire server hosted on the public internet. + +> **Note:** Internet access is also required by the cert-manager pods (via Let's Encrypt) to issue TLS certificates when manual certificates are not used. +> +> This internet access is temporarily enabled as described in [cert-manager behaviour in NAT / bridge environments](#cert-manager-behaviour-in-nat--bridge-environments) to allow certificate issuance. Once the certificates are successfully issued by cert-manager, the internet access is removed from the VMs. + ## WIAB staging ansible playbook -The ansible playbook will perform the following operations for you: +The WIAB-staging ansible playbooks require internet access to be available on the target machine. Assuming it is available, these playbooks will perform the following steps automatically: **System Setup & Networking**: - Updates all system packages and installs required tools (git, curl, docker, qemu, libvirt, yq, etc.) 
- - Configures SSH, firewall (nftables), and user permissions (sudo, kvm, docker groups) + - Configures SSH and user permissions (sudo, kvm, docker groups) **wire-server-deploy Artifact & Ubuntu Cloud Image**: - Downloads wire-server-deploy static artifact and Ubuntu cloud image @@ -79,7 +92,6 @@ The ansible playbook will perform the following operations for you: - Generates inventory.yml with actual VM IPs replacing placeholders - Configures network interface variables for all k8s-nodes and datanodes - *Note: Skip the Ansible playbook step if you are managing VMs with your own hypervisor.* ### Getting started with Ansible playbook @@ -90,6 +102,7 @@ We need the whole ansible directory as ansible-playbook uses some templates for **Option A: Download as ZIP** ```bash +# requirements: wget and unzip wget https://github.com/wireapp/wire-server-deploy/archive/refs/heads/master.zip unzip master.zip cd wire-server-deploy-master @@ -97,6 +110,7 @@ cd wire-server-deploy-master **Option B: Clone with Git** ```bash +# requirements: git git clone https://github.com/wireapp/wire-server-deploy.git cd wire-server-deploy ``` @@ -104,8 +118,9 @@ cd wire-server-deploy **Step 2: Configure your Ansible inventory for your physical machine** A sample inventory is available at [ansible/inventory/demo/wiab-staging.yml](https://github.com/wireapp/wire-server-deploy/blob/master/ansible/inventory/demo/wiab-staging.yml). +Replace example.com with your physical machine (`adminhost`) address where KVM is available and adjust other variables like `ansible_user` and `ansible_ssh_private_key_file`. The SSH user for ansible `ansible_user` should have password-less `sudo` access. The adminhost should be running Ubuntu 22.04. From here on, we would refer the physical machine as `adminhost`. 
-*Note: Replace example.com with your physical machine address where KVM is available and adjust other variables accordingly.* +The `private_deployment` variable determines whether the VMs created below will have internet access. When set to `true` (default value), no internet access is available to VMs. Check [Network Traffic Configuration](#network-traffic-configuration) to understand more about it. **Step 3: Run the VM and network provision** @@ -117,30 +132,49 @@ ansible-playbook -i ansible/inventory/demo/wiab-staging.yml ansible/wiab-staging ## Ensure secondary ansible inventory for VMs -Now you should have 7 VMs running on your physical machine. If you have used the ansible playbook, you should also have a directory `/home/ansible_user/wire-server-deploy` with all resources required for further deployment. If you didn't use the above playbook, download the `wire-server-deploy` artifact shared by Wire support and unarchieve (tar tgz) it. +Now you should have 7 VMs running on your `adminhost`. If you have used the ansible playbook, you should also have a directory `/home/ansible_user/wire-server-deploy` with all resources required for further deployment. If you didn't use the above playbook, download the `wire-server-deploy` artifact shared by Wire support and extract it with tar. Ensure the inventory file `ansible/inventory/offline/inventory.yml` in the directory `/home/ansible_user/wire-server-deploy` contains values corresponding to your VMs. If you have already used the [Ansible playbook above](#getting-started-with-ansible-playbook) to set up VMs, this file should have been prepared for you. +The purpose of secondary ansible inventory is to interact only with the VMs. All the operations concerning the secondary inventory are meant to install datastores and k8s services. 
+ ## Next steps -Since the inventory is ready, please continue with the following steps: +Once the inventory is ready, please continue with the following steps: + +> **Note**: All next steps assume that the wire-server-deploy artifact has been downloaded on the `adminhost` (your physical machine) and extracted at `/home/ansible_user/wire-server-deploy`. All commands from here on will be issued from this directory on the `adminhost`. Make sure you SSH into the node before proceeding. ### Environment Setup - **[Making tooling available in your environment](docs_ubuntu_22.04.md#making-tooling-available-in-your-environment)** - - Source the `bin/offline-env.sh` shell script by running `source bin/offline-env.sh` to set up a `d` alias that runs commands inside a Docker container with all necessary tools for offline deployment. + - Source the `bin/offline-env.sh` shell script by running the following command to set up a `d` alias that runs commands inside a Docker container with all necessary tools for offline deployment. + ```bash + source bin/offline-env.sh + ``` + - You can always use this alias `d` later to interact with the ansible playbooks, k8s cluster and the helm charts. + - The docker container mounts everything here from the `wire-server-deploy` directory, hence this acts as an entry point for all the future interactions with ansible, k8s and helm charts. - **[Generating secrets](docs_ubuntu_22.04.md#generating-secrets)** - - Run `./bin/offline-secrets.sh` to generate fresh secrets for Minio and coturn services. This creates two secret files: `ansible/inventory/group_vars/all/secrets.yaml` and `values/wire-server/secrets.yaml`. + - Run `bin/offline-secrets.sh` to generate fresh secrets for Minio and coturn services. It uses the docker container images shipped inside the `wire-server-deploy` directory. 
+ ```bash + ./bin/offline-secrets.sh + ``` + - This creates following secret files: + - `ansible/inventory/group_vars/all/secrets.yaml` + - `values/wire-server/secrets.yaml` + - `values/coturn/prod-secrets.example.yaml` ### Kubernetes & Data Services Deployment - **[Deploying Kubernetes and stateful services](docs_ubuntu_22.04.md#deploying-kubernetes-and-stateful-services)** - - Run `d ./bin/offline-cluster.sh` to deploy Kubernetes and stateful services (Cassandra, PostgreSQL, Elasticsearch, Minio, RabbitMQ). This script deploys all infrastructure needed for Wire backend operations. + ```bash + d ./bin/offline-cluster.sh + ``` + - Run the above command to deploy Kubernetes and stateful services (Cassandra, PostgreSQL, Elasticsearch, Minio, RabbitMQ). This script deploys all infrastructure needed for Wire backend operations. ### Helm Operations to install wire services and supporting helm charts -**Helm chart deployment (automated):** The script `bin/helm-operations.sh` will deploy the charts for you. It prepares `values.yaml`/`secrets.yaml`, customizes them for your domain/IPs, then runs Helm installs/upgrades in the correct order. +**Helm chart deployment (automated):** The script `bin/helm-operations.sh` will deploy the charts for you. It prepares `values.yaml`/`secrets.yaml`, customizes them for your domain/IPs, then runs Helm installs/upgrades in the correct order. Prepare the values before running it. **User-provided inputs (set these before running):** - `TARGET_SYSTEM`: your domain (e.g., `wire.example.com` or `example.dev`). @@ -148,16 +182,20 @@ Since the inventory is ready, please continue with the following steps: - `HOST_IP`: public IP that matches your DNS A record (auto-detected if empty). **TLS / certificate behavior (cert-manager vs. Bring Your Own):** -- By default, `bin/helm-operations.sh` runs `deploy_cert_manager`, which installs cert-manager and configures a Let’s Encrypt (HTTP-01) issuer for the ingress charts. 
-- If you **do not** want Let’s Encrypt / cert-manager (for example, you are using **[Bring Your Own certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates)** or you cannot satisfy HTTP-01 requirements), disable this step by commenting out the `deploy_cert_manager` call inside `bin/helm-operations.sh`. - - After disabling cert-manager, ensure your ingress is configured with your own TLS secret(s) as described in the TLS documentation below. +- By default, `bin/helm-operations.sh` has `DEPLOY_CERT_MANAGER=TRUE`, which installs cert-manager and configures a Let’s Encrypt (HTTP-01) issuer for the ingress charts. +- If you **do not** want Let’s Encrypt / cert-manager (for example, you are using **[Bring Your Own certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates)**), disable this step by passing the environment variable `DEPLOY_CERT_MANAGER=FALSE` when running `bin/helm-operations.sh`. + - When choosing `DEPLOY_CERT_MANAGER=FALSE`, ensure your ingress is configured with your own TLS secret(s) as described at [Acquiring / Deploying SSL Certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates). + - When choosing `DEPLOY_CERT_MANAGER=TRUE`, ensure if further network configuration is required by following [cert-manager behaviour in NAT / bridge environments](#cert-manager-behaviour-in-nat--bridge-environments). -**To run the automated helm chart deployment**: -`d ./bin/helm-operations.sh` +**To run the automated helm chart deployment with your variables**: +```bash +# example command - verify the variables before running it +d sh -c 'TARGET_SYSTEM="example.dev" CERT_MASTER_EMAIL="certmaster@example.dev" DEPLOY_CERT_MANAGER=TRUE ./bin/helm-operations.sh' +``` **Charts deployed by the script:** - External datastores and helpers: `cassandra-external`, `elasticsearch-external`, `minio-external`, `rabbitmq-external`, `databases-ephemeral`, `reaper`, `fake-aws`, `demo-smtp`. 
-- Wire services: `wire-server`, `webapp`, `account-pages`, `team-settings`, `smallstep-accomp`. +- Wire services: `wire-server`, `webapp`, `account-pages`, `team-settings`. - Ingress and certificates: `ingress-nginx-controller`, `cert-manager`, `nginx-ingress-services`. - Calling services: `sftd`, `coturn`. @@ -165,115 +203,224 @@ Since the inventory is ready, please continue with the following steps: - Creates `values.yaml` and `secrets.yaml` from `prod-values.example.yaml` and `prod-secrets.example.yaml` for each chart under `values/`. - Backs up any existing `values.yaml`/`secrets.yaml` before replacing them. -**Values configured by the script:** -- Replaces `example.com` with `TARGET_SYSTEM` in Wire and webapp hostnames. -- Enables cert-manager and sets `certmasterEmail` using `CERT_MASTER_EMAIL`. -- Sets SFTD hosts and switches issuer to `letsencrypt-http01`. -- Sets coturn listen/relay/external IPs using the calling node IP and `HOST_IP`. - *Note: The `bin/helm-operations.sh` script above deploys these charts; you do not need to run the Helm commands manually unless you want to customize or debug.* ## Network Traffic Configuration -### Bring traffic from the physical machine to Wire services in the k8s cluster +### Bring traffic from the adminhost to Wire services in the k8s cluster -If you used the Ansible playbook earlier, nftables firewall rules are pre-configured to forward traffic. If you set up VMs manually with your own hypervisor, you must manually configure network traffic flow using nftables. +Our Wire services are ready to receive traffic but we must enable network access from the `adminhost` network interface to the k8s pods running in the virtual network. We can achieve it by setting up [nftables](https://documentation.ubuntu.com/security/security-features/network/firewall/nftables/) rules on the `adminhost`. When using any other type of firewall tools, please ensure following network configuration is achieved. 
**Required Network Configuration:** -The physical machine must forward traffic from external clients to the Kubernetes cluster running Wire services. This involves: - -1. **HTTP/HTTPS Traffic (Ingress)** - Forward ports 80 and 443 to the nginx-ingress-controller running on a Kubernetes node - - Port 80 (HTTP) → Kubernetes node port 31772 - - Port 443 (HTTPS) → Kubernetes node port 31773 - -2. **Calling Services Traffic (Coturn/SFT)** - Forward media and TURN protocol traffic to Coturn/SFT - - Port 3478 (TCP/UDP) → Coturn control traffic - - Ports 32768-65535 (UDP) → Media relay traffic for WebRTC calling +The `adminhost` must forward traffic from external clients to the Kubernetes cluster running Wire services. This involves: + +1. **HTTP/HTTPS Traffic (Ingress)** – Forward external web traffic to Kubernetes ingress with load balancing across nodes + - Port 80 (TCP, from any external source to adminhost WAN IP) → DNAT to any Kubernetes node on port 31772 → HTTP ingress + - Port 443 (TCP, from any external source to adminhost WAN IP) → DNAT to any Kubernetes node on port 31773 → HTTPS ingress + +2. **Calling Services Traffic (Coturn/SFT)** – Forward TURN control and media traffic to the dedicated calling node + - Port 3478 (TCP/UDP, from any external source to adminhost WAN IP) → DNAT to calling node → TURN control traffic + - Ports 32768–65535 (UDP, from any external source to adminhost WAN IP) → DNAT to calling node → WebRTC media relay + +3. **Normal Access Rules (Host-Level Access)** – Restrict direct access to adminhost + - Port 22 (TCP, from allowed sources to adminhost) → allow → SSH access + - Traffic from loopback and VM bridge interfaces → allow → internal communication + - Any traffic within VM network → allowed → ensures inter-node communication + - All other inbound traffic to adminhost → drop → default deny policy + +4. 
**Masquerading (If [Internet access for VMs](#internet-access-for-vms) is required)** – Enable outbound connectivity for VMs + - Any traffic from VM subnet leaving via WAN interface → SNAT/masquerade → ensures return traffic from internet. + - Controlled by the variable `private_deployment` + +5. **Conditional Rules (cert-manager / HTTP-01 in NAT setups)** – Temporary adjustments for certificate validation + - DNAT hairpin traffic (VM → public IP → VM) → may require SNAT/masquerade on VM bridge → ensures return path during HTTP-01 self-checks + - Asymmetric routing scenarios → may require relaxed reverse path filtering → prevents packet drops during validation + +```mermaid +flowchart TB + +%% External Clients +Client[External Client] +LetsEncrypt["(Optional)
Let's Encrypt"] +Internet["(If Required)
Internet Services
(AWS/FCM/APNS, Email Services etc)"] + +%% Admin Host +AdminHost["AdminHost
(Firewall)"] + +%% VM Network +subgraph VM_Network ["VM Network (virbr0)"] + K1[KubeNode1] + K2[KubeNode2] + K3["KubeNode3
(CALLING NODE)"] +end + +%% Ingress Traffic +Client -->|HTTPS → wire-records.example.com| AdminHost +AdminHost -->|"DNAT →31772/31773"| K1 +AdminHost -->|"DNAT →31772/31773"| K2 +AdminHost -->|"DNAT →31772/31773"| K3 + +%% Calling Traffic +Client -->|TCP/UDP Calling| AdminHost +AdminHost -->|DNAT → Calling Node| K3 + +%% Outbound Traffic (Masquerade) +K1 -.->|SNAT via AdminHost| Internet +K2 -.->|SNAT via AdminHost| Internet +K3 -.->|SNAT via AdminHost| Internet + +%% Cert-Manager Flow +K1 <-.->|HTTP-01 self-check| AdminHost +AdminHost-.->|Request TLS certificate| LetsEncrypt +``` **Implementation:** -Use the detailed nftables rules in [../ansible/files/wiab_server_nftables.conf.j2](../ansible/files/wiab_server_nftables.conf.j2) as the template. The guide covers: -- Defining your network variables (Coturn IP, Kubernetes node IP, WAN interface) -- Creating NAT rules for HTTP/HTTPS ingress traffic -- Setting up TURN protocol forwarding for Coturn -- Restarting nftables to apply changes +The nftables rules are detailed in [wiab_server_nftables.conf.j2](https://github.com/wireapp/wire-server-deploy/blob/master/ansible/files/wiab_server_nftables.conf.j2). Please ensure no other firewall services like `ufw` or `iptables` are configured on the node before continuing. -You can also apply these rules using the Ansible playbook, by following: +If you have already used the `wiab-staging-provision.yml` ansible playbook to create the VMs, then you can apply these rules using the same playbook (with the tag `nftables`) against your adminhost, by following: ```bash -ansible-playbook -i inventory.yml ansible/wiab-staging-nftables.yml +ansible-playbook -i ansible/inventory/demo/wiab-staging.yml ansible/wiab-staging-provision.yml --tags nftables ``` -*Note: If you ran the playbook wiab-staging-provision.yml then it might already be configured for you. 
Please confirm before running.* +> **Note:** You can use this playbook to change the internet access to VMs by modifying the variable `private_deployment` and re-running the above playbook. + +Alternatively, if you have not used the `wiab-staging-provision.yml` ansible playbook to create the VMs but would like to configure nftables rules, you can invoke the ansible playbook [wiab-staging-nftables.yaml](https://github.com/wireapp/wire-server-deploy/blob/master/ansible/wiab-staging-nftables.yaml) against the physical node. The playbook is available in the directory `wire-server-deploy/ansible`. + +The inventory file `inventory.yml` should define the following variables: +```yaml +wiab-staging: + hosts: + deploy_node: + # this should be the adminhost + ansible_host: example.com + ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -o TCPKeepAlive=yes' + ansible_user: 'demo' + ansible_ssh_private_key_file: "~/.ssh/id_ed25519" + vars: + # Kubernetes node IPs + kubenode1_ip: 192.168.122.11 + kubenode2_ip: 192.168.122.12 + kubenode3_ip: 192.168.122.13 + # Calling services node (kubenode3) + calling_node_ip: 192.168.122.13 + wire_comment: "wiab-stag" + # it will disable internet access to VMs created on the private network + private_deployment: true + # the playbook will try to find the default interface i.e. 
INF_WAN from ansible_default_ipv4.interface +``` -The inventory should define the following variables: +To implement the nftables rules, execute the following command: +```bash +# assuming the inventory.yml is stored at wire-server-deploy and the command is run from the same directory +ansible-playbook -i inventory.yml ansible/wiab-staging-nftables.yaml +``` -```ini -[all:vars] -# Kubernetes node IPs -kubenode1_ip=192.168.122.11 -kubenode2_ip=192.168.122.12 -kubenode3_ip=192.168.122.13 +### cert-manager behaviour in NAT / bridge environments -# Calling services node (usually kubenode3) -calling_node_ip=192.168.122.13 +When cert-manager performs HTTP-01 self-checks inside the cluster, traffic can hairpin: -# Host WAN interface name -inf_wan=eth0 -``` +- Pod → Node → host public IP → DNAT → Node → Ingress -> **Note (cert-manager & hairpin NAT):** -> When cert-manager performs HTTP-01 self-checks inside the cluster, traffic can hairpin (Pod → Node → host public IP → DNAT → Node → Ingress). -> If your nftables rules DNAT in `PREROUTING` without a matching SNAT on `virbr0 → virbr0`, return packets may bypass the host and break conntrack, causing HTTP-01 timeouts, resulting in certificate verification failure. -> Additionally, strict `rp_filter` can drop asymmetric return packets. -> If cert-manager is deployed in a NAT/bridge (`virbr0`) environment, first verify whether certificate issuance is failing before applying hairpin handling. -> Check whether certificates are successfully issued: +> **Note**: Using Let's Encrypt with `cert-manager` requires internet access ([to at least `acme-v02.api.letsencrypt.org`](https://letsencrypt.org/docs/acme-protocol-updates/)) to issue TLS certificates. If you have chosen to keep the network private, that is `private_deployment=true` for the VMs when applying nftables rules, then you need to make a temporary exception for this traffic. The same outbound access will also be required later for certificate renewals (after 180 days). 
+
+>
+> To temporarily provide outbound internet access from the VMs, add the following nftables masquerading rule on the `adminhost`. Replace `INF_WAN` with the WAN interface that should carry this traffic, or make an equivalent change in your firewall: +> > ```bash -> d kubectl get certificates +> # Host WAN interface name +> INF_WAN=enp41s0 +> sudo nft insert rule ip nat POSTROUTING position 0 \ +> oifname $INF_WAN \ +> counter masquerade \ +> comment "wire-masquerade-for-letsencrypt" > ``` -> If certificates are not in `Ready=True` state, inspect cert-manager logs for HTTP-01 self-check or timeout errors: +> +> If you are using a different implementation than nftables, then please ensure internet access for the VMs. + +In NAT/bridge setups (for example, using `virbr0` on the host): + +- If nftables DNAT rules exist in `PREROUTING` without a matching SNAT on `virbr0 → virbr0`, return packets may bypass the host and break conntrack, causing HTTP-01 timeouts and certificate verification failures. +- Overly strict `rp_filter` settings can drop asymmetric return packets. + +Before changing anything, first verify whether certificate issuance is actually failing: + +1. Check whether certificates are successfully issued: + ```bash + d kubectl get certificates + ``` +2. Check if k8s pods can access their own domain: + ```bash + # Replace the placeholders below. To find the aws-sns pod id, run the command: + # d kubectl get pods -l 'app=fake-aws-sns' + d kubectl exec -ti fake-aws-sns-<pod-id> -- sh -c 'curl --connect-timeout 10 -v webapp.<target-domain>' + ``` +3. If certificates are not in `Ready=True` state, inspect cert-manager logs for HTTP-01 self-check or timeout errors: + ```bash + # To find the <cert-manager-pod-name>, run the following command: + # d kubectl get pods -n cert-manager-ns -l 'app=cert-manager' + d kubectl logs -n cert-manager-ns <cert-manager-pod-name> + ``` + +If you observe HTTP-01 challenge timeouts or self-check failures in a NAT/bridge environment, hairpin SNAT and relaxed reverse-path filtering handling may be required. 
One possible approach is to make the following changes on the adminhost: + +> **Note:** All `nft` and `sysctl` commands should run on the adminhost. + +- Relax reverse-path filtering to loose mode to allow asymmetric flows: + ```bash + sudo sysctl -w net.ipv4.conf.all.rp_filter=2 + sudo sysctl -w net.ipv4.conf.virbr0.rp_filter=2 + ``` + These settings help conntrack reverse DNAT correctly and avoid drops during cert-manager’s HTTP-01 challenges in NAT/bridge (`virbr0`) environments. + +- Enable Hairpin SNAT (temporary for cert-manager HTTP-01): + ```bash + sudo nft insert rule ip nat POSTROUTING position 0 \ + iifname "virbr0" oifname "virbr0" \ + ip daddr 192.168.122.0/24 ct status dnat \ + counter masquerade \ + comment "wire-hairpin-dnat-virbr0" + ``` + This forces DNATed traffic that hairpins over the bridge to be masqueraded, ensuring return traffic flows back through the host and conntrack can correctly reverse the DNAT. + + Verify the rule was added: + ```bash + sudo nft list chain ip nat POSTROUTING + ``` + You should see a rule similar to: + ``` + iifname "virbr0" oifname "virbr0" ip daddr 192.168.122.0/24 ct status dnat counter masquerade # handle <number> + ``` + +- Remove the rule after certificates are issued; confirm they are issued by running the following: + ```bash + d kubectl get certificates + ``` + + Once Let’s Encrypt validation completes and certificates are issued, remove the temporary hairpin SNAT rule. Use the following pipeline to locate the rule handle and delete it safely: + ```bash + sudo nft -a list chain ip nat POSTROUTING | \ + grep wire-hairpin-dnat-virbr0 | \ + sed -E 's/.*handle ([0-9]+).*/\1/' | \ + xargs -r -I {} sudo nft delete rule ip nat POSTROUTING handle {} + ``` + +> **Note**: If you added an nftables rule above to allow temporary internet access for the VMs, remove it after certificate issuance is complete. 
+> +> To remove the nftables masquerading rule for all outgoing traffic run the following command: +> > ```bash -> d kubectl logs -n cert-manager-ns +> # remove the masquerading rule +> sudo nft -a list chain ip nat POSTROUTING | \ +> grep wire-masquerade-for-letsencrypt | \ +> sed -E 's/.*handle ([0-9]+).*/\1/' | \ +> xargs -r -I {} sudo nft delete rule ip nat POSTROUTING handle {} > ``` -> If you observe HTTP-01 challenge timeouts or self-check failures in a NAT/bridge environment, hairpin SNAT and relaxed reverse-path filtering handling may be required. - > - Relax reverse-path filtering to loose mode to allow asymmetric flows: - > ```bash - > sudo sysctl -w net.ipv4.conf.all.rp_filter=2 - > sudo sysctl -w net.ipv4.conf.virbr0.rp_filter=2 - > ``` - > These settings help conntrack reverse DNAT correctly and avoid drops during cert-manager’s HTTP-01 challenges in NAT/bridge (virbr0) environments. - > - > - Enable Hairpin SNAT (temporary for cert-manager HTTP-01): - > ```bash - > sudo nft insert rule ip nat POSTROUTING position 0 \ - > iifname "virbr0" oifname "virbr0" \ - > ip daddr 192.168.122.0/24 ct status dnat \ - > counter masquerade \ - > comment "wire-hairpin-dnat-virbr0" - > ``` - > This forces DNATed traffic that hairpins over the bridge to be masqueraded, ensuring return traffic flows back through the host and conntrack can correctly reverse the DNAT. - > Verify the rule was added: - > ```bash - > sudo nft list chain ip nat POSTROUTING - > ``` - > You should see a rule similar to: - > ``` - > iifname "virbr0" oifname "virbr0" ip daddr 192.168.122.0/24 ct status dnat counter masquerade # handle - > ``` - > - > - Remove the rule after certificates are issued - > ```bash - > d kubectl get certificates - > ``` - > - Once Let's Encrypt validation completes and certificates are issued, remove the temporary hairpin SNAT rule. 
Use the following pipeline to locate the rule handle and delete it safely: - > ```bash - > sudo nft -a list chain ip nat POSTROUTING | \ - > grep wire-hairpin-dnat-virbr0 | \ - > sed -E 's/.*handle ([0-9]+).*/\1/' | \ - > xargs -r -I {} sudo nft delete rule ip nat POSTROUTING handle {} - > ``` +> +> If you are using a different implementation than nftables then please ensure temporary Internet access to VMs has been removed. +> **Note**: If email delivery is not working, or if Android/iOS push notifications are still not working after you have configured the required AWS credentials, ensure the required outbound access is allowed as explained at [Internet access for VMs](#internet-access-for-vms). ## Further Reading @@ -282,7 +429,6 @@ inf_wan=eth0 - **[Deploying webapp](docs_ubuntu_22.04.md#deploying-webapp)**: Read more about webapp deployment and domain configuration. - **[Deploying team-settings](docs_ubuntu_22.04.md#deploying-team-settings)**: Read more about team settings services. - **[Deploying account-pages](docs_ubuntu_22.04.md#deploying-account-pages)**: Read more about account management services. -- **[Deploying smallstep-accomp](docs_ubuntu_22.04.md#deploying-smallstep-accomp)**: Read more about the ACME companion. - **[Enabling emails for wire](smtp.md)**: Read more about SMTP options for onboarding email delivery and relay setup. - **[Deploy ingress-nginx-controller](docs_ubuntu_22.04.md#deploy-ingress-nginx-controller)**: Read more about ingress configuration and traffic forwarding requirements. - **[Acquiring / Deploying SSL Certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates)**: Read more about TLS options (Bring Your Own or cert-manager) and certificate requirements. 
diff --git a/values/sftd/prod-values.example.yaml b/values/sftd/prod-values.example.yaml index e10f2d60a..1c2374f9e 100644 --- a/values/sftd/prod-values.example.yaml +++ b/values/sftd/prod-values.example.yaml @@ -1,3 +1,5 @@ +# this value should be set to 3 when deployed in a full production DMZ manner +# replicaCount = 1 is to support the simple wiab-staging solution replicaCount: 1 # image: # tag: some-tag # (only override if you want a newer/different version than what is in the chart) @@ -7,6 +9,16 @@ tls: issuerRef: name: letsencrypt-http01 kind: ClusterIssuer + +joinCall: +# this value should be set to 3 when deployed in a full production DMZ manner +# replicaCount = 1 is to support the simple wiab-staging solution + replicaCount: 1 + image: + repository: docker.io/bitnamilegacy/nginx + pullPolicy: IfNotPresent + tag: "1.27.3-debian-12-r5" + # Uncomment to enable SFT to SFT communication for federated calls # multiSFT: # enabled: true