lightseekorg · zhubohao911 · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.github/workflows/colocate-stability.yml b/.github/workflows/colocate-stability.yml
@@ -0,0 +1,64 @@
+name: Colocate 1000-step stability
+
+# Long-run memory-stability check for the colocate (MPS + NCCL) training
+# path -- the nightly counterpart to the per-PR colocate smoke. Runs
+# tests/colocate/test_stability.py with PHASE6_STABILITY_STEPS=1000 and
+# asserts peak GPU allocation stays within 1 % between a post-warmup
+# early step and a late step (see the test for the rationale).
+#
+# This job needs a real 4xH100 + working-MPS host, which GitHub-hosted
+# runners do not provide. It is therefore pinned to a self-hosted runner
+# labelled `colocate-stability`. Until such a runner is registered the
+# job is queued but never picked up -- that is intentional: the gating
+# structure lives here so wiring a runner is the only remaining step.
+#
+# Triggers:
+#   * schedule       -- nightly at 08:00 UTC.
+#   * workflow_dispatch -- manual, with an optional step-count override.
+#   * pull_request (labeled) -- only when the label being added is
+#     `colocate-stability`, so the expensive run is opt-in.
+
+on:
+  schedule:
+    - cron: "0 8 * * *"  # nightly at 08:00 UTC
+  workflow_dispatch:
+    inputs:
+      steps:  # optional manual override, read by the job's PHASE6_STABILITY_STEPS env
+        description: "PHASE6_STABILITY_STEPS override"
+        required: false
+        default: "1000"
+  pull_request:
+    types: [labeled]  # fires on every label add; the job-level `if` filters by label name
+
+concurrency:
+  group: colocate-stability-${{ github.ref }}  # one concurrency group per ref
+  cancel-in-progress: false  # never cancel a run that is already in progress
+
+jobs:
+  stability:
+    # Run on schedule/dispatch unconditionally; on a PR only when the
+    # label that was just added is `colocate-stability` (adding any other
+    # label skips the job). Keeps the 4xH100 run opt-in per PR.
+    if: >-
+      github.event_name != 'pull_request' ||
+      github.event.label.name == 'colocate-stability'
+    runs-on: [self-hosted, gpu, colocate-stability]
+    timeout-minutes: 180
+    env:
+      # workflow_dispatch can override; schedule / PR-label use 1000.
+      PHASE6_STABILITY_STEPS: ${{ github.event.inputs.steps || '1000' }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run 1000-step colocate stability
+        run: bash scripts/colocate/run_smoke_host.sh --stability
+
+      - name: Upload stability report
+        if: always()  # upload whatever logs exist even when the run step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: colocate-stability-report
+          path: |
+            colocate-smoke-report.txt
+            colocate-smoke-pytest.log
+          if-no-files-found: warn  # a missing report is a warning, not a failure
diff --git a/.gitignore b/.gitignore
@@ -89,3 +89,6 @@ wandb/log.txt
 
 .claude/
 wandb/
+
+# Colocate knowledge docs (keep local only, not for PR)
+docs/colocate/knowledge*.md
diff --git a/configs/colocate_qwen0p6b_2eng_tp2_tiny.yaml b/configs/colocate_qwen0p6b_2eng_tp2_tiny.yaml
@@ -0,0 +1,81 @@
+# Tiny-model colocate config with 2 engines x engine_tp_size=2.
+#
+# Sibling of `configs/colocate_qwen0p6b_tp2_tiny.yaml`, which has a
+# *single* tp=2 engine. This config runs TWO inference engines, each
+# tensor-parallel across 2 GPUs, so it exercises the multi-engine
+# fan-out as well as the per-TP-rank routing:
+#
+#   * world_size = 4  (Phase-0 invariant: engine_count(2) x
+#     engine_tp_size(2) == world_size(4)).
+#   * 4 trainer ranks (FSDP world=4) + 2 engine actors x 2 TP
+#     scheduler subprocesses = 8 logical ranks (union world 2N=8),
+#     paired 1:1 and MPS-shared two-per-GPU across 4 GPUs.
+#   * colocate_loop dispatches per-engine: engine e gets a 2-prompt
+#     batch for trainers [e*2, e*2+2); TP rank t NCCL-sends batch
+#     item t to trainer e*2+t. Engine 0 -> trainers 0,1; engine 1 ->
+#     trainers 2,3. The single-engine tp2 config never exercises the
+#     `for e in range(n_engines)` dispatch loop with n_engines>1.
+#
+# Needs 4 GPUs + working MPS. Used by tests/colocate/test_colocate_multi_engine.py.
+
+model:
+  target_model_path: Qwen/Qwen3-0.6B-Base
+  trust_remote_code: true
+
+dataset:
+  train_data_path: ../examples/data/sample_conversations.jsonl
+  chat_template: qwen
+  prompt_key: conversations
+
+training:
+  attention_backend: flex_attention
+  micro_batch_size: 1
+  draft_accumulation_steps: 1
+  learning_rate: 1.0e-4  # mantissa dot required: YAML 1.1 loaders read bare `1e-4` as a string
+  max_concurrent_batches: 1
+  max_grad_norm: 0.5
+  max_seq_length: 2048
+  num_epochs: 1
+  seed: 42
+  # 4 trainer ranks (FSDP world=4).
+  training_num_gpus_per_node: 4
+  training_num_nodes: 1
+  ttt_length: 7
+  save_per_epoch: false
+  warmup_ratio: 0.015
+
+  # ─── Colocate flags ─────────────────────────────────────────────
+  colocate_strategy: mps
+  transfer_mode: nccl
+  train_frac: 0.45
+  infer_frac: 0.45  # train_frac + infer_frac = 0.90; presumably per-GPU memory shares -- confirm
+
+inference:
+  inference_engine_type: sgl
+  # 2 engines, each tensor-parallel across 2 GPUs. engine_count x
+  # engine_tp_size = 2 x 2 = world_size 4.
+  inference_num_gpus: 4
+  inference_num_gpus_per_engine: 2
+  inference_num_gpus_per_node: 4
+  max_sample_pool_size: 16
+  inference_buffer_threshold: 8
+  inference_batch_size: 2  # presumably the 2-prompt per-engine batch (one item per TP rank) -- confirm
+  sglang:
+    tp_size: 2
+    mem_fraction_static: 0.45
+
+mooncake:
+  master_server_address: null
+  metadata_server: null
+  protocol: tcp
+  global_segment_size: 4GB
+  local_buffer_size: 1GB
+
+output_dir: ./outputs/colocate-qwen0p6b-2eng-tp2-tiny
+cache_dir: ./cache/colocate-qwen0p6b-2eng-tp2-tiny
+model_download_dir: null
+
+debug:
+  save_debug_train_data: null
+  debug_train_only: false
+  debug_inference_only: false
diff --git a/configs/colocate_qwen0p6b_tiny.yaml b/configs/colocate_qwen0p6b_tiny.yaml
@@ -0,0 +1,85 @@
+# Tiny-model colocate config for cheap-host MPS validation.
+#
+# Same colocate code path as `configs/colocate_qwen3_8b.yaml` (MPS strategy +
+# NCCL transfer + Phase-0 invariants), but sized so the entire trainer +
+# engine + KV-cache footprint fits inside a single 24 GB consumer/L40S-class
+# GPU. The intent is to give people without 4×H100 access a way to actually
+# *run* the MPS-required Phase-4/6/7 tests on a $0.30-2.00/hr cheap GPU
+# rental (Vast.ai, Lambda spot, Hyperstack, etc.) for a one-shot
+# correctness check.
+#
+# Footprint at a glance (Qwen3-0.6B Base, 600 M params, fp16):
+#   - trainer (FSDP world=1, no sharding): weights 1.2 GB + grads 1.2 GB
+#     + AdamW fp32 state 4.8 GB ≈ 7.2 GB → fits in 0.45×24 GB = 10.8 GB.
+#   - engine (sglang, tp=1): weights 1.2 GB + KV cache for 16 K ctx
+#     (≈ 4 GB) ≈ 5.2 GB total → fits in 0.45×24 GB = 10.8 GB.
+#   - 0.10 headroom = 2.4 GB on a 24 GB card; CUDA context + allocator
+#     caches comfortably fit.
+#
+# Phase-0 invariant: engine_count × engine_tp_size == world_size = 1×1 = 1.
+#
+# Run via the local Docker / Vast.ai runner, not the Modal smoke script:
+#   bash scripts/colocate/run_smoke_host.sh
+
+model:
+  target_model_path: Qwen/Qwen3-0.6B-Base
+  trust_remote_code: true
+
+dataset:
+  train_data_path: ../examples/data/sample_conversations.jsonl
+  chat_template: qwen
+  prompt_key: conversations
+
+training:
+  attention_backend: flex_attention
+  micro_batch_size: 1
+  draft_accumulation_steps: 1
+  learning_rate: 1.0e-4  # mantissa dot required: YAML 1.1 loaders read bare `1e-4` as a string
+  max_concurrent_batches: 1
+  max_grad_norm: 0.5
+  # Smaller than the Qwen3-8B config so KV cache fits in 0.45×24 GB.
+  max_seq_length: 2048
+  num_epochs: 1
+  seed: 42
+  # 1:1 trainer↔engine on a single GPU. world_size = 1.
+  training_num_gpus_per_node: 1
+  training_num_nodes: 1
+  ttt_length: 7
+  save_per_epoch: false
+  warmup_ratio: 0.015
+
+  # ─── Colocate flags (same as Qwen3-8B config) ────────────────────
+  colocate_strategy: mps
+  transfer_mode: nccl
+  train_frac: 0.45  # presumably the trainer's memory share: 0.45×24 GB = 10.8 GB -- confirm
+  infer_frac: 0.45  # presumably the engine's share; 0.45 + 0.45 leaves 0.10 headroom -- confirm
+
+inference:
+  inference_engine_type: sgl
+  # 1 engine, 1 GPU, tp=1 — the only topology that satisfies the Phase-0
+  # invariant `engine_count × engine_tp_size == world_size = 1`.
+  inference_num_gpus: 1
+  inference_num_gpus_per_engine: 1
+  inference_num_gpus_per_node: 1
+  max_sample_pool_size: 8
+  inference_buffer_threshold: 4
+  inference_batch_size: 2
+  sglang:
+    tp_size: 1
+    mem_fraction_static: 0.45
+
+mooncake:
+  master_server_address: null
+  metadata_server: null
+  protocol: tcp
+  global_segment_size: 4GB
+  local_buffer_size: 1GB
+
+output_dir: ./outputs/colocate-qwen0p6b-tiny
+cache_dir: ./cache/colocate-qwen0p6b-tiny
+model_download_dir: null
+
+debug:
+  save_debug_train_data: null
+  debug_train_only: false
+  debug_inference_only: false
diff --git a/configs/colocate_qwen0p6b_tp2_tiny.yaml b/configs/colocate_qwen0p6b_tp2_tiny.yaml
@@ -0,0 +1,78 @@
+# Tiny-model colocate config with engine_tp_size=2 (one TP engine).
+#
+# Sibling of `configs/colocate_qwen0p6b_tiny.yaml`, but the single
+# inference engine runs tensor-parallel across 2 GPUs (tp_size=2)
+# instead of 1. This exercises the per-TP-rank data plane:
+#
+#   * world_size = 2  (Phase-0 invariant: engine_count(1) x
+#     engine_tp_size(2) == world_size(2)).
+#   * 2 trainer ranks (FSDP world=2) + 1 engine actor with 2 TP
+#     scheduler subprocesses = 4 logical ranks, paired 1:1 and
+#     MPS-shared two-per-GPU across 2 GPUs.
+#   * The engine's generate() carries a 2-prompt batch; TP rank t
+#     NCCL-sends batch item t to trainer t (see colocate.patch
+#     _send_hidden_states_to_nccl).
+#
+# Needs 2 GPUs + working MPS. Used by tests/colocate/test_colocate_tp2.py.
+
+model:
+  target_model_path: Qwen/Qwen3-0.6B-Base
+  trust_remote_code: true
+
+dataset:
+  train_data_path: ../examples/data/sample_conversations.jsonl
+  chat_template: qwen
+  prompt_key: conversations
+
+training:
+  attention_backend: flex_attention
+  micro_batch_size: 1
+  draft_accumulation_steps: 1
+  learning_rate: 1.0e-4  # mantissa dot required: YAML 1.1 loaders read bare `1e-4` as a string
+  max_concurrent_batches: 1
+  max_grad_norm: 0.5
+  max_seq_length: 2048
+  num_epochs: 1
+  seed: 42
+  # 2 trainer ranks (FSDP world=2).
+  training_num_gpus_per_node: 2
+  training_num_nodes: 1
+  ttt_length: 7
+  save_per_epoch: false
+  warmup_ratio: 0.015
+
+  # ─── Colocate flags ─────────────────────────────────────────────
+  colocate_strategy: mps
+  transfer_mode: nccl
+  train_frac: 0.45
+  infer_frac: 0.45  # train_frac + infer_frac = 0.90; presumably per-GPU memory shares -- confirm
+
+inference:
+  inference_engine_type: sgl
+  # 1 engine, tensor-parallel across 2 GPUs. engine_count x
+  # engine_tp_size = 1 x 2 = world_size 2.
+  inference_num_gpus: 2
+  inference_num_gpus_per_engine: 2
+  inference_num_gpus_per_node: 2
+  max_sample_pool_size: 8
+  inference_buffer_threshold: 4
+  inference_batch_size: 2  # presumably the 2-prompt batch (item t goes to trainer t) -- confirm
+  sglang:
+    tp_size: 2
+    mem_fraction_static: 0.45
+
+mooncake:
+  master_server_address: null
+  metadata_server: null
+  protocol: tcp
+  global_segment_size: 4GB
+  local_buffer_size: 1GB
+
+output_dir: ./outputs/colocate-qwen0p6b-tp2-tiny
+cache_dir: ./cache/colocate-qwen0p6b-tp2-tiny
+model_download_dir: null
+
+debug:
+  save_debug_train_data: null
+  debug_train_only: false
+  debug_inference_only: false