Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions .github/workflows/colocate-stability.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
name: Colocate 1000-step stability

# Long-run memory-stability check for the colocate (MPS + NCCL) training
# path -- the nightly counterpart to the per-PR colocate smoke. Runs
# tests/colocate/test_stability.py with PHASE6_STABILITY_STEPS=1000 and
# asserts peak GPU allocation stays within 1 % between a post-warmup
# early step and a late step (see the test for the rationale).
#
# This job needs a real 4xH100 + working-MPS host, which GitHub-hosted
# runners do not provide. It is therefore pinned to a self-hosted runner
# that carries ALL of the labels `self-hosted`, `gpu` and
# `colocate-stability`. Until such a runner is registered the job sits
# in the queue without being picked up (GitHub eventually times out
# queued jobs) -- that is intentional: the gating structure lives here
# so wiring a runner is the only remaining step.
#
# Triggers:
#   * schedule               -- nightly at 08:00 UTC.
#   * workflow_dispatch      -- manual, with an optional step-count override.
#   * pull_request (labeled) -- only when the `colocate-stability` label
#     is added to the PR, so the expensive run is opt-in.

on:
  schedule:
    - cron: "0 8 * * *"
  workflow_dispatch:
    inputs:
      steps:
        description: "PHASE6_STABILITY_STEPS override"
        required: false
        default: "1000"
  pull_request:
    types: [labeled]

# Least privilege: the job only checks out the repository and uploads an
# artifact, so the GITHUB_TOKEN needs nothing beyond read access to
# contents. This matters because the job executes repository (and PR)
# code on a self-hosted machine.
permissions:
  contents: read

# One run at a time per ref; a newer run waits instead of cancelling the
# one already in progress.
concurrency:
  group: colocate-stability-${{ github.ref }}
  cancel-in-progress: false

jobs:
  stability:
    # Run on schedule/dispatch unconditionally. On a PR the workflow only
    # fires for `labeled` events, and the job runs only when the label
    # that was just added is `colocate-stability`. Adding a different
    # label, or pushing new commits to an already-labelled PR, does NOT
    # start a run -- remove and re-add the label to re-run. This keeps
    # the 4xH100 run opt-in per PR.
    if: >-
      github.event_name != 'pull_request' ||
      github.event.label.name == 'colocate-stability'
    runs-on: [self-hosted, gpu, colocate-stability]
    timeout-minutes: 180
    env:
      # workflow_dispatch can override; schedule / PR-label events carry
      # no inputs, so the expression falls back to 1000.
      PHASE6_STABILITY_STEPS: ${{ github.event.inputs.steps || '1000' }}
    steps:
      - uses: actions/checkout@v4

      - name: Run 1000-step colocate stability
        run: bash scripts/colocate/run_smoke_host.sh --stability

      # always(): upload whatever report/log exists even when the
      # stability run failed or timed out; missing files only warn.
      - name: Upload stability report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: colocate-stability-report
          path: |
            colocate-smoke-report.txt
            colocate-smoke-pytest.log
          if-no-files-found: warn
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,6 @@ wandb/log.txt

.claude/
wandb/

# Colocate knowledge docs (keep local only, not for PR)
docs/colocate/knowledge*.md
81 changes: 81 additions & 0 deletions configs/colocate_qwen0p6b_2eng_tp2_tiny.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Tiny-model colocate config with 2 engines x engine_tp_size=2.
#
# Sibling of `configs/colocate_qwen0p6b_tp2_tiny.yaml`, which has a
# *single* tp=2 engine. This config runs TWO inference engines, each
# tensor-parallel across 2 GPUs, so it exercises the multi-engine
# fan-out as well as the per-TP-rank routing:
#
#   * world_size = 4 (Phase-0 invariant: engine_count(2) x
#     engine_tp_size(2) == world_size(4)).
#   * 4 trainer ranks (FSDP world=4) + 2 engine actors x 2 TP
#     scheduler subprocesses = 8 logical ranks (union world 2N=8),
#     paired 1:1 and MPS-shared two-per-GPU across 4 GPUs.
#   * colocate_loop dispatches per-engine: engine e gets a 2-prompt
#     batch for trainers [e*2, e*2+2); TP rank t NCCL-sends batch
#     item t to trainer e*2+t. Engine 0 -> trainers 0,1; engine 1 ->
#     trainers 2,3. The single-engine tp2 config never exercises the
#     `for e in range(n_engines)` dispatch loop with n_engines>1.
#
# Needs 4 GPUs + working MPS. Used by tests/colocate/test_colocate_multi_engine.py.

model:
  target_model_path: Qwen/Qwen3-0.6B-Base
  trust_remote_code: true

dataset:
  train_data_path: ../examples/data/sample_conversations.jsonl
  chat_template: qwen
  prompt_key: conversations

training:
  attention_backend: flex_attention
  micro_batch_size: 1
  draft_accumulation_steps: 1
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` as a string rather than a float.
  learning_rate: 1.0e-4
  max_concurrent_batches: 1
  max_grad_norm: 0.5
  max_seq_length: 2048
  num_epochs: 1
  seed: 42
  # 4 trainer ranks (FSDP world=4).
  training_num_gpus_per_node: 4
  training_num_nodes: 1
  ttt_length: 7
  save_per_epoch: false
  warmup_ratio: 0.015

  # ─── Colocate flags ─────────────────────────────────────────────
  colocate_strategy: mps
  transfer_mode: nccl
  # Trainer and engine each take 0.45; the two fractions sum to 0.90.
  train_frac: 0.45
  infer_frac: 0.45

inference:
  inference_engine_type: sgl
  # 2 engines, each tensor-parallel across 2 GPUs. engine_count x
  # engine_tp_size = 2 x 2 = world_size 4.
  inference_num_gpus: 4
  inference_num_gpus_per_engine: 2
  inference_num_gpus_per_node: 4
  max_sample_pool_size: 16
  inference_buffer_threshold: 8
  inference_batch_size: 2
  sglang:
    tp_size: 2
    mem_fraction_static: 0.45

mooncake:
  master_server_address: null
  metadata_server: null
  protocol: tcp
  global_segment_size: 4GB
  local_buffer_size: 1GB

output_dir: ./outputs/colocate-qwen0p6b-2eng-tp2-tiny
cache_dir: ./cache/colocate-qwen0p6b-2eng-tp2-tiny
model_download_dir: null

debug:
  save_debug_train_data: null
  debug_train_only: false
  debug_inference_only: false
85 changes: 85 additions & 0 deletions configs/colocate_qwen0p6b_tiny.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Tiny-model colocate config for cheap-host MPS validation.
#
# Same colocate code path as `configs/colocate_qwen3_8b.yaml` (MPS strategy +
# NCCL transfer + Phase-0 invariants), but sized so the entire trainer +
# engine + KV-cache footprint fits inside a single 24 GB consumer/L40S-class
# GPU. The intent is to give people without 4×H100 access a way to actually
# *run* the MPS-required Phase-4/6/7 tests on a $0.30-2.00/hr cheap GPU
# rental (Vast.ai, Lambda spot, Hyperstack, etc.) for a one-shot
# correctness check.
#
# Footprint at a glance (Qwen3-0.6B Base, 600 M params, fp16):
#   - trainer (FSDP world=1, no sharding): weights 1.2 GB + grads 1.2 GB
#     + AdamW fp32 state 4.8 GB ≈ 7.2 GB → fits in 0.45×24 GB = 10.8 GB.
#   - engine (sglang, tp=1): weights 1.2 GB + KV cache for 16 K ctx
#     ≈ 4 GB ≈ 5.2 GB → fits in 0.45×24 GB = 10.8 GB.
#   - 0.10 headroom = 2.4 GB on a 24 GB card; CUDA context + allocator
#     caches comfortably fit.
#
# Phase-0 invariant: engine_count × engine_tp_size == world_size = 1×1 = 1.
#
# Run via the local Docker / Vast.ai runner, not the Modal smoke script:
#   bash scripts/colocate/run_smoke_host.sh

model:
  target_model_path: Qwen/Qwen3-0.6B-Base
  trust_remote_code: true

dataset:
  train_data_path: ../examples/data/sample_conversations.jsonl
  chat_template: qwen
  prompt_key: conversations

training:
  attention_backend: flex_attention
  micro_batch_size: 1
  draft_accumulation_steps: 1
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` as a string rather than a float.
  learning_rate: 1.0e-4
  max_concurrent_batches: 1
  max_grad_norm: 0.5
  # Smaller than the Qwen3-8B config so KV cache fits in 0.45×24 GB.
  max_seq_length: 2048
  num_epochs: 1
  seed: 42
  # 1:1 trainer↔engine on a single GPU. world_size = 1.
  training_num_gpus_per_node: 1
  training_num_nodes: 1
  ttt_length: 7
  save_per_epoch: false
  warmup_ratio: 0.015

  # ─── Colocate flags (same as Qwen3-8B config) ────────────────────
  colocate_strategy: mps
  transfer_mode: nccl
  # 0.45 + 0.45 leaves the 0.10 headroom described in the header.
  train_frac: 0.45
  infer_frac: 0.45

inference:
  inference_engine_type: sgl
  # 1 engine, 1 GPU, tp=1 — the only topology that satisfies the Phase-0
  # invariant `engine_count × engine_tp_size == world_size = 1`.
  inference_num_gpus: 1
  inference_num_gpus_per_engine: 1
  inference_num_gpus_per_node: 1
  max_sample_pool_size: 8
  inference_buffer_threshold: 4
  inference_batch_size: 2
  sglang:
    tp_size: 1
    mem_fraction_static: 0.45

mooncake:
  master_server_address: null
  metadata_server: null
  protocol: tcp
  global_segment_size: 4GB
  local_buffer_size: 1GB

output_dir: ./outputs/colocate-qwen0p6b-tiny
cache_dir: ./cache/colocate-qwen0p6b-tiny
model_download_dir: null

debug:
  save_debug_train_data: null
  debug_train_only: false
  debug_inference_only: false
78 changes: 78 additions & 0 deletions configs/colocate_qwen0p6b_tp2_tiny.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Tiny-model colocate config with engine_tp_size=2 (one engine, TP
# across 2 GPUs).
#
# Sibling of `configs/colocate_qwen0p6b_tiny.yaml`, but the single
# inference engine runs tensor-parallel across 2 GPUs (tp_size=2)
# instead of 1. This exercises the per-TP-rank data plane with a
# single engine:
#
#   * world_size = 2 (Phase-0 invariant: engine_count(1) x
#     engine_tp_size(2) == world_size(2)).
#   * 2 trainer ranks (FSDP world=2) + 1 engine actor with 2 TP
#     scheduler subprocesses = 4 logical ranks, paired 1:1 and
#     MPS-shared two-per-GPU across 2 GPUs.
#   * The engine's generate() carries a 2-prompt batch; TP rank t
#     NCCL-sends batch item t to trainer t (see colocate.patch
#     _send_hidden_states_to_nccl).
#
# Needs 2 GPUs + working MPS. Used by tests/colocate/test_colocate_tp2.py.

model:
  target_model_path: Qwen/Qwen3-0.6B-Base
  trust_remote_code: true

dataset:
  train_data_path: ../examples/data/sample_conversations.jsonl
  chat_template: qwen
  prompt_key: conversations

training:
  attention_backend: flex_attention
  micro_batch_size: 1
  draft_accumulation_steps: 1
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` as a string rather than a float.
  learning_rate: 1.0e-4
  max_concurrent_batches: 1
  max_grad_norm: 0.5
  max_seq_length: 2048
  num_epochs: 1
  seed: 42
  # 2 trainer ranks (FSDP world=2).
  training_num_gpus_per_node: 2
  training_num_nodes: 1
  ttt_length: 7
  save_per_epoch: false
  warmup_ratio: 0.015

  # ─── Colocate flags ─────────────────────────────────────────────
  colocate_strategy: mps
  transfer_mode: nccl
  # Trainer and engine each take 0.45; the two fractions sum to 0.90.
  train_frac: 0.45
  infer_frac: 0.45

inference:
  inference_engine_type: sgl
  # 1 engine, tensor-parallel across 2 GPUs. engine_count x
  # engine_tp_size = 1 x 2 = world_size 2.
  inference_num_gpus: 2
  inference_num_gpus_per_engine: 2
  inference_num_gpus_per_node: 2
  max_sample_pool_size: 8
  inference_buffer_threshold: 4
  inference_batch_size: 2
  sglang:
    tp_size: 2
    mem_fraction_static: 0.45

mooncake:
  master_server_address: null
  metadata_server: null
  protocol: tcp
  global_segment_size: 4GB
  local_buffer_size: 1GB

output_dir: ./outputs/colocate-qwen0p6b-tp2-tiny
cache_dir: ./cache/colocate-qwen0p6b-tp2-tiny
model_download_dir: null

debug:
  save_debug_train_data: null
  debug_train_only: false
  debug_inference_only: false
Loading
Loading