Merged
18 commits
4cce077
NO-ISSUE: feat(preset): add vLLM v0.15.1 E2E presets for H100/H200 GPUs
hhk7734 Mar 10, 2026
112e6d1
NO-ISSUE: fix(preset): update H100 presets based on E2E test results
hhk7734 Mar 12, 2026
01f8378
NO-ISSUE: refactor(preset): rename presets to reflect actual image ta…
hhk7734 Mar 12, 2026
ed849a8
NO-ISSUE: docs(preset): update Kimi-K2.5 presets with vLLM recipe fla…
hhk7734 Mar 13, 2026
2590430
NO-ISSUE: fix(preset): add HF_MODULES_CACHE to vllm-hf-hub-offline fo…
hhk7734 Mar 13, 2026
a4330d3
NO-ISSUE: fix(preset): add logging flags to all presets
hhk7734 Mar 13, 2026
2a4c5ea
NO-ISSUE: fix(preset): change Kimi-K2.5 H100 KV cache from fp8 to auto
hhk7734 Mar 13, 2026
befe86e
NO-ISSUE: docs(preset): align research doc with actual preset impleme…
hhk7734 Mar 20, 2026
bbdf897
MAF-19394: fix(preset): align GPU model names with NFD accelerator de…
hhk7734 Mar 27, 2026
6c0d3d1
MAF-19394: fix(preset): add missing -sxm suffix to H200 preset metada…
hhk7734 Mar 27, 2026
457d77d
MAF-19394: docs(preset): document HF_MODULES_CACHE responsibility and…
hhk7734 Mar 27, 2026
1efca3d
MAF-19394: fix(preset): set HF_MODULES_CACHE to shared PV in all offl…
hhk7734 Mar 27, 2026
d396f94
MAF-19394: fix(preset): remove HF_MODULES_CACHE from quickstart presets
hhk7734 Mar 27, 2026
52f9a27
MAF-19394: docs(website): add HF_MODULES_CACHE and trust-remote-code …
hhk7734 Mar 27, 2026
26d2f47
MAF-19394: docs(preset): clarify parallelism responsibility between p…
hhk7734 Mar 27, 2026
b92d464
MAF-19394: fix(preset): correct GPU count to 8 in DeepSeek-R1 H200 DP…
hhk7734 Mar 27, 2026
36b2b09
MAF-19394: docs(website): add transformers dep and simplify trust-rem…
hhk7734 Mar 27, 2026
a9a1a96
MAF-19394: docs(deploy): clarify offline cache ownership and remove p…
hhk7734 Mar 27, 2026
4 changes: 4 additions & 0 deletions AGENTS.md
@@ -115,3 +115,7 @@ When working on a specific MIF component, consult the relevant skill:
- **Dependency version updates**: [`.agents/skills/bump-dependency/SKILL.md`](.agents/skills/bump-dependency/SKILL.md)
- **Heimdall scheduler**: [`skills/guide-heimdall/SKILL.md`](skills/guide-heimdall/SKILL.md)
- **Odin inference operator**: [`skills/guide-odin/SKILL.md`](skills/guide-odin/SKILL.md)

## Offline Hugging Face cache

- For air-gapped `trust_remote_code` deployments, pre-download both the model snapshot and the dynamic module sources. `hf download` alone may leave `HF_MODULES_CACHE` incomplete; if the warm-up pod lacks `torch` or other model-side dependencies, populate `HF_MODULES_CACHE` from the local HF snapshot cache rather than relying on `transformers` to import the remote module.
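The warm-up flow described above can be sketched as a short script. The model name, paths, and the `transformers_modules` directory layout below are illustrative assumptions, not values from this repo — verify the layout against your `transformers` version:

```shell
# Sketch of warming an offline HF cache for a trust_remote_code model.
# MODEL, paths, and the transformers_modules layout are assumptions.
MODEL="deepseek-ai/DeepSeek-R1"
HF_HOME=${HF_HOME:-/tmp/hf_home}
HF_MODULES_CACHE=${HF_MODULES_CACHE:-/tmp/hf_modules}

# 1. On a connected machine, download the full snapshot, which includes
#    the dynamic module sources (modeling_*.py etc.):
#      hf download "$MODEL"
# 2. `hf download` alone does not populate HF_MODULES_CACHE, so copy the
#    module sources out of the snapshot cache instead of relying on
#    transformers to import the remote module at startup.
SNAPSHOT=$(ls -d "$HF_HOME"/hub/models--*/snapshots/* 2>/dev/null | head -n1)
DEST="$HF_MODULES_CACHE/transformers_modules/${MODEL//\//--}"  # hypothetical layout
mkdir -p "$DEST"
if [ -n "$SNAPSHOT" ]; then
  # Copy top-level Python module sources from the snapshot into the modules cache
  find "$SNAPSHOT" -maxdepth 1 -name '*.py' -exec cp {} "$DEST"/ \;
fi
```

With `HF_HUB_OFFLINE=1` and `HF_MODULES_CACHE` pointing at the populated directory, the warm-up pod never needs network access or a `transformers`-driven remote import.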
12 changes: 6 additions & 6 deletions deploy/helm/AGENTS.md
@@ -153,30 +153,30 @@ Odin presets use `mif.moreh.io/*` labels:
### Responsibility boundaries

**Presets define** (model/GPU-specific, not user-configurable):
- vLLM arguments for parallelism within a single rank (`--tensor-parallel-size`, `--enable-expert-parallel`, etc.)
- `spec.parallelism` values and `mif.moreh.io/parallelism` labels that select the desired TP/PP/DP/EP behavior
- Model-specific vLLM arguments (`--trust-remote-code`, `--max-model-len`, `--max-num-seqs`, `--kv-cache-type`, `--quantization`, `--gpu-memory-utilization`, etc.)
- Logging arguments (`--disable-uvicorn-access-log`, `--no-enable-log-requests`) — presets must include these because `ISVC_EXTRA_ARGS` in a preset fully overrides the runtime base's value during Odin strategic merge patch (env vars merge by `name` key)
- Model-specific environment variables (`VLLM_ROCM_USE_AITER`, `VLLM_MOE_DP_CHUNK_SIZE`, `UCX_*`, `NCCL_*`, etc.)
- Resources (GPU count, RDMA NICs), tolerations, and nodeSelector

**Runtime bases define** (shared across presets):
- `spec.framework` (e.g., `vllm`)
- Execution command(s) and launch logic (for-loop for DP, cleanup traps)
- Cross-rank parallelism arguments (`--data-parallel-rank`, `--data-parallel-address`, `--data-parallel-rpc-port`)
- Parallelism flag assembly from `spec.parallelism` (`--tensor-parallel-size`, `--pipeline-parallel-size`, `--enable-expert-parallel`, `--data-parallel-rank`, `--data-parallel-address`, `--data-parallel-rpc-port`)
- Disaggregation-specific environment variables (`VLLM_NIXL_SIDE_CHANNEL_HOST`, `VLLM_IS_DECODE_WORKER`)
- Shared memory settings, readiness probes
- Proxy sidecar configuration (for PD disaggregation)

**Utils define** (shared utility templates, not runtime bases or presets):
- Offline Hugging Face cache environment (`HF_HOME`, `HF_HUB_OFFLINE`, `HF_MODULES_CACHE`) in `*-hf-hub-offline` templates

**Users configure** (not defined by presets or runtime bases):
- Image repository and tag (with default provided)
- Volume mounts and model loading method (HF download vs. PV)
- Hugging Face token
- Number of replicas
- Logging arguments (`--no-enable-log-requests`, `--disable-uvicorn-access-log`, etc.)
- `--no-enable-prefix-caching`

**Product team templates configure** (must NOT be set in presets):
- `--prefix-caching-hash-algo`, `--kv-events-config`, `--block-size`
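The override behavior that forces presets to repeat the logging flags can be sketched as follows. Both fragments are illustrative, not taken from an actual runtime base or preset: during Odin's strategic merge patch, `env` entries merge by `name`, so a preset's `value` replaces the base's value wholesale rather than appending to it:

```yaml
# Runtime base (shared) — illustrative values only
env:
  - name: ISVC_EXTRA_ARGS
    value: >-
      --disable-uvicorn-access-log
      --no-enable-log-requests
---
# Preset — env entries merge by `name`, so this value REPLACES the
# base's value entirely; the logging flags must be repeated here,
# otherwise they are silently dropped from the merged spec.
env:
  - name: ISVC_EXTRA_ARGS
    value: >-
      --trust-remote-code
      --max-model-len 32768
      --disable-uvicorn-access-log
      --no-enable-log-requests
```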

### PD decode proxy response headers

- `heimdall-proxy --response-header` is a debug flag that adds `X-Decoder-Host-Port` and `X-Prefiller-Host-Port` to responses.
@@ -23,8 +23,6 @@ spec:
- name: main
image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.15.1.1
env:
- name: HF_MODULES_CACHE
value: /tmp/hf_modules
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
@@ -23,8 +23,6 @@ spec:
- name: main
image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.15.1.1
env:
- name: HF_MODULES_CACHE
value: /tmp/hf_modules
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
@@ -23,8 +23,6 @@ spec:
- name: main
image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.15.1.1
env:
- name: HF_MODULES_CACHE
value: /tmp/hf_modules
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
@@ -0,0 +1,46 @@
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
name: vllm-glm5-zai-org-glm-4.7-flash-nvidia-h100-sxm-1
namespace: {{ include "common.names.namespace" . }}
labels:
{{- include "mif.preset.labels" . | nindent 4 }}
mif.moreh.io/model.org: zai-org
mif.moreh.io/model.name: glm-4.7-flash
mif.moreh.io/role: e2e
mif.moreh.io/accelerator.vendor: nvidia
mif.moreh.io/accelerator.model: h100-sxm
mif.moreh.io/parallelism: "1"
spec:
framework: vllm
model:
name: zai-org/GLM-4.7-Flash
template:
spec:
containers:
- name: main
image: vllm/vllm-openai:glm5
env:
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
--tool-call-parser glm47
--reasoning-parser glm45
--max-model-len 32768
--max-num-seqs 32
--gpu-memory-utilization 0.90
--kv-cache-dtype auto
--disable-uvicorn-access-log
--no-enable-log-requests
resources:
requests:
nvidia.com/gpu: "1"
limits:
nvidia.com/gpu: "1"
nodeSelector:
moai.moreh.io/accelerator.vendor: nvidia
moai.moreh.io/accelerator.model: h100-sxm
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
@@ -0,0 +1,48 @@
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
name: vllm-glm5-zai-org-glm-4.7-flash-nvidia-h100-sxm-tp2-moe-tp2
namespace: {{ include "common.names.namespace" . }}
labels:
{{- include "mif.preset.labels" . | nindent 4 }}
mif.moreh.io/model.org: zai-org
mif.moreh.io/model.name: glm-4.7-flash
mif.moreh.io/role: e2e
mif.moreh.io/accelerator.vendor: nvidia
mif.moreh.io/accelerator.model: h100-sxm
mif.moreh.io/parallelism: "tp2-moe-tp2"
spec:
framework: vllm
model:
name: zai-org/GLM-4.7-Flash
parallelism:
tensor: 2
template:
spec:
containers:
- name: main
image: vllm/vllm-openai:glm5
env:
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
--tool-call-parser glm47
--reasoning-parser glm45
--max-model-len 131072
--max-num-seqs 64
--gpu-memory-utilization 0.90
--kv-cache-dtype auto
--disable-uvicorn-access-log
--no-enable-log-requests
resources:
requests:
nvidia.com/gpu: "2"
limits:
nvidia.com/gpu: "2"
nodeSelector:
moai.moreh.io/accelerator.vendor: nvidia
moai.moreh.io/accelerator.model: h100-sxm
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
@@ -0,0 +1,48 @@
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
name: vllm-glm5-zai-org-glm-4.7-flash-nvidia-h100-sxm-tp4-moe-tp4
namespace: {{ include "common.names.namespace" . }}
labels:
{{- include "mif.preset.labels" . | nindent 4 }}
mif.moreh.io/model.org: zai-org
mif.moreh.io/model.name: glm-4.7-flash
mif.moreh.io/role: e2e
mif.moreh.io/accelerator.vendor: nvidia
mif.moreh.io/accelerator.model: h100-sxm
mif.moreh.io/parallelism: "tp4-moe-tp4"
spec:
framework: vllm
model:
name: zai-org/GLM-4.7-Flash
parallelism:
tensor: 4
template:
spec:
containers:
- name: main
image: vllm/vllm-openai:glm5
env:
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
--tool-call-parser glm47
--reasoning-parser glm45
--max-model-len 200000
--max-num-seqs 64
--gpu-memory-utilization 0.90
--kv-cache-dtype auto
--disable-uvicorn-access-log
--no-enable-log-requests
resources:
requests:
nvidia.com/gpu: "4"
limits:
nvidia.com/gpu: "4"
nodeSelector:
moai.moreh.io/accelerator.vendor: nvidia
moai.moreh.io/accelerator.model: h100-sxm
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
@@ -0,0 +1,46 @@
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
name: vllm-glm5-zai-org-glm-4.7-flash-nvidia-h200-sxm-1
namespace: {{ include "common.names.namespace" . }}
labels:
{{- include "mif.preset.labels" . | nindent 4 }}
mif.moreh.io/model.org: zai-org
mif.moreh.io/model.name: glm-4.7-flash
mif.moreh.io/role: e2e
mif.moreh.io/accelerator.vendor: nvidia
mif.moreh.io/accelerator.model: h200-sxm
mif.moreh.io/parallelism: "1"
spec:
framework: vllm
model:
name: zai-org/GLM-4.7-Flash
template:
spec:
containers:
- name: main
image: vllm/vllm-openai:glm5
env:
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
--tool-call-parser glm47
--reasoning-parser glm45
--max-model-len 131072
--max-num-seqs 64
--gpu-memory-utilization 0.90
--kv-cache-dtype auto
--disable-uvicorn-access-log
--no-enable-log-requests
resources:
requests:
nvidia.com/gpu: "1"
limits:
nvidia.com/gpu: "1"
nodeSelector:
moai.moreh.io/accelerator.vendor: nvidia
moai.moreh.io/accelerator.model: h200-sxm
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
@@ -0,0 +1,48 @@
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
name: vllm-glm5-zai-org-glm-4.7-flash-nvidia-h200-sxm-tp2-moe-tp2
namespace: {{ include "common.names.namespace" . }}
labels:
{{- include "mif.preset.labels" . | nindent 4 }}
mif.moreh.io/model.org: zai-org
mif.moreh.io/model.name: glm-4.7-flash
mif.moreh.io/role: e2e
mif.moreh.io/accelerator.vendor: nvidia
mif.moreh.io/accelerator.model: h200-sxm
mif.moreh.io/parallelism: "tp2-moe-tp2"
spec:
framework: vllm
model:
name: zai-org/GLM-4.7-Flash
parallelism:
tensor: 2
template:
spec:
containers:
- name: main
image: vllm/vllm-openai:glm5
env:
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
--tool-call-parser glm47
--reasoning-parser glm45
--max-model-len 200000
--max-num-seqs 64
--gpu-memory-utilization 0.90
--kv-cache-dtype auto
--disable-uvicorn-access-log
--no-enable-log-requests
resources:
requests:
nvidia.com/gpu: "2"
limits:
nvidia.com/gpu: "2"
nodeSelector:
moai.moreh.io/accelerator.vendor: nvidia
moai.moreh.io/accelerator.model: h200-sxm
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
@@ -0,0 +1,53 @@
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
name: vllm-v0.15.1-deepseek-ai-deepseek-r1-nvidia-h100-sxm-dp16-moe-ep16
namespace: {{ include "common.names.namespace" . }}
labels:
{{- include "mif.preset.labels" . | nindent 4 }}
mif.moreh.io/model.org: deepseek-ai
mif.moreh.io/model.name: deepseek-r1
mif.moreh.io/role: e2e
mif.moreh.io/accelerator.vendor: nvidia
mif.moreh.io/accelerator.model: h100-sxm
mif.moreh.io/parallelism: "dp16-moe-ep16"
spec:
framework: vllm
model:
name: deepseek-ai/DeepSeek-R1
parallelism:
data: 16
dataLocal: 8
expert: true
workerTemplate:
spec:
containers:
- name: main
image: vllm/vllm-openai:v0.15.1
env:
- name: VLLM_USE_DEEP_GEMM
value: "1"
- name: VLLM_ALL2ALL_BACKEND
value: deepep_low_latency
- name: ISVC_EXTRA_ARGS
value: >-
--trust-remote-code
--max-model-len 32768
--max-num-seqs 128
--max-num-batched-tokens 57344
--gpu-memory-utilization 0.9
--reasoning-parser deepseek_r1
--disable-uvicorn-access-log
--no-enable-log-requests
resources:
requests:
nvidia.com/gpu: "8"
limits:
nvidia.com/gpu: "8"
nodeSelector:
moai.moreh.io/accelerator.vendor: nvidia
moai.moreh.io/accelerator.model: h100-sxm
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule