Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 33 additions & 15 deletions config/configs/applications.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,62 +35,80 @@ applications:
SKIP_VERIFICATION: "true"
USE_SYSTEM_PERMISSIONS: "true"
- args:
application_name: GptOss120b
application_name: Gemma431bIt
deployment_configs:
LLMDeployment:
gpu_type_options_override:
H100:
autoscaling_config:
max_replicas: {{.Replicas.Gemma431bIt}}
min_replicas: {{.Replicas.Gemma431bIt}}
target_ongoing_requests: 6
max_ongoing_requests: 8
ray_actor_options:
num_gpus: 1
L40S:
autoscaling_config:
max_replicas: {{.Replicas.Gemma431bIt}}
min_replicas: {{.Replicas.Gemma431bIt}}
target_ongoing_requests: 4
max_ongoing_requests: 6
ray_actor_options:
num_gpus: 2
options:
autoscaling_config:
max_replicas: {{.Replicas.GptOss120b}}
min_replicas: {{.Replicas.GptOss120b}}
max_replicas: {{.Replicas.Gemma431bIt}}
min_replicas: {{.Replicas.Gemma431bIt}}
deployment_type: text_gen_model_deployment
gpu_types: '["{{.AcceleratorType}}"]'
model_definition:
gpu_type_model_config_override:
H100:
engine_args:
gpu_memory_utilization: 0.90
dtype: bfloat16
gpu_memory_utilization: 0.9
max_model_len: 32768
max_num_batched_tokens: 4096
tensor_parallel_size: 1
L40S:
engine_args:
gpu_memory_utilization: 0.90
dtype: bfloat16
gpu_memory_utilization: 0.85
max_model_len: 120000
max_num_batched_tokens: 4096
max_num_seqs: 2
tensor_parallel_size: 2
model_config:
openai_serving_config:
chat:
enable_auto_tools: true
tool_parser: openai
tool_parser: gemma4
responses:
enable_auto_tools: true
tool_parser: openai
model_id: gpt_oss_120b
tool_parser: gemma4
model_id: gemma4_31b_it
model_loader:
blob_storage:
blob_prefix: model_artifacts/gpt-oss-120b
blob_prefix: model_artifacts/gemma-4-31B-it
tokenizer_definition:
model_id: gpt_oss_120b
model_id: gemma4_31b_it
model_loader:
blob_storage:
artifacts_list:
- chat_template.jinja
- config.json
- processor_config.json
- tokenizer_config.json
- tokenizer.json
blob_prefix: model_artifacts/gpt-oss-120b
name: GptOss120b
blob_prefix: model_artifacts/gemma-4-31B-it
name: Gemma431bIt
import_path: main:create_serve_app
route_prefix: /gpt_oss_120b
route_prefix: /gemma4_31b_it
runtime_env:
working_dir: "file:///home/ray/ray/applications/generic_application.zip"
env_vars:
API_VERSION: "v1"
APPLICATION_NAME: gpt_oss_120b
APPLICATION_NAME: gemma4_31b_it
VLLM_ATTENTION_BACKEND: TRITON_ATTN
ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}"
S3_BUCKET: "{{.ArtifactBucketName}}"
Expand Down Expand Up @@ -217,7 +235,7 @@ applications:
SKIP_VERIFICATION: "true"
USE_SYSTEM_PERMISSIONS: "true"
VLLM_WORKER_MULTIPROC_METHOD: spawn
# See GptOss120b above for rationale. Must be "True" in airgap (no
# See Gemma431bIt above for rationale. Must be "True" in airgap (no
# Redis) so vLLM uses NoOpOpenAIServingResponses.
DISABLE_RESPONSES_API_REDIS: "True"
- args:
Expand Down
3 changes: 2 additions & 1 deletion config/configs/features/saia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ applicationScale:
CrossEncoder: 1
E5LanguageClassifier: 1
Entrypoint: 1
Gemma431bIt: 1
GptOss20b: 1
GptOss120b: 1
MbartTranslator: 1
PromptInjectionClassifier: 1
PromptInjectionCrossEncoder: 1
Expand All @@ -17,6 +17,7 @@ instanceScale:
l40s-0-gpu: 1
l40s-1-gpu: 2
l40s-2-gpu: 1
l40s-4-gpu: 0
H100:
h100-0-gpu: 1
h100-1-gpu: 2
Expand Down
10 changes: 10 additions & 0 deletions config/configs/instance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ L40S:
memory: "48Gi"
ephemeral-storage: "100Gi"
nvidia.com/gpu: "2"
- tier: l40s-4-gpu
gpusPerPod: 4
resources:
requests:
cpu: "1"
limits:
cpu: "8"
memory: "96Gi"
ephemeral-storage: "200Gi"
nvidia.com/gpu: "4"
H100:
- tier: h100-0-gpu
gpusPerPod: 0
Expand Down
2 changes: 2 additions & 0 deletions pkg/ai/features/saia/impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap(
"LOG_LEVEL": "info",
"USE_GPT_OSS": "true",
"SCS_TOKEN": "no-auth-required",
"LLM_PROVIDER": "ml-platform",
"LLM_MODEL": "gemma4_31b_it",
}

found := &corev1.ConfigMap{}
Expand Down
15 changes: 9 additions & 6 deletions pkg/ai/raybuilder/configmap_apps_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,15 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) {
}
}

// We expect exactly two text-gen apps today (GptOss120b, GptOss20b). If
// this count changes, someone added a new text-gen model; they MUST also
// add DISABLE_RESPONSES_API_REDIS to the new app.
require.Len(t, textGenApps, 2,
"expected exactly 2 text_gen_model_deployment apps (GptOss120b, GptOss20b); "+
expectedTextGenApps := []string{"Gemma431bIt", "GptOss20b"}

// We expect exactly two text-gen apps today (Gemma431bIt, GptOss20b).
// If this count changes, someone added a new text-gen model; they MUST
// also add DISABLE_RESPONSES_API_REDIS to the new app.
require.Len(t, textGenApps, len(expectedTextGenApps),
"expected exactly %d text_gen_model_deployment app(s) (%s); "+
"found %d. New text-gen apps MUST set DISABLE_RESPONSES_API_REDIS.",
len(expectedTextGenApps), strings.Join(expectedTextGenApps, ", "),
len(textGenApps))

for _, a := range textGenApps {
Expand All @@ -110,7 +113,7 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) {
for _, a := range textGenApps {
names = append(names, a.Name)
}
assert.ElementsMatch(t, []string{"GptOss120b", "GptOss20b"}, names,
assert.ElementsMatch(t, expectedTextGenApps, names,
"unexpected set of text_gen_model_deployment apps: %v", names)

// Hygiene check: non-text-gen apps should NOT carry this env (it's a
Expand Down
2 changes: 1 addition & 1 deletion tools/cluster_setup/K0S_QUICKSTART.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ The S3 bucket must be pre-populated with the following directories before instal

| Model | Purpose |
| --------------------------------- | ----------------------------------------------- |
| `gpt-oss-120b` | Primary LLM for chat, SPL generation, reasoning |
| `gemma-4-31b-it` | Primary LLM for chat, SPL generation, reasoning |
| `gpt-oss-20b` | Field descriptions, conversation titles |
| `all-minilm-l6-v2` | Sentence embeddings (data loader, SAIA) |
| `bi-encoder` | Semantic search ranking |
Expand Down
Loading