From 39cfada019d3f368080b50d6d752a78569f12158 Mon Sep 17 00:00:00 2001 From: vvarshney Date: Wed, 13 May 2026 16:27:54 +0530 Subject: [PATCH 1/2] Changes for adding gemma model config --- config/configs/applications.yaml | 52 ++++++++++++++++-------- config/configs/features/saia.yaml | 2 +- pkg/ai/raybuilder/configmap_apps_test.go | 15 ++++--- tools/cluster_setup/K0S_QUICKSTART.md | 2 +- 4 files changed, 46 insertions(+), 25 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index eb384ce..0089626 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -35,62 +35,80 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - args: - application_name: GptOss120b + application_name: Gemma431bIt deployment_configs: LLMDeployment: gpu_type_options_override: H100: + autoscaling_config: + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} + target_ongoing_requests: 6 + max_ongoing_requests: 8 ray_actor_options: num_gpus: 1 L40S: + autoscaling_config: + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} + target_ongoing_requests: 4 + max_ongoing_requests: 6 ray_actor_options: - num_gpus: 2 + num_gpus: 4 options: autoscaling_config: - max_replicas: {{.Replicas.GptOss120b}} - min_replicas: {{.Replicas.GptOss120b}} + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} deployment_type: text_gen_model_deployment gpu_types: '["{{.AcceleratorType}}"]' model_definition: gpu_type_model_config_override: H100: engine_args: - gpu_memory_utilization: 0.90 + dtype: bfloat16 + gpu_memory_utilization: 0.9 + max_model_len: 32768 + max_num_batched_tokens: 4096 tensor_parallel_size: 1 L40S: engine_args: - gpu_memory_utilization: 0.90 - tensor_parallel_size: 2 + dtype: bfloat16 + gpu_memory_utilization: 0.85 + max_model_len: 240000 + max_num_batched_tokens: 4096 + max_num_seqs: 2 + tensor_parallel_size: 4 model_config: openai_serving_config: chat: enable_auto_tools: true - tool_parser: openai + tool_parser: gemma4 responses: enable_auto_tools: true - tool_parser: openai - model_id: gpt_oss_120b + tool_parser: gemma4 + model_id: gemma4_31b_it model_loader: blob_storage: - blob_prefix: model_artifacts/gpt-oss-120b + blob_prefix: model_artifacts/gemma-4-31b-it tokenizer_definition: - model_id: gpt_oss_120b + model_id: gemma4_31b_it model_loader: blob_storage: artifacts_list: - chat_template.jinja - config.json + - processor_config.json - tokenizer_config.json - tokenizer.json - blob_prefix: model_artifacts/gpt-oss-120b - name: GptOss120b + blob_prefix: model_artifacts/gemma-4-31b-it + name: Gemma431bIt import_path: main:create_serve_app - route_prefix: /gpt_oss_120b + route_prefix: /gemma4_31b_it runtime_env: working_dir: "file:///home/ray/ray/applications/generic_application.zip" env_vars: API_VERSION: "v1" - APPLICATION_NAME: gpt_oss_120b + APPLICATION_NAME: gemma4_31b_it VLLM_ATTENTION_BACKEND: TRITON_ATTN ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" S3_BUCKET: "{{.ArtifactBucketName}}" @@ -217,7 +235,7 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" VLLM_WORKER_MULTIPROC_METHOD: spawn - # See GptOss120b above for rationale. Must be "True" in airgap (no + # See Gemma431bIt above for rationale. Must be "True" in airgap (no # Redis) so vLLM uses NoOpOpenAIServingResponses. DISABLE_RESPONSES_API_REDIS: "True" - args: diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index a9192da..d1c7b96 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -4,8 +4,8 @@ applicationScale: CrossEncoder: 1 E5LanguageClassifier: 1 Entrypoint: 1 + Gemma431bIt: 1 GptOss20b: 1 - GptOss120b: 1 MbartTranslator: 1 PromptInjectionClassifier: 1 PromptInjectionCrossEncoder: 1 diff --git a/pkg/ai/raybuilder/configmap_apps_test.go b/pkg/ai/raybuilder/configmap_apps_test.go index 07711c2..4beb1b4 100644 --- a/pkg/ai/raybuilder/configmap_apps_test.go +++ b/pkg/ai/raybuilder/configmap_apps_test.go @@ -86,12 +86,15 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) { } } - // We expect exactly two text-gen apps today (GptOss120b, GptOss20b). If - // this count changes, someone added a new text-gen model; they MUST also - // add DISABLE_RESPONSES_API_REDIS to the new app. - require.Len(t, textGenApps, 2, - "expected exactly 2 text_gen_model_deployment apps (GptOss120b, GptOss20b); "+ + expectedTextGenApps := []string{"Gemma431bIt", "GptOss20b"} + + // We expect exactly two text-gen apps today (Gemma431bIt, GptOss20b). + // If this count changes, someone added a new text-gen model; they MUST + // also add DISABLE_RESPONSES_API_REDIS to the new app. + require.Len(t, textGenApps, len(expectedTextGenApps), + "expected exactly %d text_gen_model_deployment app(s) (%s); "+ "found %d. New text-gen apps MUST set DISABLE_RESPONSES_API_REDIS.", + len(expectedTextGenApps), strings.Join(expectedTextGenApps, ", "), len(textGenApps)) for _, a := range textGenApps { @@ -110,7 +113,7 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) { for _, a := range textGenApps { names = append(names, a.Name) } - assert.ElementsMatch(t, []string{"GptOss120b", "GptOss20b"}, names, + assert.ElementsMatch(t, expectedTextGenApps, names, "unexpected set of text_gen_model_deployment apps: %v", names) // Hygiene check: non-text-gen apps should NOT carry this env (it's a diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md index 97db8f2..768fac7 100644 --- a/tools/cluster_setup/K0S_QUICKSTART.md +++ b/tools/cluster_setup/K0S_QUICKSTART.md @@ -34,7 +34,7 @@ The S3 bucket must be pre-populated with the following directories before instal | Model | Purpose | | --------------------------------- | ----------------------------------------------- | -| `gpt-oss-120b` | Primary LLM for chat, SPL generation, reasoning | +| `gemma-4-31b-it` | Primary LLM for chat, SPL generation, reasoning | | `gpt-oss-20b` | Field descriptions, conversation titles | | `all-minilm-l6-v2` | Sentence embeddings (data loader, SAIA) | | `bi-encoder` | Semantic search ranking | From da639317b1a8b70856137ee8dff4fc0e0d98d253 Mon Sep 17 00:00:00 2001 From: vvarshney Date: Thu, 21 May 2026 11:34:06 +0530 Subject: [PATCH 2/2] feat: configure Gemma 4 31B for 2-GPU L40S and add LLM defaults to SAIA operator --- config/configs/applications.yaml | 10 +++++----- config/configs/features/saia.yaml | 1 + config/configs/instance.yaml | 10 ++++++++++ pkg/ai/features/saia/impl.go | 2 ++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 0089626..39bba3f 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -54,7 +54,7 @@ applications: target_ongoing_requests: 4 max_ongoing_requests: 6 ray_actor_options: - num_gpus: 4 + num_gpus: 2 options: autoscaling_config: max_replicas: {{.Replicas.Gemma431bIt}} @@ -74,10 +74,10 @@ applications: engine_args: dtype: bfloat16 gpu_memory_utilization: 0.85 - max_model_len: 240000 + max_model_len: 120000 max_num_batched_tokens: 4096 max_num_seqs: 2 - tensor_parallel_size: 4 + tensor_parallel_size: 2 model_config: openai_serving_config: chat: @@ -89,7 +89,7 @@ applications: model_id: gemma4_31b_it model_loader: blob_storage: - blob_prefix: model_artifacts/gemma-4-31b-it + blob_prefix: model_artifacts/gemma-4-31B-it tokenizer_definition: model_id: gemma4_31b_it model_loader: @@ -100,7 +100,7 @@ applications: - processor_config.json - tokenizer_config.json - tokenizer.json - blob_prefix: model_artifacts/gemma-4-31b-it + blob_prefix: model_artifacts/gemma-4-31B-it name: Gemma431bIt import_path: main:create_serve_app route_prefix: /gemma4_31b_it diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index d1c7b96..bfe5d96 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -17,6 +17,7 @@ instanceScale: l40s-0-gpu: 1 l40s-1-gpu: 2 l40s-2-gpu: 1 + l40s-4-gpu: 0 H100: h100-0-gpu: 1 h100-1-gpu: 2 diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index e704fd7..71ea8e7 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -31,6 +31,16 @@ L40S: memory: "48Gi" ephemeral-storage: "100Gi" nvidia.com/gpu: "2" + - tier: l40s-4-gpu + gpusPerPod: 4 + resources: + requests: + cpu: "1" + limits: + cpu: "8" + memory: "96Gi" + ephemeral-storage: "200Gi" + nvidia.com/gpu: "4" H100: - tier: h100-0-gpu gpusPerPod: 0 diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go index c018395..8f1bc65 100644 --- a/pkg/ai/features/saia/impl.go +++ b/pkg/ai/features/saia/impl.go @@ -349,6 +349,8 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap( "LOG_LEVEL": "info", "USE_GPT_OSS": "true", "SCS_TOKEN": "no-auth-required", + "LLM_PROVIDER": "ml-platform", + "LLM_MODEL": "gemma4_31b_it", } found := &corev1.ConfigMap{}