splunk · vavarshn · May 13, 2026 · May 21, 2026
diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml
@@ -35,62 +35,80 @@ applications:
         SKIP_VERIFICATION: "true"
         USE_SYSTEM_PERMISSIONS: "true"
   - args:
-      application_name: GptOss120b
+      application_name: Gemma431bIt
       deployment_configs:
         LLMDeployment:
           gpu_type_options_override:
             H100:
+              autoscaling_config:
+                max_replicas: {{.Replicas.Gemma431bIt}}
+                min_replicas: {{.Replicas.Gemma431bIt}}
+                target_ongoing_requests: 6
+              max_ongoing_requests: 8
               ray_actor_options:
                 num_gpus: 1
             L40S:
+              autoscaling_config:
+                max_replicas: {{.Replicas.Gemma431bIt}}
+                min_replicas: {{.Replicas.Gemma431bIt}}
+                target_ongoing_requests: 4
+              max_ongoing_requests: 6
               ray_actor_options:
                 num_gpus: 2
           options:
             autoscaling_config:
-              max_replicas: {{.Replicas.GptOss120b}}
-              min_replicas: {{.Replicas.GptOss120b}}
+              max_replicas: {{.Replicas.Gemma431bIt}}
+              min_replicas: {{.Replicas.Gemma431bIt}}
       deployment_type: text_gen_model_deployment
       gpu_types: '["{{.AcceleratorType}}"]'
       model_definition:
         gpu_type_model_config_override:
           H100:
             engine_args:
-              gpu_memory_utilization: 0.90
+              dtype: bfloat16
+              gpu_memory_utilization: 0.9
+              max_model_len: 32768
+              max_num_batched_tokens: 4096
               tensor_parallel_size: 1
           L40S:
             engine_args:
-              gpu_memory_utilization: 0.90
+              dtype: bfloat16
+              gpu_memory_utilization: 0.85
+              max_model_len: 120000
+              max_num_batched_tokens: 4096
+              max_num_seqs: 2
               tensor_parallel_size: 2
         model_config:
           openai_serving_config:
             chat:
               enable_auto_tools: true
-              tool_parser: openai
+              tool_parser: gemma4
             responses:
               enable_auto_tools: true
-              tool_parser: openai
-        model_id: gpt_oss_120b
+              tool_parser: gemma4
+        model_id: gemma4_31b_it
         model_loader:
           blob_storage:
-            blob_prefix: model_artifacts/gpt-oss-120b
+            blob_prefix: model_artifacts/gemma-4-31B-it
       tokenizer_definition:
-        model_id: gpt_oss_120b
+        model_id: gemma4_31b_it
         model_loader:
           blob_storage:
             artifacts_list:
               - chat_template.jinja
               - config.json
+              - processor_config.json
               - tokenizer_config.json
               - tokenizer.json
-            blob_prefix: model_artifacts/gpt-oss-120b
-    name: GptOss120b
+            blob_prefix: model_artifacts/gemma-4-31B-it
+    name: Gemma431bIt
     import_path: main:create_serve_app
-    route_prefix: /gpt_oss_120b
+    route_prefix: /gemma4_31b_it
     runtime_env:
       working_dir: "file:///home/ray/ray/applications/generic_application.zip"
       env_vars:
         API_VERSION: "v1"
-        APPLICATION_NAME: gpt_oss_120b
+        APPLICATION_NAME: gemma4_31b_it
         VLLM_ATTENTION_BACKEND: TRITON_ATTN
         ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}"
         S3_BUCKET: "{{.ArtifactBucketName}}"
@@ -217,7 +235,7 @@ applications:
         SKIP_VERIFICATION: "true"
         USE_SYSTEM_PERMISSIONS: "true"
         VLLM_WORKER_MULTIPROC_METHOD: spawn
-        # See GptOss120b above for rationale. Must be "True" in airgap (no
+        # See Gemma431bIt above for rationale. Must be "True" in airgap (no
         # Redis) so vLLM uses NoOpOpenAIServingResponses.
         DISABLE_RESPONSES_API_REDIS: "True"
   - args:

diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml
@@ -4,8 +4,8 @@ applicationScale:
   CrossEncoder: 1
   E5LanguageClassifier: 1
   Entrypoint: 1
+  Gemma431bIt: 1
   GptOss20b: 1
-  GptOss120b: 1
   MbartTranslator: 1
   PromptInjectionClassifier: 1
   PromptInjectionCrossEncoder: 1
@@ -17,6 +17,7 @@ instanceScale:
     l40s-0-gpu: 1
     l40s-1-gpu: 2
     l40s-2-gpu: 1
+    l40s-4-gpu: 0
   H100:
     h100-0-gpu: 1
     h100-1-gpu: 2

diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml
@@ -31,6 +31,16 @@ L40S:
         memory: "48Gi"
         ephemeral-storage: "100Gi"
         nvidia.com/gpu: "2"
+  - tier: l40s-4-gpu
+    gpusPerPod: 4
+    resources:
+      requests:
+        cpu: "1"
+      limits:
+        cpu: "8"
+        memory: "96Gi"
+        ephemeral-storage: "200Gi"
+        nvidia.com/gpu: "4"
 H100:
   - tier: h100-0-gpu
     gpusPerPod: 0

diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go
@@ -349,6 +349,8 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap(
 		"LOG_LEVEL":                       "info",
 		"USE_GPT_OSS":                     "true",
 		"SCS_TOKEN":                       "no-auth-required",
+		"LLM_PROVIDER":                    "ml-platform",
+		"LLM_MODEL":                       "gemma4_31b_it",
 	}
 
 	found := &corev1.ConfigMap{}

diff --git a/pkg/ai/raybuilder/configmap_apps_test.go b/pkg/ai/raybuilder/configmap_apps_test.go
@@ -86,12 +86,15 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) {
 		}
 	}
 
-	// We expect exactly two text-gen apps today (GptOss120b, GptOss20b). If
-	// this count changes, someone added a new text-gen model; they MUST also
-	// add DISABLE_RESPONSES_API_REDIS to the new app.
-	require.Len(t, textGenApps, 2,
-		"expected exactly 2 text_gen_model_deployment apps (GptOss120b, GptOss20b); "+
+	expectedTextGenApps := []string{"Gemma431bIt", "GptOss20b"}
+
+	// We expect exactly these text-gen apps today (Gemma431bIt, GptOss20b).
+	// If the set changes, update expectedTextGenApps above; every new
+	// text-gen app MUST also set DISABLE_RESPONSES_API_REDIS.
+	require.Len(t, textGenApps, len(expectedTextGenApps),
+		"expected exactly %d text_gen_model_deployment app(s) (%s); "+
 			"found %d. New text-gen apps MUST set DISABLE_RESPONSES_API_REDIS.",
+		len(expectedTextGenApps), strings.Join(expectedTextGenApps, ", "),
 		len(textGenApps))
 
 	for _, a := range textGenApps {
@@ -110,7 +113,7 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) {
 	for _, a := range textGenApps {
 		names = append(names, a.Name)
 	}
-	assert.ElementsMatch(t, []string{"GptOss120b", "GptOss20b"}, names,
+	assert.ElementsMatch(t, expectedTextGenApps, names,
 		"unexpected set of text_gen_model_deployment apps: %v", names)
 
 	// Hygiene check: non-text-gen apps should NOT carry this env (it's a

diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md
@@ -34,7 +34,7 @@ The S3 bucket must be pre-populated with the following directories before instal
 
 | Model                             | Purpose                                         |
 | --------------------------------- | ----------------------------------------------- |
-| `gpt-oss-120b`                    | Primary LLM for chat, SPL generation, reasoning |
+| `gemma-4-31B-it`                  | Primary LLM for chat, SPL generation, reasoning |
 | `gpt-oss-20b`                     | Field descriptions, conversation titles         |
 | `all-minilm-l6-v2`                | Sentence embeddings (data loader, SAIA)         |
 | `bi-encoder`                      | Semantic search ranking                         |