From 39cfada019d3f368080b50d6d752a78569f12158 Mon Sep 17 00:00:00 2001
From: vvarshney <vvarshney@splunk.com>
Date: Wed, 13 May 2026 16:27:54 +0530
Subject: [PATCH 1/2] Changes for adding gemma model config

---
 config/configs/applications.yaml         | 52 ++++++++++++++++--------
 config/configs/features/saia.yaml        |  2 +-
 pkg/ai/raybuilder/configmap_apps_test.go | 15 ++++---
 tools/cluster_setup/K0S_QUICKSTART.md    |  2 +-
 4 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml
index eb384ce..0089626 100644
--- a/config/configs/applications.yaml
+++ b/config/configs/applications.yaml
@@ -35,62 +35,80 @@ applications:
         SKIP_VERIFICATION: "true"
         USE_SYSTEM_PERMISSIONS: "true"
   - args:
-      application_name: GptOss120b
+      application_name: Gemma431bIt
       deployment_configs:
         LLMDeployment:
           gpu_type_options_override:
             H100:
+              autoscaling_config:
+                max_replicas: {{.Replicas.Gemma431bIt}}
+                min_replicas: {{.Replicas.Gemma431bIt}}
+                target_ongoing_requests: 6
+              max_ongoing_requests: 8
               ray_actor_options:
                 num_gpus: 1
             L40S:
+              autoscaling_config:
+                max_replicas: {{.Replicas.Gemma431bIt}}
+                min_replicas: {{.Replicas.Gemma431bIt}}
+                target_ongoing_requests: 4
+              max_ongoing_requests: 6
               ray_actor_options:
-                num_gpus: 2
+                num_gpus: 4
           options:
             autoscaling_config:
-              max_replicas: {{.Replicas.GptOss120b}}
-              min_replicas: {{.Replicas.GptOss120b}}
+              max_replicas: {{.Replicas.Gemma431bIt}}
+              min_replicas: {{.Replicas.Gemma431bIt}}
       deployment_type: text_gen_model_deployment
       gpu_types: '["{{.AcceleratorType}}"]'
       model_definition:
         gpu_type_model_config_override:
           H100:
             engine_args:
-              gpu_memory_utilization: 0.90
+              dtype: bfloat16
+              gpu_memory_utilization: 0.9
+              max_model_len: 32768
+              max_num_batched_tokens: 4096
               tensor_parallel_size: 1
           L40S:
             engine_args:
-              gpu_memory_utilization: 0.90
-              tensor_parallel_size: 2
+              dtype: bfloat16
+              gpu_memory_utilization: 0.85
+              max_model_len: 240000
+              max_num_batched_tokens: 4096
+              max_num_seqs: 2
+              tensor_parallel_size: 4
         model_config:
           openai_serving_config:
             chat:
               enable_auto_tools: true
-              tool_parser: openai
+              tool_parser: gemma4
             responses:
               enable_auto_tools: true
-              tool_parser: openai
-        model_id: gpt_oss_120b
+              tool_parser: gemma4
+        model_id: gemma4_31b_it
         model_loader:
           blob_storage:
-            blob_prefix: model_artifacts/gpt-oss-120b
+            blob_prefix: model_artifacts/gemma-4-31b-it
       tokenizer_definition:
-        model_id: gpt_oss_120b
+        model_id: gemma4_31b_it
         model_loader:
           blob_storage:
             artifacts_list:
               - chat_template.jinja
               - config.json
+              - processor_config.json
               - tokenizer_config.json
               - tokenizer.json
-            blob_prefix: model_artifacts/gpt-oss-120b
-    name: GptOss120b
+            blob_prefix: model_artifacts/gemma-4-31b-it
+    name: Gemma431bIt
     import_path: main:create_serve_app
-    route_prefix: /gpt_oss_120b
+    route_prefix: /gemma4_31b_it
     runtime_env:
       working_dir: "file:///home/ray/ray/applications/generic_application.zip"
       env_vars:
         API_VERSION: "v1"
-        APPLICATION_NAME: gpt_oss_120b
+        APPLICATION_NAME: gemma4_31b_it
         VLLM_ATTENTION_BACKEND: TRITON_ATTN
         ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}"
         S3_BUCKET: "{{.ArtifactBucketName}}"
@@ -217,7 +235,7 @@ applications:
         SKIP_VERIFICATION: "true"
         USE_SYSTEM_PERMISSIONS: "true"
         VLLM_WORKER_MULTIPROC_METHOD: spawn
-        # See GptOss120b above for rationale. Must be "True" in airgap (no
+        # See Gemma431bIt above for rationale. Must be "True" in airgap (no
         # Redis) so vLLM uses NoOpOpenAIServingResponses.
         DISABLE_RESPONSES_API_REDIS: "True"
   - args:
diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml
index a9192da..d1c7b96 100644
--- a/config/configs/features/saia.yaml
+++ b/config/configs/features/saia.yaml
@@ -4,8 +4,8 @@ applicationScale:
   CrossEncoder: 1
   E5LanguageClassifier: 1
   Entrypoint: 1
+  Gemma431bIt: 1
   GptOss20b: 1
-  GptOss120b: 1
   MbartTranslator: 1
   PromptInjectionClassifier: 1
   PromptInjectionCrossEncoder: 1
diff --git a/pkg/ai/raybuilder/configmap_apps_test.go b/pkg/ai/raybuilder/configmap_apps_test.go
index 07711c2..4beb1b4 100644
--- a/pkg/ai/raybuilder/configmap_apps_test.go
+++ b/pkg/ai/raybuilder/configmap_apps_test.go
@@ -86,12 +86,15 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) {
 		}
 	}
 
-	// We expect exactly two text-gen apps today (GptOss120b, GptOss20b). If
-	// this count changes, someone added a new text-gen model; they MUST also
-	// add DISABLE_RESPONSES_API_REDIS to the new app.
-	require.Len(t, textGenApps, 2,
-		"expected exactly 2 text_gen_model_deployment apps (GptOss120b, GptOss20b); "+
+	expectedTextGenApps := []string{"Gemma431bIt", "GptOss20b"}
+
+	// expectedTextGenApps must list every text-gen app. If this assertion
+	// fails, someone added or removed a text-gen model; any new app MUST
+	// also set DISABLE_RESPONSES_API_REDIS.
+	require.Len(t, textGenApps, len(expectedTextGenApps),
+		"expected exactly %d text_gen_model_deployment app(s) (%s); "+
 			"found %d. New text-gen apps MUST set DISABLE_RESPONSES_API_REDIS.",
+		len(expectedTextGenApps), strings.Join(expectedTextGenApps, ", "),
 		len(textGenApps))
 
 	for _, a := range textGenApps {
@@ -110,7 +113,7 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) {
 	for _, a := range textGenApps {
 		names = append(names, a.Name)
 	}
-	assert.ElementsMatch(t, []string{"GptOss120b", "GptOss20b"}, names,
+	assert.ElementsMatch(t, expectedTextGenApps, names,
 		"unexpected set of text_gen_model_deployment apps: %v", names)
 
 	// Hygiene check: non-text-gen apps should NOT carry this env (it's a
diff --git a/tools/cluster_setup/K0S_QUICKSTART.md b/tools/cluster_setup/K0S_QUICKSTART.md
index 97db8f2..768fac7 100644
--- a/tools/cluster_setup/K0S_QUICKSTART.md
+++ b/tools/cluster_setup/K0S_QUICKSTART.md
@@ -34,7 +34,7 @@ The S3 bucket must be pre-populated with the following directories before instal
 
 | Model                             | Purpose                                         |
 | --------------------------------- | ----------------------------------------------- |
-| `gpt-oss-120b`                    | Primary LLM for chat, SPL generation, reasoning |
+| `gemma-4-31B-it`                  | Primary LLM for chat, SPL generation, reasoning |
 | `gpt-oss-20b`                     | Field descriptions, conversation titles         |
 | `all-minilm-l6-v2`                | Sentence embeddings (data loader, SAIA)         |
 | `bi-encoder`                      | Semantic search ranking                         |

From da639317b1a8b70856137ee8dff4fc0e0d98d253 Mon Sep 17 00:00:00 2001
From: vvarshney <vvarshney@splunk.com>
Date: Thu, 21 May 2026 11:34:06 +0530
Subject: [PATCH 2/2] feat: configure Gemma 4 31B for 2-GPU L40S and add LLM
 defaults to SAIA operator

---
 config/configs/applications.yaml  | 10 +++++-----
 config/configs/features/saia.yaml |  1 +
 config/configs/instance.yaml      | 10 ++++++++++
 pkg/ai/features/saia/impl.go      |  2 ++
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml
index 0089626..39bba3f 100644
--- a/config/configs/applications.yaml
+++ b/config/configs/applications.yaml
@@ -54,7 +54,7 @@ applications:
                 target_ongoing_requests: 4
               max_ongoing_requests: 6
               ray_actor_options:
-                num_gpus: 4
+                num_gpus: 2
           options:
             autoscaling_config:
               max_replicas: {{.Replicas.Gemma431bIt}}
@@ -74,10 +74,10 @@ applications:
             engine_args:
               dtype: bfloat16
               gpu_memory_utilization: 0.85
-              max_model_len: 240000
+              max_model_len: 120000
               max_num_batched_tokens: 4096
               max_num_seqs: 2
-              tensor_parallel_size: 4
+              tensor_parallel_size: 2
         model_config:
           openai_serving_config:
             chat:
@@ -89,7 +89,7 @@ applications:
         model_id: gemma4_31b_it
         model_loader:
           blob_storage:
-            blob_prefix: model_artifacts/gemma-4-31b-it
+            blob_prefix: model_artifacts/gemma-4-31B-it
       tokenizer_definition:
         model_id: gemma4_31b_it
         model_loader:
@@ -100,7 +100,7 @@ applications:
               - processor_config.json
               - tokenizer_config.json
               - tokenizer.json
-            blob_prefix: model_artifacts/gemma-4-31b-it
+            blob_prefix: model_artifacts/gemma-4-31B-it
     name: Gemma431bIt
     import_path: main:create_serve_app
     route_prefix: /gemma4_31b_it
diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml
index d1c7b96..bfe5d96 100644
--- a/config/configs/features/saia.yaml
+++ b/config/configs/features/saia.yaml
@@ -17,6 +17,7 @@ instanceScale:
     l40s-0-gpu: 1
     l40s-1-gpu: 2
     l40s-2-gpu: 1
+    l40s-4-gpu: 0
   H100:
     h100-0-gpu: 1
     h100-1-gpu: 2
diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml
index e704fd7..71ea8e7 100644
--- a/config/configs/instance.yaml
+++ b/config/configs/instance.yaml
@@ -31,6 +31,16 @@ L40S:
         memory: "48Gi"
         ephemeral-storage: "100Gi"
         nvidia.com/gpu: "2"
+  - tier: l40s-4-gpu
+    gpusPerPod: 4
+    resources:
+      requests:
+        cpu: "1"
+      limits:
+        cpu: "8"
+        memory: "96Gi"
+        ephemeral-storage: "200Gi"
+        nvidia.com/gpu: "4"
 H100:
   - tier: h100-0-gpu
     gpusPerPod: 0
diff --git a/pkg/ai/features/saia/impl.go b/pkg/ai/features/saia/impl.go
index c018395..8f1bc65 100644
--- a/pkg/ai/features/saia/impl.go
+++ b/pkg/ai/features/saia/impl.go
@@ -349,6 +349,8 @@ func (r *SaiaReconciler) reconcileSAIAConfigMap(
 		"LOG_LEVEL":                       "info",
 		"USE_GPT_OSS":                     "true",
 		"SCS_TOKEN":                       "no-auth-required",
+		"LLM_PROVIDER":                    "ml-platform",
+		"LLM_MODEL":                       "gemma4_31b_it",
 	}
 
 	found := &corev1.ConfigMap{}