From 18443589462121af2691be3de1761d5312de0b5c Mon Sep 17 00:00:00 2001
From: Gokul A <nargokul@amazon.com>
Date: Thu, 5 Mar 2026 12:14:43 -0800
Subject: [PATCH 1/4] V3 Bug Fixes

---
 .../src/sagemaker/core/processing.py          |   3 +
 sagemaker-core/tests/unit/test_processing.py  | 255 +++++++
 .../serve/mode/local_container_mode.py        |  24 +-
 .../src/sagemaker/serve/model_builder.py      |  13 +-
 .../tests/unit/test_model_builder_methods.py  | 311 +++++++++
 .../train/common_utils/show_results_utils.py  |   7 +-
 .../common_utils/test_show_results_utils.py   | 657 ++++++++++++++++++
 7 files changed, 1264 insertions(+), 6 deletions(-)

diff --git a/sagemaker-core/src/sagemaker/core/processing.py b/sagemaker-core/src/sagemaker/core/processing.py
index 65dadeda33..736eebf01c 100644
--- a/sagemaker-core/src/sagemaker/core/processing.py
+++ b/sagemaker-core/src/sagemaker/core/processing.py
@@ -487,6 +487,9 @@ def _normalize_outputs(self, outputs=None):
                 # If the output's s3_uri is not an s3_uri, create one.
                 parse_result = urlparse(output.s3_output.s3_uri)
                 if parse_result.scheme != "s3":
+                    if getattr(self.sagemaker_session, "local_mode", False) and parse_result.scheme == "file":
+                        normalized_outputs.append(output)
+                        continue
                     if _pipeline_config:
                         s3_uri = Join(
                             on="/",
diff --git a/sagemaker-core/tests/unit/test_processing.py b/sagemaker-core/tests/unit/test_processing.py
index e7dbdc17bc..dbe8d5f9ef 100644
--- a/sagemaker-core/tests/unit/test_processing.py
+++ b/sagemaker-core/tests/unit/test_processing.py
@@ -238,6 +238,261 @@ def test_normalize_outputs_invalid_type(self, mock_session):
             processor._normalize_outputs(["invalid"])
 
 
+
+
+class TestBugConditionFileUriReplacedInLocalMode:
+    """Bug condition exploration test: file:// URIs should be preserved in local mode.
+
+    **Validates: Requirements 1.1, 1.2, 2.1, 2.2**
+
+    EXPECTED TO FAIL on unfixed code — failure confirms the bug exists.
+    The bug is that _normalize_outputs() replaces file:// URIs with s3:// paths
+    even when the session is a LocalSession (local_mode=True).
+    """
+
+    @pytest.fixture
+    def local_mock_session(self):
+        session = Mock()
+        session.boto_session = Mock()
+        session.boto_session.region_name = "us-west-2"
+        session.sagemaker_client = Mock()
+        session.default_bucket = Mock(return_value="default-bucket")
+        session.default_bucket_prefix = "prefix"
+        session.expand_role = Mock(side_effect=lambda x: x)
+        session.sagemaker_config = {}
+        session.local_mode = True
+        return session
+
+    @pytest.mark.parametrize(
+        "file_uri",
+        [
+            "file:///tmp/output",
+            "file:///home/user/results",
+            "file:///data/processed",
+        ],
+    )
+    def test_normalize_outputs_preserves_file_uri_in_local_mode(self, local_mock_session, file_uri):
+        """file:// URIs must be preserved when local_mode=True.
+
+        On unfixed code, _normalize_outputs replaces file:// URIs with
+        s3://default-bucket/prefix/job-name/output/output-1, which is the bug.
+        """
+        processor = Processor(
+            role="arn:aws:iam::123456789012:role/SageMakerRole",
+            image_uri="test-image:latest",
+            instance_count=1,
+            instance_type="ml.m5.xlarge",
+            sagemaker_session=local_mock_session,
+        )
+        processor._current_job_name = "test-job"
+
+        s3_output = ProcessingS3Output(
+            s3_uri=file_uri,
+            local_path="/opt/ml/processing/output",
+            s3_upload_mode="EndOfJob",
+        )
+        outputs = [ProcessingOutput(output_name="my-output", s3_output=s3_output)]
+
+        with patch("sagemaker.core.workflow.utilities._pipeline_config", None):
+            result = processor._normalize_outputs(outputs)
+
+        assert len(result) == 1
+        assert result[0].s3_output.s3_uri == file_uri, (
+            f"Expected file:// URI to be preserved as '{file_uri}' in local mode, "
+            f"but got '{result[0].s3_output.s3_uri}'"
+        )
+
+
+class TestPreservationNonLocalFileBehavior:
+    """Preservation property tests: Non-local-file behavior must remain unchanged.
+
+    **Validates: Requirements 3.1, 3.2, 3.3, 3.4**
+
+    These tests capture baseline behavior on UNFIXED code. They MUST PASS on both
+    unfixed and fixed code, confirming no regressions are introduced by the fix.
+    """
+
+    @pytest.fixture
+    def session_local_mode_true(self):
+        session = Mock()
+        session.boto_session = Mock()
+        session.boto_session.region_name = "us-west-2"
+        session.sagemaker_client = Mock()
+        session.default_bucket = Mock(return_value="default-bucket")
+        session.default_bucket_prefix = "prefix"
+        session.expand_role = Mock(side_effect=lambda x: x)
+        session.sagemaker_config = {}
+        session.local_mode = True
+        return session
+
+    @pytest.fixture
+    def session_local_mode_false(self):
+        session = Mock()
+        session.boto_session = Mock()
+        session.boto_session.region_name = "us-west-2"
+        session.sagemaker_client = Mock()
+        session.default_bucket = Mock(return_value="default-bucket")
+        session.default_bucket_prefix = "prefix"
+        session.expand_role = Mock(side_effect=lambda x: x)
+        session.sagemaker_config = {}
+        session.local_mode = False
+        return session
+
+    def _make_processor(self, session):
+        processor = Processor(
+            role="arn:aws:iam::123456789012:role/SageMakerRole",
+            image_uri="test-image:latest",
+            instance_count=1,
+            instance_type="ml.m5.xlarge",
+            sagemaker_session=session,
+        )
+        processor._current_job_name = "test-job"
+        return processor
+
+    # --- Requirement 3.1: S3 URIs pass through unchanged regardless of local_mode ---
+
+    @pytest.mark.parametrize(
+        "s3_uri,local_mode_fixture",
+        [
+            ("s3://my-bucket/path", "session_local_mode_true"),
+            ("s3://my-bucket/path", "session_local_mode_false"),
+            ("s3://another-bucket/deep/nested/path", "session_local_mode_true"),
+            ("s3://another-bucket/deep/nested/path", "session_local_mode_false"),
+        ],
+    )
+    def test_s3_uri_preserved_regardless_of_local_mode(self, s3_uri, local_mode_fixture, request):
+        """S3 URIs must pass through unchanged regardless of local_mode setting.
+
+        **Validates: Requirements 3.1**
+        """
+        session = request.getfixturevalue(local_mode_fixture)
+        processor = self._make_processor(session)
+
+        s3_output = ProcessingS3Output(
+            s3_uri=s3_uri,
+            local_path="/opt/ml/processing/output",
+            s3_upload_mode="EndOfJob",
+        )
+        outputs = [ProcessingOutput(output_name="my-output", s3_output=s3_output)]
+
+        with patch("sagemaker.core.workflow.utilities._pipeline_config", None):
+            result = processor._normalize_outputs(outputs)
+
+        assert len(result) == 1
+        assert result[0].s3_output.s3_uri == s3_uri
+
+    # --- Requirement 3.2: Non-S3 URIs with local_mode=False replaced with S3 paths ---
+
+    @pytest.mark.parametrize(
+        "non_s3_uri",
+        [
+            "/local/output/path",
+            "http://example.com/output",
+            "ftp://server/output",
+        ],
+    )
+    def test_non_s3_uri_replaced_when_not_local_mode(self, non_s3_uri, session_local_mode_false):
+        """Non-S3 URIs in non-local sessions are replaced with auto-generated S3 paths.
+
+        **Validates: Requirements 3.2**
+        """
+        processor = self._make_processor(session_local_mode_false)
+
+        s3_output = ProcessingS3Output(
+            s3_uri=non_s3_uri,
+            local_path="/opt/ml/processing/output",
+            s3_upload_mode="EndOfJob",
+        )
+        outputs = [ProcessingOutput(output_name="output-1", s3_output=s3_output)]
+
+        with patch("sagemaker.core.workflow.utilities._pipeline_config", None):
+            result = processor._normalize_outputs(outputs)
+
+        assert len(result) == 1
+        assert result[0].s3_output.s3_uri.startswith("s3://default-bucket/")
+
+    # --- Requirement 3.3: Pipeline variable URIs skip normalization ---
+
+    def test_pipeline_variable_uri_skips_normalization(self, session_local_mode_false):
+        """Pipeline variable URIs skip normalization entirely.
+
+        **Validates: Requirements 3.3**
+        """
+        processor = self._make_processor(session_local_mode_false)
+
+        s3_output = ProcessingS3Output(
+            s3_uri="s3://bucket/output",
+            local_path="/opt/ml/processing/output",
+            s3_upload_mode="EndOfJob",
+        )
+        outputs = [ProcessingOutput(output_name="output-1", s3_output=s3_output)]
+
+        with patch("sagemaker.core.processing.is_pipeline_variable", return_value=True):
+            result = processor._normalize_outputs(outputs)
+
+        assert len(result) == 1
+        # Pipeline variable outputs are appended as-is without URI modification
+        assert result[0].s3_output.s3_uri == "s3://bucket/output"
+
+    # --- Requirement 3.4: Non-ProcessingOutput objects raise TypeError ---
+
+    @pytest.mark.parametrize(
+        "invalid_output",
+        [
+            ["a string"],
+            [42],
+            [{"key": "value"}],
+        ],
+    )
+    def test_non_processing_output_raises_type_error(self, invalid_output, session_local_mode_false):
+        """Non-ProcessingOutput objects must raise TypeError.
+
+        **Validates: Requirements 3.4**
+        """
+        processor = self._make_processor(session_local_mode_false)
+
+        with pytest.raises(TypeError, match="must be provided as ProcessingOutput objects"):
+            processor._normalize_outputs(invalid_output)
+
+    # --- Output name auto-generation ---
+
+    def test_multiple_outputs_with_s3_uris_preserved(self, session_local_mode_false):
+        """Multiple outputs with S3 URIs are all preserved unchanged.
+
+        **Validates: Requirements 3.1, 3.2**
+        """
+        processor = self._make_processor(session_local_mode_false)
+
+        outputs = [
+            ProcessingOutput(
+                output_name="first-output",
+                s3_output=ProcessingS3Output(
+                    s3_uri="s3://my-bucket/first",
+                    local_path="/opt/ml/processing/output1",
+                    s3_upload_mode="EndOfJob",
+                ),
+            ),
+            ProcessingOutput(
+                output_name="second-output",
+                s3_output=ProcessingS3Output(
+                    s3_uri="s3://my-bucket/second",
+                    local_path="/opt/ml/processing/output2",
+                    s3_upload_mode="EndOfJob",
+                ),
+            ),
+        ]
+
+        with patch("sagemaker.core.workflow.utilities._pipeline_config", None):
+            result = processor._normalize_outputs(outputs)
+
+        assert len(result) == 2
+        assert result[0].output_name == "first-output"
+        assert result[1].output_name == "second-output"
+        # S3 URIs should be preserved since they already have s3:// scheme
+        assert result[0].s3_output.s3_uri == "s3://my-bucket/first"
+        assert result[1].s3_output.s3_uri == "s3://my-bucket/second"
+
+
 class TestProcessorStartNew:
     def test_start_new_with_pipeline_session(self, mock_session):
         from sagemaker.core.workflow.pipeline_context import PipelineSession
diff --git a/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py b/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py
index bb8fdde960..3633b850e6 100644
--- a/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py
+++ b/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py
@@ -3,6 +3,7 @@
 from __future__ import absolute_import
 from pathlib import Path
 import logging
+import os
 from datetime import datetime, timedelta
 from typing import Dict, Type
 import base64
@@ -10,6 +11,8 @@
 import subprocess
 import docker
 
+from sagemaker.core.local.utils import check_for_studio
+
 from sagemaker.serve.model_server.tensorflow_serving.server import LocalTensorflowServing
 from sagemaker.serve.spec.inference_spec import InferenceSpec
 from sagemaker.serve.builder.schema_builder import SchemaBuilder
@@ -33,6 +36,25 @@
     + "Please increase container_timeout_seconds or review your inference code."
 )
 
+STUDIO_DOCKER_SOCKET_PATHS = [
+    "/docker/proxy/docker.sock",
+    "/var/run/docker.sock",
+]
+
+
+def _get_docker_client():
+    """Get a Docker client, handling SageMaker Studio's non-standard socket path."""
+    if os.environ.get("DOCKER_HOST"):
+        return docker.from_env()
+    try:
+        if check_for_studio():
+            for socket_path in STUDIO_DOCKER_SOCKET_PATHS:
+                if os.path.exists(socket_path):
+                    return docker.DockerClient(base_url=f"unix://{socket_path}")
+    except (NotImplementedError, Exception):
+        pass
+    return docker.from_env()
+
 
 class LocalContainerMode(
     LocalTorchServe,
@@ -212,7 +234,7 @@ def _pull_image(self, image: str):
         
         # Check if Docker is available first
         try:
-            self.client = docker.from_env()
+            self.client = _get_docker_client()
             self.client.ping()  # Test Docker connection
         except Exception as e:
             raise RuntimeError(
diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py
index 4f36b0dddb..01463729ac 100644
--- a/sagemaker-serve/src/sagemaker/serve/model_builder.py
+++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py
@@ -133,7 +133,7 @@
     ENDPOINT_CONFIG_ASYNC_KMS_KEY_ID_PATH,
     MODEL_CONTAINERS_PATH,
 )
-from sagemaker.serve.constants import SUPPORTED_MODEL_SERVERS, Framework
+from sagemaker.serve.constants import LOCAL_MODES, SUPPORTED_MODEL_SERVERS, Framework
 from sagemaker.core.workflow.pipeline_context import PipelineSession, runnable_by_pipeline
 from sagemaker.core import fw_utils
 from sagemaker.core.helper.session_helper import container_def
@@ -1287,7 +1287,16 @@ def _build_for_passthrough(self) -> Model:
         if not self.image_uri:
             raise ValueError("image_uri is required for pass-through cases")
 
-        self.s3_upload_path = None
+        self.secret_key = ""
+
+        if not self.model_path:
+            self.s3_upload_path = None
+        else:
+            self.s3_upload_path = self.model_path
+
+        if self.mode in LOCAL_MODES:
+            self._prepare_for_mode()
+
         return self._create_model()
 
     def _build_default_async_inference_config(self, async_inference_config):
diff --git a/sagemaker-serve/tests/unit/test_model_builder_methods.py b/sagemaker-serve/tests/unit/test_model_builder_methods.py
index 6c126e4e96..400eb70047 100644
--- a/sagemaker-serve/tests/unit/test_model_builder_methods.py
+++ b/sagemaker-serve/tests/unit/test_model_builder_methods.py
@@ -22,6 +22,7 @@
 from sagemaker.core.serializers import NumpySerializer, TorchTensorSerializer
 from sagemaker.core.deserializers import JSONDeserializer, TorchTensorDeserializer
 from sagemaker.serve.constants import Framework
+from sagemaker.serve.mode.function_pointers import Mode
 
 
 class TestModelBuilderSimpleMethods:
@@ -299,3 +300,313 @@ def test_get_client_translators_raises_on_no_deserializer(self):
         with patch.object(builder, '_fetch_serializer_and_deserializer_for_framework', return_value=(None, None)):
             with pytest.raises(ValueError, match="Cannot determine deserializer"):
                 builder._get_client_translators()
+
+
+class TestBuildForPassthroughLocalContainer:
+    """Tests for _build_for_passthrough() with LOCAL_CONTAINER mode.
+
+    Validates: Requirements 2.1, 2.2, 3.1
+    """
+
+    def _make_mock_session(self):
+        mock_session = Mock()
+        mock_session.boto_region_name = "us-west-2"
+        mock_session.config = {}
+        mock_session.boto_session = Mock()
+        mock_session.boto_session.region_name = "us-west-2"
+        return mock_session
+
+    @patch.object(ModelBuilder, '_create_model')
+    @patch.object(ModelBuilder, '_prepare_for_mode')
+    def test_build_for_passthrough_initializes_secret_key(
+        self, mock_prepare, mock_create
+    ):
+        """Test that _build_for_passthrough initializes secret_key for LOCAL_CONTAINER mode.
+
+        Bug 1 fix: secret_key must be set to empty string so _deploy_local_endpoint()
+        can pass it to LocalEndpoint.create() without raising AttributeError.
+        """
+        mock_create.return_value = Mock()
+
+        builder = ModelBuilder(
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.LOCAL_CONTAINER,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+
+        builder._build_for_passthrough()
+
+        assert builder.secret_key == ""
+
+    @patch.object(ModelBuilder, '_create_model')
+    @patch.object(ModelBuilder, '_prepare_for_mode')
+    def test_build_for_passthrough_calls_prepare_for_mode_local_container(
+        self, mock_prepare, mock_create
+    ):
+        """Test that _build_for_passthrough calls _prepare_for_mode for LOCAL_CONTAINER.
+
+        Bug 2 fix: _prepare_for_mode() must be called so that
+        self.modes[str(Mode.LOCAL_CONTAINER)] contains a LocalContainerMode object,
+        preventing KeyError when _deploy_local_endpoint() accesses it.
+        """
+        mock_create.return_value = Mock()
+
+        builder = ModelBuilder(
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.LOCAL_CONTAINER,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+
+        builder._build_for_passthrough()
+
+        mock_prepare.assert_called_once()
+
+    @patch.object(ModelBuilder, '_create_model')
+    @patch.object(ModelBuilder, '_prepare_for_mode')
+    def test_build_for_passthrough_does_not_call_prepare_for_mode_sagemaker_endpoint(
+        self, mock_prepare, mock_create
+    ):
+        """Test that _build_for_passthrough does NOT call _prepare_for_mode for SAGEMAKER_ENDPOINT.
+
+        Preservation: SAGEMAKER_ENDPOINT mode passthrough must continue to work without
+        calling _prepare_for_mode() (endpoint mode has its own preparation in _create_model).
+        """
+        mock_create.return_value = Mock()
+
+        builder = ModelBuilder(
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.SAGEMAKER_ENDPOINT,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+
+        builder._build_for_passthrough()
+
+        mock_prepare.assert_not_called()
+
+
+class TestBuildForPassthroughS3PathPreservation:
+    """Tests for _build_for_passthrough() S3 path preservation with ModelTrainer.
+
+    Validates: Requirements 2.4, 3.5
+    """
+
+    def _make_mock_session(self):
+        mock_session = Mock()
+        mock_session.boto_region_name = "us-west-2"
+        mock_session.config = {}
+        mock_session.boto_session = Mock()
+        mock_session.boto_session.region_name = "us-west-2"
+        return mock_session
+
+    @patch.object(ModelBuilder, '_create_model')
+    @patch.object(ModelBuilder, '_prepare_for_mode')
+    def test_build_for_passthrough_preserves_model_path_as_s3_upload_path(
+        self, mock_prepare, mock_create
+    ):
+        """Test that _build_for_passthrough preserves model_path as s3_upload_path.
+
+        Bug 4 fix: When ModelTrainer has set model_path to an S3 URI,
+        _build_for_passthrough() must preserve it as s3_upload_path instead of
+        unconditionally setting it to None.
+
+        **Validates: Requirements 2.4**
+        """
+        mock_create.return_value = Mock()
+
+        builder = ModelBuilder(
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.LOCAL_CONTAINER,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+        builder.model_path = "s3://bucket/model.tar.gz"
+
+        builder._build_for_passthrough()
+
+        assert builder.s3_upload_path == "s3://bucket/model.tar.gz"
+
+    @patch.object(ModelBuilder, '_create_model')
+    @patch.object(ModelBuilder, '_prepare_for_mode')
+    def test_build_for_passthrough_sets_s3_upload_path_none_when_no_model_path(
+        self, mock_prepare, mock_create
+    ):
+        """Test that _build_for_passthrough sets s3_upload_path to None when no model_path.
+
+        Preservation: When no model artifacts are involved (model_path is None),
+        s3_upload_path should still be set to None as before.
+
+        **Validates: Requirements 3.5**
+        """
+        mock_create.return_value = Mock()
+
+        builder = ModelBuilder(
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.LOCAL_CONTAINER,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+        builder.model_path = None
+
+        builder._build_for_passthrough()
+
+        assert builder.s3_upload_path is None
+
+
+class TestGetDockerClient:
+    """Tests for _get_docker_client() Studio Docker client initialization.
+
+    Validates: Requirements 2.3, 3.3
+    """
+
+    @patch("sagemaker.serve.mode.local_container_mode.os.path.exists")
+    @patch("sagemaker.serve.mode.local_container_mode.check_for_studio", return_value=True)
+    @patch("sagemaker.serve.mode.local_container_mode.docker")
+    def test_pull_image_studio_uses_socket_path(
+        self, mock_docker, mock_check_studio, mock_path_exists
+    ):
+        """Test that in Studio, _get_docker_client uses the proxy socket path.
+
+        Bug 3 fix: When check_for_studio() returns True and the proxy socket exists,
+        DockerClient should be initialized with base_url pointing to that socket.
+        """
+        from sagemaker.serve.mode.local_container_mode import _get_docker_client
+
+        mock_path_exists.side_effect = lambda p: p == "/docker/proxy/docker.sock"
+        mock_client = Mock()
+        mock_docker.DockerClient.return_value = mock_client
+
+        with patch.dict("os.environ", {}, clear=True):
+            result = _get_docker_client()
+
+        mock_docker.DockerClient.assert_called_once_with(
+            base_url="unix:///docker/proxy/docker.sock"
+        )
+        assert result == mock_client
+
+    @patch("sagemaker.serve.mode.local_container_mode.check_for_studio", return_value=False)
+    @patch("sagemaker.serve.mode.local_container_mode.docker")
+    def test_pull_image_non_studio_uses_from_env(self, mock_docker, mock_check_studio):
+        """Test that in non-Studio environments, _get_docker_client uses docker.from_env().
+
+        Preservation: Standard Docker environments must continue to use docker.from_env().
+        """
+        from sagemaker.serve.mode.local_container_mode import _get_docker_client
+
+        mock_client = Mock()
+        mock_docker.from_env.return_value = mock_client
+
+        with patch.dict("os.environ", {}, clear=True):
+            result = _get_docker_client()
+
+        mock_docker.from_env.assert_called_once()
+        assert result == mock_client
+
+    @patch("sagemaker.serve.mode.local_container_mode.check_for_studio")
+    @patch("sagemaker.serve.mode.local_container_mode.docker")
+    def test_pull_image_with_docker_host_env_var(self, mock_docker, mock_check_studio):
+        """Test that when DOCKER_HOST is set, docker.from_env() is used regardless of Studio.
+
+        When the user explicitly sets DOCKER_HOST, we should respect that and skip
+        Studio detection entirely.
+        """
+        from sagemaker.serve.mode.local_container_mode import _get_docker_client
+
+        mock_client = Mock()
+        mock_docker.from_env.return_value = mock_client
+
+        with patch.dict("os.environ", {"DOCKER_HOST": "tcp://localhost:2375"}):
+            result = _get_docker_client()
+
+        mock_docker.from_env.assert_called_once()
+        mock_check_studio.assert_not_called()
+        assert result == mock_client
+
+
+class TestPreservationNonPassthroughBehavior:
+    """Tests for Property 4: Preservation - Non-Passthrough and Non-LOCAL_CONTAINER Behavior.
+
+    Verifies that the bugfix does not alter behavior for:
+    - Model-server-specific build methods (e.g., _build_for_torchserve)
+    - SAGEMAKER_ENDPOINT passthrough mode
+
+    **Validates: Requirements 3.1, 3.2, 3.3, 3.4, 3.5**
+    """
+
+    def _make_mock_session(self):
+        mock_session = Mock()
+        mock_session.boto_region_name = "us-west-2"
+        mock_session.config = {}
+        mock_session.boto_session = Mock()
+        mock_session.boto_session.region_name = "us-west-2"
+        return mock_session
+
+    @patch('sagemaker.serve.model_builder.ModelBuilder._create_model')
+    @patch('sagemaker.serve.model_builder.ModelBuilder._prepare_for_mode')
+    @patch('sagemaker.serve.model_builder.ModelBuilder._save_model_inference_spec')
+    def test_build_for_torchserve_still_calls_prepare_for_mode(
+        self, mock_save_spec, mock_prepare, mock_create
+    ):
+        """Test that _build_for_torchserve still calls _prepare_for_mode for LOCAL_CONTAINER.
+
+        Preservation: The torchserve build path already calls _prepare_for_mode()
+        for local modes. This must remain unchanged after the passthrough bugfix.
+
+        **Validates: Requirements 3.2**
+        """
+        mock_create.return_value = Mock()
+
+        builder = ModelBuilder(
+            model=Mock(),
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.LOCAL_CONTAINER,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+        builder.model_server = Mock()
+        builder.env_vars = {}
+        builder.shared_libs = []
+        builder.dependencies = {}
+        builder.inference_spec = None
+        builder.model_path = None
+
+        builder._build_for_torchserve()
+
+        mock_prepare.assert_called_once()
+
+    @patch.object(ModelBuilder, '_create_model')
+    @patch.object(ModelBuilder, '_prepare_for_mode')
+    def test_build_for_passthrough_sagemaker_endpoint_unchanged(
+        self, mock_prepare, mock_create
+    ):
+        """Test that SAGEMAKER_ENDPOINT passthrough is fully unchanged by the bugfix.
+
+        Preservation: SAGEMAKER_ENDPOINT passthrough must continue to:
+        - Initialize secret_key to empty string
+        - NOT call _prepare_for_mode()
+        - Set s3_upload_path to None when no model_path
+        - Return the model from _create_model()
+
+        **Validates: Requirements 3.1, 3.5**
+        """
+        mock_model = Mock()
+        mock_create.return_value = mock_model
+
+        builder = ModelBuilder(
+            image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest",
+            mode=Mode.SAGEMAKER_ENDPOINT,
+            role_arn="arn:aws:iam::123456789012:role/TestRole",
+            sagemaker_session=self._make_mock_session(),
+        )
+        builder.model_path = None
+
+        result = builder._build_for_passthrough()
+
+        assert builder.secret_key == ""
+        assert builder.s3_upload_path is None
+        mock_prepare.assert_not_called()
+        mock_create.assert_called_once()
+        assert result == mock_model
+
diff --git a/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py
index f7ab97e1f6..476fa8c1fc 100644
--- a/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py
+++ b/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py
@@ -699,9 +699,10 @@ def _show_llmaj_results(
     
     # Download base model aggregate results if both models exist
     base_aggregate = None
+    base_bedrock_job_name = None
     if not is_single_model and base_job_name:
         try:
-            base_aggregate, _ = _download_bedrock_aggregate_json(
+            base_aggregate, base_bedrock_job_name = _download_bedrock_aggregate_json(
                 pipeline_execution, base_job_name
             )
             logger.info(f"Successfully downloaded base model aggregate results")
@@ -776,9 +777,9 @@ def _show_llmaj_results(
         )
     
     # Download base model per-example results if base_job_name exists
-    if base_job_name and bedrock_job_name:
+    if base_job_name and base_bedrock_job_name:
         try:
-            base_results = _download_llmaj_results_from_s3(pipeline_execution, bedrock_job_name)
+            base_results = _download_llmaj_results_from_s3(pipeline_execution, base_bedrock_job_name)
             logger.info(f"Successfully downloaded {len(base_results)} base model per-example results")
         except FileNotFoundError as e:
             s3_path = pipeline_execution.s3_output_path if pipeline_execution.s3_output_path else "unknown"
diff --git a/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py b/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py
index 8b038fd974..ccad436b26 100644
--- a/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py
+++ b/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py
@@ -1140,3 +1140,660 @@ def test_show_results_aggregate_not_found(
         # Verify per-example results were still attempted
         # Note: This will fail because bedrock_job_name is None, but that's expected behavior
         # The function should log a warning and continue
+
+
+class TestBugConditionExploration:
+    """Bug condition exploration tests for the base model per-example results S3 path bug.
+
+    **Validates: Requirements 1.1, 1.4, 2.1, 2.4**
+
+    Property 1: Fault Condition - Base Model Per-Example Results Downloaded From Wrong S3 Path
+
+    In a two-model LLM-as-Judge evaluation where custom and base models have distinct
+    bedrock_job_name values, _download_llmaj_results_from_s3 MUST be called with the
+    base model's bedrock_job_name for the base model download, not the custom model's.
+
+    EXPECTED: These tests FAIL on unfixed code, confirming the bug exists.
+    The bug causes both calls to _download_llmaj_results_from_s3 to use
+    "custom-bedrock-job" instead of using "base-bedrock-job" for the base model.
+    """
+
+    @pytest.mark.parametrize(
+        "custom_scores,base_scores,description",
+        [
+            (
+                [1.0, 0.9, 0.8, 0.7, 0.6],
+                [0.3, 0.4, 0.2, 0.5, 0.1],
+                "custom_wins_all",
+            ),
+            (
+                [0.2, 0.3, 0.1, 0.4, 0.15],
+                [0.9, 0.8, 0.95, 0.7, 0.85],
+                "base_wins_all",
+            ),
+            (
+                [0.9, 0.3, 0.8, 0.2, 0.7],
+                [0.4, 0.8, 0.3, 0.9, 0.6],
+                "mixed_wins",
+            ),
+        ],
+        ids=["custom_wins", "base_wins", "mixed"],
+    )
+    @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_base_model_per_example_uses_correct_bedrock_job_name(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_calculate_win,
+        mock_display_win,
+        mock_display_aggregate,
+        mock_pipeline_execution,
+        custom_scores,
+        base_scores,
+        description,
+    ):
+        """Assert _download_llmaj_results_from_s3 is called with 'base-bedrock-job' for base model.
+
+        On unfixed code, this FAILS because both calls use 'custom-bedrock-job'.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        # Return distinct job names for custom and base steps
+        mock_extract_job.side_effect = ["custom-training-job", "base-training-job"]
+
+        # Return distinct bedrock_job_name values for custom and base aggregates
+        custom_aggregate = {"results": {"Metric1": {"score": 0.8, "total_evaluations": 5}}}
+        base_aggregate = {"results": {"Metric1": {"score": 0.5, "total_evaluations": 5}}}
+        mock_download_aggregate.side_effect = [
+            ("custom_agg", "custom-bedrock-job"),
+            ("base_agg", "base-bedrock-job"),
+        ]
+
+        # Build per-example result sets keyed by bedrock_job_name
+        def make_results(scores):
+            return [
+                {
+                    "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                    "modelResponses": [{"response": "['A']"}],
+                    "automatedEvaluationResult": {
+                        "scores": [{"metricName": "Metric1", "result": s}]
+                    },
+                }
+                for s in scores
+            ]
+
+        custom_results = make_results(custom_scores)
+        base_results = make_results(base_scores)
+
+        # Return different results depending on which bedrock_job_name is passed
+        def download_side_effect(pipeline_exec, bedrock_job_name):
+            if bedrock_job_name == "custom-bedrock-job":
+                return custom_results
+            elif bedrock_job_name == "base-bedrock-job":
+                return base_results
+            raise ValueError(f"Unexpected bedrock_job_name: {bedrock_job_name}")
+
+        mock_download_results.side_effect = download_side_effect
+
+        mock_calculate_win.return_value = {
+            "custom_wins": 0, "base_wins": 0, "ties": 0, "total": 0,
+            "custom_win_rate": 0.0, "base_win_rate": 0.0, "tie_rate": 0.0,
+        }
+
+        # Execute
+        _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0)
+
+        # CRITICAL ASSERTION: base model download must use "base-bedrock-job"
+        # On unfixed code, both calls use "custom-bedrock-job" — this assertion FAILS.
+        assert mock_download_results.call_count == 2
+        download_calls = mock_download_results.call_args_list
+        # First call: custom model per-example results
+        assert download_calls[0] == call(mock_pipeline_execution, "custom-bedrock-job"), (
+            f"Expected custom download with 'custom-bedrock-job', "
+            f"got {download_calls[0]}"
+        )
+        # Second call: base model per-example results — MUST use base bedrock job name
+        assert download_calls[1] == call(mock_pipeline_execution, "base-bedrock-job"), (
+            f"BUG CONFIRMED: base model download used '{download_calls[1]}' "
+            f"instead of call(mock_pipeline_execution, 'base-bedrock-job'). "
+            f"Both downloads used the custom model's bedrock_job_name."
+        )
+
+    @pytest.mark.parametrize(
+        "custom_scores,base_scores,description",
+        [
+            (
+                [1.0, 0.9, 0.8, 0.7, 0.6],
+                [0.3, 0.4, 0.2, 0.5, 0.1],
+                "custom_wins_all",
+            ),
+            (
+                [0.2, 0.3, 0.1, 0.4, 0.15],
+                [0.9, 0.8, 0.95, 0.7, 0.85],
+                "base_wins_all",
+            ),
+            (
+                [0.9, 0.3, 0.8, 0.2, 0.7],
+                [0.4, 0.8, 0.3, 0.9, 0.6],
+                "mixed_wins",
+            ),
+        ],
+        ids=["custom_wins", "base_wins", "mixed"],
+    )
+    @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_calculate_win_rates_receives_distinct_datasets(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_calculate_win,
+        mock_display_win,
+        mock_display_aggregate,
+        mock_pipeline_execution,
+        custom_scores,
+        base_scores,
+        description,
+    ):
+        """Assert _calculate_win_rates receives two genuinely distinct datasets.
+
+        On unfixed code, this FAILS because both datasets come from the same S3 path
+        (custom model's bedrock_job_name), so _calculate_win_rates gets identical data.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        mock_extract_job.side_effect = ["custom-training-job", "base-training-job"]
+
+        mock_download_aggregate.side_effect = [
+            ("custom_agg", "custom-bedrock-job"),
+            ("base_agg", "base-bedrock-job"),
+        ]
+
+        def make_results(scores):
+            return [
+                {
+                    "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                    "modelResponses": [{"response": "['A']"}],
+                    "automatedEvaluationResult": {
+                        "scores": [{"metricName": "Metric1", "result": s}]
+                    },
+                }
+                for s in scores
+            ]
+
+        custom_results = make_results(custom_scores)
+        base_results = make_results(base_scores)
+
+        def download_side_effect(pipeline_exec, bedrock_job_name):
+            if bedrock_job_name == "custom-bedrock-job":
+                return custom_results
+            elif bedrock_job_name == "base-bedrock-job":
+                return base_results
+            raise ValueError(f"Unexpected bedrock_job_name: {bedrock_job_name}")
+
+        mock_download_results.side_effect = download_side_effect
+
+        mock_calculate_win.return_value = {
+            "custom_wins": 0, "base_wins": 0, "ties": 0, "total": 0,
+            "custom_win_rate": 0.0, "base_win_rate": 0.0, "tie_rate": 0.0,
+        }
+
+        _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0)
+
+        # _calculate_win_rates must receive genuinely distinct datasets
+        mock_calculate_win.assert_called_once()
+        actual_custom, actual_base = mock_calculate_win.call_args[0]
+
+        # On unfixed code, both arguments are custom_results (identical),
+        # so this assertion FAILS.
+        assert actual_custom is not actual_base or custom_scores == base_scores, (
+            "BUG CONFIRMED: _calculate_win_rates received identical objects for "
+            "custom and base results — both downloaded from custom model's S3 path."
+        )
+
+        # Verify the actual score values differ (stronger check)
+        custom_first_score = actual_custom[0]["automatedEvaluationResult"]["scores"][0]["result"]
+        base_first_score = actual_base[0]["automatedEvaluationResult"]["scores"][0]["result"]
+        assert custom_first_score == custom_scores[0], (
+            f"Custom results first score should be {custom_scores[0]}, got {custom_first_score}"
+        )
+        assert base_first_score == base_scores[0], (
+            f"BUG CONFIRMED: Base results first score is {base_first_score} "
+            f"(same as custom {custom_scores[0]}) instead of expected {base_scores[0]}. "
+            f"Base per-example results were downloaded from the custom model's S3 path."
+        )
+
+
+
+class TestPreservationProperty:
+    """Preservation property tests for _show_llmaj_results behavior.
+
+    **Validates: Requirements 3.1, 3.2, 3.3, 3.4, 3.5, 3.6**
+
+    Property 2: Preservation - Non-Two-Model and Non-Win-Rate Behavior Unchanged
+
+    These tests observe and assert the current behavior of _show_llmaj_results for
+    code paths that are NOT affected by the two-model win-rate bug. They must PASS
+    on both unfixed and fixed code, confirming no regressions.
+    """
+
+    # --- Requirement 3.1: Single-model evaluation displays aggregate + per-example, no win rates ---
+
+    @pytest.mark.parametrize(
+        "result_count",
+        [1, 3, 5, 10],
+        ids=["1_result", "3_results", "5_results", "10_results"],
+    )
+    @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics")
+    @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_single_model_no_win_rates(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_display_single,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_calculate_win,
+        mock_display_aggregate,
+        mock_pipeline_execution,
+        result_count,
+    ):
+        """Single-model evaluation displays aggregate and per-example results without win rates.
+
+        Observed behavior: When only custom_job_name exists (no base_job_name),
+        _calculate_win_rates is never called.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        # Only custom model, no base
+        mock_extract_job.side_effect = ["custom-job", None]
+
+        custom_aggregate = {"results": {"Metric1": {"score": 0.9, "total_evaluations": result_count}}}
+        mock_download_aggregate.return_value = (custom_aggregate, "bedrock-job-custom")
+
+        mock_results = [
+            {
+                "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                "modelResponses": [{"response": "['A']"}],
+                "automatedEvaluationResult": {
+                    "scores": [{"metricName": "Metric1", "result": 0.9}]
+                },
+            }
+        ] * result_count
+        mock_download_results.return_value = mock_results
+
+        _show_llmaj_results(mock_pipeline_execution, limit=result_count, offset=0)
+
+        # _calculate_win_rates must NOT be called for single-model evaluations
+        mock_calculate_win.assert_not_called()
+        # Aggregate metrics should be displayed with None for base
+        mock_display_aggregate.assert_called_once_with(custom_aggregate, None, mock_console)
+        # Per-example results should be displayed
+        assert mock_display_single.call_count == result_count
+
+    # --- Requirement 3.2: Base aggregate FileNotFoundError -> custom-only results ---
+
+    @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics")
+    @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_base_aggregate_not_found_displays_custom_only(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_display_single,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_calculate_win,
+        mock_display_aggregate,
+        mock_pipeline_execution,
+    ):
+        """When base aggregate raises FileNotFoundError, function continues with custom-only.
+
+        Observed behavior: base_aggregate stays None, _display_aggregate_metrics is called
+        with custom_aggregate and None for base.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        # Two-model evaluation
+        mock_extract_job.side_effect = ["custom-job", "base-job"]
+
+        custom_aggregate = {"results": {"Metric1": {"score": 0.85, "total_evaluations": 5}}}
+
+        # Custom aggregate succeeds, base aggregate raises FileNotFoundError
+        mock_download_aggregate.side_effect = [
+            (custom_aggregate, "bedrock-job-custom"),
+            FileNotFoundError("Base aggregate not found in S3"),
+        ]
+
+        mock_results = [
+            {
+                "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                "modelResponses": [{"response": "['A']"}],
+                "automatedEvaluationResult": {
+                    "scores": [{"metricName": "Metric1", "result": 0.85}]
+                },
+            }
+        ] * 3
+        mock_download_results.return_value = mock_results
+
+        # Should NOT raise — function continues gracefully
+        _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0)
+
+        # Aggregate displayed with None for base
+        mock_display_aggregate.assert_called_once_with(custom_aggregate, None, mock_console)
+        # Per-example results still displayed
+        assert mock_display_single.call_count == 3
+
+    # --- Requirement 3.3: Pagination with limit and offset ---
+
+    @pytest.mark.parametrize(
+        "limit,offset,total,expected_display_count",
+        [
+            (3, 2, 10, 3),
+            (1, 0, 5, 1),
+            (5, 0, 3, 3),
+            (20, 5, 10, 5),
+            (2, 8, 10, 2),
+            (10, 0, 10, 10),
+        ],
+        ids=[
+            "limit3_offset2_total10",
+            "limit1_offset0_total5",
+            "limit5_offset0_total3",
+            "limit20_offset5_total10",
+            "limit2_offset8_total10",
+            "limit10_offset0_total10",
+        ],
+    )
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_pagination_display_count(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_display_single,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_pipeline_execution,
+        limit,
+        offset,
+        total,
+        expected_display_count,
+    ):
+        """Pagination calls _display_single_llmaj_evaluation the correct number of times.
+
+        Observed behavior: display count = min(limit, total - offset) when offset < total.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        mock_extract_job.side_effect = ["custom-job", None]
+        mock_download_aggregate.return_value = ({"results": {}}, "bedrock-job")
+
+        mock_results = [
+            {
+                "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                "modelResponses": [{"response": "['A']"}],
+                "automatedEvaluationResult": {
+                    "scores": [{"metricName": "M", "result": 0.5}]
+                },
+            }
+        ] * total
+        mock_download_results.return_value = mock_results
+
+        _show_llmaj_results(mock_pipeline_execution, limit=limit, offset=offset)
+
+        assert mock_display_single.call_count == expected_display_count
+
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_pagination_starts_at_correct_index(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_display_single,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_pipeline_execution,
+    ):
+        """Pagination with limit=3, offset=2 on 10 results starts at index 2.
+
+        Observed behavior: _display_single_llmaj_evaluation is called with indices 2, 3, 4.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        mock_extract_job.side_effect = ["custom-job", None]
+        mock_download_aggregate.return_value = ({"results": {}}, "bedrock-job")
+
+        # Create 10 distinct results so we can verify indices
+        mock_results = [
+            {
+                "inputRecord": {"prompt": f"[{{'role': 'user', 'content': 'Q{i}'}}]"},
+                "modelResponses": [{"response": f"['A{i}']"}],
+                "automatedEvaluationResult": {
+                    "scores": [{"metricName": "M", "result": float(i) / 10}]
+                },
+            }
+            for i in range(10)
+        ]
+        mock_download_results.return_value = mock_results
+
+        _show_llmaj_results(mock_pipeline_execution, limit=3, offset=2)
+
+        assert mock_display_single.call_count == 3
+        # Verify the indices passed to _display_single_llmaj_evaluation
+        for call_idx, expected_i in enumerate([2, 3, 4]):
+            actual_call = mock_display_single.call_args_list[call_idx]
+            # Args: (result, index, total, console, show_explanations=...)
+            assert actual_call[0][0] == mock_results[expected_i], (
+                f"Call {call_idx}: expected result at index {expected_i}"
+            )
+            assert actual_call[0][1] == expected_i, (
+                f"Call {call_idx}: expected index arg {expected_i}, got {actual_call[0][1]}"
+            )
+
+    # --- Requirement 3.4: show_explanations passthrough ---
+
+    @pytest.mark.parametrize(
+        "show_explanations",
+        [True, False],
+        ids=["explanations_on", "explanations_off"],
+    )
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_show_explanations_passthrough(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_display_single,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_pipeline_execution,
+        show_explanations,
+    ):
+        """show_explanations value is passed through to _display_single_llmaj_evaluation.
+
+        Observed behavior: Each call to _display_single_llmaj_evaluation receives
+        show_explanations as a keyword argument matching the value passed to _show_llmaj_results.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        mock_extract_job.side_effect = ["custom-job", None]
+        mock_download_aggregate.return_value = ({"results": {}}, "bedrock-job")
+
+        mock_results = [
+            {
+                "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                "modelResponses": [{"response": "['A']"}],
+                "automatedEvaluationResult": {
+                    "scores": [{"metricName": "M", "result": 0.8}]
+                },
+            }
+        ] * 3
+        mock_download_results.return_value = mock_results
+
+        _show_llmaj_results(
+            mock_pipeline_execution, limit=3, offset=0, show_explanations=show_explanations
+        )
+
+        assert mock_display_single.call_count == 3
+        for c in mock_display_single.call_args_list:
+            assert c[1]["show_explanations"] == show_explanations, (
+                f"Expected show_explanations={show_explanations}, got {c[1]}"
+            )
+
+    # --- Requirement 3.5: Per-example FileNotFoundError -> warning + aggregate display ---
+
+    @pytest.mark.parametrize(
+        "has_aggregate",
+        [True, False],
+        ids=["with_aggregate", "without_aggregate"],
+    )
+    @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_per_example_not_found_displays_aggregate_if_available(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_display_single,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_display_aggregate,
+        mock_pipeline_execution,
+        has_aggregate,
+    ):
+        """When per-example results raise FileNotFoundError, aggregate metrics still display.
+
+        Observed behavior: custom_results stays None, _display_single_llmaj_evaluation is
+        never called, but _display_aggregate_metrics is called if aggregate was downloaded.
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        mock_extract_job.side_effect = ["custom-job", None]
+
+        if has_aggregate:
+            custom_aggregate = {"results": {"M": {"score": 0.9, "total_evaluations": 5}}}
+            mock_download_aggregate.return_value = (custom_aggregate, "bedrock-job")
+        else:
+            mock_download_aggregate.side_effect = FileNotFoundError("Aggregate not found")
+
+        # Per-example download fails
+        mock_download_results.side_effect = FileNotFoundError("Per-example results not found")
+
+        # Should NOT raise
+        _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0)
+
+        # No per-example results displayed
+        mock_display_single.assert_not_called()
+
+        if has_aggregate:
+            mock_display_aggregate.assert_called_once()
+        else:
+            mock_display_aggregate.assert_not_called()
+
+    # --- Requirement 3.6: Genuinely identical data -> 100% ties is legitimate ---
+    # (This is tested via _calculate_win_rates directly in TestCalculateWinRates.test_calculate_ties)
+    # Here we verify the integration: when both models return identical per-example data,
+    # _calculate_win_rates IS called and its result is displayed.
+
+    @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics")
+    @patch("sagemaker.train.common_utils.show_results_utils._display_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3")
+    @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json")
+    @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps")
+    @patch("rich.console.Console")
+    def test_identical_data_win_rates_still_calculated(
+        self,
+        mock_console_class,
+        mock_extract_job,
+        mock_download_aggregate,
+        mock_download_results,
+        mock_calculate_win,
+        mock_display_win,
+        mock_display_aggregate,
+        mock_pipeline_execution,
+    ):
+        """When both models have identical per-example data, win rates are still calculated.
+
+        Observed behavior: _calculate_win_rates is called when both custom_results and
+        base_results are non-None, regardless of whether the data is identical.
+        This preserves the legitimate 100% ties case (Requirement 3.6).
+        """
+        mock_console = MagicMock()
+        mock_console_class.return_value = mock_console
+
+        mock_extract_job.side_effect = ["custom-job", "base-job"]
+
+        mock_download_aggregate.side_effect = [
+            ({"results": {"M": {"score": 0.8}}}, "same-bedrock-job"),
+            ({"results": {"M": {"score": 0.8}}}, "same-bedrock-job"),
+        ]
+
+        identical_results = [
+            {
+                "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"},
+                "modelResponses": [{"response": "['A']"}],
+                "automatedEvaluationResult": {
+                    "scores": [{"metricName": "M", "result": 0.8}]
+                },
+            }
+        ] * 5
+        # Both calls return the same data (this is the current buggy behavior AND
+        # the legitimate case when models genuinely tie)
+        mock_download_results.return_value = identical_results
+
+        win_rates = {
+            "custom_wins": 0, "base_wins": 0, "ties": 5, "total": 5,
+            "custom_win_rate": 0.0, "base_win_rate": 0.0, "tie_rate": 1.0,
+        }
+        mock_calculate_win.return_value = win_rates
+
+        _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0)
+
+        # Win rates should still be calculated and displayed
+        mock_calculate_win.assert_called_once()
+        mock_display_win.assert_called_once_with(win_rates, mock_console)

From 7a13971fb5c5c7daaaefb31fa18470ab57020dcc Mon Sep 17 00:00:00 2001
From: Gokul A <nargokul@amazon.com>
Date: Thu, 5 Mar 2026 13:03:18 -0800
Subject: [PATCH 2/4] fix(model_builder): Only set s3_upload_path for S3 URIs
 in passthrough

In _build_for_passthrough(), model_path could be a local /tmp path.
Setting s3_upload_path to a local path caused CreateModel API to reject
the modelDataUrl with a validation error since it requires s3:// or
https:// URIs. Now only S3 URIs are assigned to s3_upload_path; local
paths are handled separately by _prepare_for_mode() in LOCAL_CONTAINER
mode.
---
 sagemaker-serve/src/sagemaker/serve/model_builder.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py
index 01463729ac..c2ba0c36eb 100644
--- a/sagemaker-serve/src/sagemaker/serve/model_builder.py
+++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py
@@ -1289,10 +1289,10 @@ def _build_for_passthrough(self) -> Model:
 
         self.secret_key = ""
 
-        if not self.model_path:
-            self.s3_upload_path = None
-        else:
+        if self.model_path and self.model_path.startswith("s3://"):
             self.s3_upload_path = self.model_path
+        else:
+            self.s3_upload_path = None
 
         if self.mode in LOCAL_MODES:
             self._prepare_for_mode()

From 78cff41736e99ff08c022dcfb847d91ef70d3a28 Mon Sep 17 00:00:00 2001
From: Gokul A <nargokul@amazon.com>
Date: Thu, 5 Mar 2026 15:56:31 -0800
Subject: [PATCH 3/4] Test fixes

---
 .../src/sagemaker/train/evaluate/pipeline_templates.py     | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py
index b14145ff31..3848ef0d5c 100644
--- a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py
+++ b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py
@@ -1028,9 +1028,6 @@
                 {% if kms_key_id %},
                 "KmsKeyId": "{{ kms_key_id }}"
                 {% endif %}
-                },
-                "ModelPackageConfig": {
-                    "ModelPackageGroupArn": "{{ model_package_group_arn }}"
                 }{% if dataset_uri %},
                 "InputDataConfig": [
                     {
@@ -1182,10 +1179,6 @@
                 {% if kms_key_id %},
                 "KmsKeyId": "{{ kms_key_id }}"
                 {% endif %}
-                },
-                "ModelPackageConfig": {
-                    "ModelPackageGroupArn": "{{ model_package_group_arn }}",
-                    "SourceModelPackageArn": "{{ source_model_package_arn }}"
                 }
             }
         },{% endif %}

From c1d8d1a0258bb8b248f97ef200abf3e95c4b912c Mon Sep 17 00:00:00 2001
From: Gokul A <nargokul@amazon.com>
Date: Fri, 6 Mar 2026 12:03:04 -0800
Subject: [PATCH 4/4] Bug fix 3 and 4

---
 .../modules/local_core/local_container.py     |  20 ++-
 .../sagemaker/train/local/local_container.py  |  20 ++-
 .../tests/unit/train/test_defaults.py         | 170 ++++++++++++++++++
 3 files changed, 202 insertions(+), 8 deletions(-)

diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py
index 52624c7216..43a5ca502d 100644
--- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py
+++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py
@@ -17,6 +17,7 @@
 import os
 import re
 import shutil
+import stat
 import subprocess
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Optional
@@ -57,6 +58,17 @@
 SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
 
 
+def _rmtree(path):
+    """Remove a directory tree, handling root-owned files from Docker containers."""
+    def _onerror(func, path, exc_info):
+        if isinstance(exc_info[1], PermissionError):
+            os.chmod(path, stat.S_IRWXU)
+            func(path)
+        else:
+            raise exc_info[1]
+    shutil.rmtree(path, onerror=_onerror)
+
+
 class _LocalContainer(BaseModel):
     """A local training job class for local mode model trainer.
 
@@ -209,12 +221,12 @@ def train(
         # Print our Job Complete line
         logger.info("Local training job completed, output artifacts saved to %s", artifacts)
 
-        shutil.rmtree(os.path.join(self.container_root, "input"))
-        shutil.rmtree(os.path.join(self.container_root, "shared"))
+        _rmtree(os.path.join(self.container_root, "input"))
+        _rmtree(os.path.join(self.container_root, "shared"))
         for host in self.hosts:
-            shutil.rmtree(os.path.join(self.container_root, host))
+            _rmtree(os.path.join(self.container_root, host))
         for folder in self._temporary_folders:
-            shutil.rmtree(os.path.join(self.container_root, folder))
+            _rmtree(os.path.join(self.container_root, folder))
         return artifacts
 
     def retrieve_artifacts(
diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py
index 89cf11c873..bcf84395ce 100644
--- a/sagemaker-train/src/sagemaker/train/local/local_container.py
+++ b/sagemaker-train/src/sagemaker/train/local/local_container.py
@@ -18,6 +18,7 @@
 import os
 import re
 import shutil
+import stat
 import subprocess
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Optional
@@ -65,6 +66,17 @@
 SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
 
 
+def _rmtree(path):
+    """Remove a directory tree, handling root-owned files from Docker containers."""
+    def _onerror(func, path, exc_info):
+        if isinstance(exc_info[1], PermissionError):
+            os.chmod(path, stat.S_IRWXU)
+            func(path)
+        else:
+            raise exc_info[1]
+    shutil.rmtree(path, onerror=_onerror)
+
+
 class _LocalContainer(BaseModel):
     """A local training job class for local mode model trainer.
 
@@ -217,12 +229,12 @@ def train(
         # Print our Job Complete line
         logger.info("Local training job completed, output artifacts saved to %s", artifacts)
 
-        shutil.rmtree(os.path.join(self.container_root, "input"))
-        shutil.rmtree(os.path.join(self.container_root, "shared"))
+        _rmtree(os.path.join(self.container_root, "input"))
+        _rmtree(os.path.join(self.container_root, "shared"))
         for host in self.hosts:
-            shutil.rmtree(os.path.join(self.container_root, host))
+            _rmtree(os.path.join(self.container_root, host))
         for folder in self._temporary_folders:
-            shutil.rmtree(os.path.join(self.container_root, folder))
+            _rmtree(os.path.join(self.container_root, folder))
         return artifacts
 
     def retrieve_artifacts(
diff --git a/sagemaker-train/tests/unit/train/test_defaults.py b/sagemaker-train/tests/unit/train/test_defaults.py
index 848e46b2c5..bf71e2247e 100644
--- a/sagemaker-train/tests/unit/train/test_defaults.py
+++ b/sagemaker-train/tests/unit/train/test_defaults.py
@@ -25,6 +25,7 @@
     DEFAULT_MAX_RUNTIME_IN_SECONDS,
 )
 from sagemaker.train.configs import Compute, StoppingCondition
+from sagemaker.core.shapes import InstanceGroup
 
 
 class TestDefaultConstants:
@@ -435,3 +436,172 @@ def test_uses_default_volume_size_when_not_in_document(
         )
 
         assert result.volume_size_in_gb == DEFAULT_VOLUME_SIZE
+
+
+    def test_does_not_set_instance_type_when_instance_groups_configured(self):
+        """Test instance_type is not overwritten when instance_groups are set."""
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=30,
+        )
+        result = TrainDefaults.get_compute(compute=compute)
+        assert result.instance_type is None
+
+    def test_does_not_set_instance_count_when_instance_groups_configured(self):
+        """Test instance_count is not overwritten when instance_groups are set."""
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=30,
+        )
+        result = TrainDefaults.get_compute(compute=compute)
+        assert result.instance_count is None
+
+    def test_sets_volume_size_when_instance_groups_configured(self):
+        """Test volume_size_in_gb is still set when instance_groups are configured."""
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=None,
+        )
+        result = TrainDefaults.get_compute(compute=compute)
+        assert result.volume_size_in_gb == DEFAULT_VOLUME_SIZE
+
+    def test_preserves_existing_volume_size_with_instance_groups(self):
+        """Test existing volume_size_in_gb is preserved when instance_groups are configured."""
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=100,
+        )
+        result = TrainDefaults.get_compute(compute=compute)
+        assert result.volume_size_in_gb == 100
+
+
+class TestJumpStartTrainDefaultsGetComputeHeterogeneousCluster:
+    """Test JumpStartTrainDefaults.get_compute with heterogeneous cluster (instance_groups)."""
+
+    @patch("sagemaker.train.defaults.get_hub_content_and_document")
+    @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session")
+    def test_does_not_set_instance_type_when_instance_groups_configured(
+        self, mock_get_session, mock_get_hub_content
+    ):
+        """Test instance_type is not overwritten when instance_groups are set."""
+        mock_session = MagicMock()
+        mock_get_session.return_value = mock_session
+
+        mock_document = MagicMock()
+        mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge"
+        mock_document.TrainingVolumeSize = 100
+        mock_get_hub_content.return_value = (None, mock_document)
+
+        mock_config = MagicMock()
+        mock_config.training_config_name = None
+
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=30,
+        )
+        result = JumpStartTrainDefaults.get_compute(
+            jumpstart_config=mock_config,
+            compute=compute,
+            sagemaker_session=mock_session,
+        )
+        assert result.instance_type is None
+
+    @patch("sagemaker.train.defaults.get_hub_content_and_document")
+    @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session")
+    def test_does_not_set_instance_count_when_instance_groups_configured(
+        self, mock_get_session, mock_get_hub_content
+    ):
+        """Test instance_count is not overwritten when instance_groups are set."""
+        mock_session = MagicMock()
+        mock_get_session.return_value = mock_session
+
+        mock_document = MagicMock()
+        mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge"
+        mock_document.TrainingVolumeSize = 100
+        mock_get_hub_content.return_value = (None, mock_document)
+
+        mock_config = MagicMock()
+        mock_config.training_config_name = None
+
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=30,
+        )
+        result = JumpStartTrainDefaults.get_compute(
+            jumpstart_config=mock_config,
+            compute=compute,
+            sagemaker_session=mock_session,
+        )
+        assert result.instance_count is None
+
+    @patch("sagemaker.train.defaults.get_hub_content_and_document")
+    @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session")
+    def test_sets_volume_size_from_document_when_instance_groups_configured(
+        self, mock_get_session, mock_get_hub_content
+    ):
+        """Test volume_size_in_gb is set from document even when instance_groups are configured."""
+        mock_session = MagicMock()
+        mock_get_session.return_value = mock_session
+
+        mock_document = MagicMock()
+        mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge"
+        mock_document.TrainingVolumeSize = 100
+        mock_get_hub_content.return_value = (None, mock_document)
+
+        mock_config = MagicMock()
+        mock_config.training_config_name = None
+
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=None,
+        )
+        result = JumpStartTrainDefaults.get_compute(
+            jumpstart_config=mock_config,
+            compute=compute,
+            sagemaker_session=mock_session,
+        )
+        assert result.volume_size_in_gb == 100
+
+    @patch("sagemaker.train.defaults.get_hub_content_and_document")
+    @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session")
+    def test_sets_default_volume_size_when_instance_groups_and_no_document_volume(
+        self, mock_get_session, mock_get_hub_content
+    ):
+        """Test DEFAULT_VOLUME_SIZE is used when instance_groups set and document has no volume."""
+        mock_session = MagicMock()
+        mock_get_session.return_value = mock_session
+
+        mock_document = MagicMock()
+        mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge"
+        mock_document.TrainingVolumeSize = None
+        mock_get_hub_content.return_value = (None, mock_document)
+
+        mock_config = MagicMock()
+        mock_config.training_config_name = None
+
+        compute = Compute(
+            instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")],
+            instance_type=None,
+            instance_count=None,
+            volume_size_in_gb=None,
+        )
+        result = JumpStartTrainDefaults.get_compute(
+            jumpstart_config=mock_config,
+            compute=compute,
+            sagemaker_session=mock_session,
+        )
+        assert result.volume_size_in_gb == DEFAULT_VOLUME_SIZE