From 18443589462121af2691be3de1761d5312de0b5c Mon Sep 17 00:00:00 2001 From: Gokul A Date: Thu, 5 Mar 2026 12:14:43 -0800 Subject: [PATCH 1/4] V3 Bug Fixes --- .../src/sagemaker/core/processing.py | 3 + sagemaker-core/tests/unit/test_processing.py | 255 +++++++ .../serve/mode/local_container_mode.py | 24 +- .../src/sagemaker/serve/model_builder.py | 13 +- .../tests/unit/test_model_builder_methods.py | 311 +++++++++ .../train/common_utils/show_results_utils.py | 7 +- .../common_utils/test_show_results_utils.py | 657 ++++++++++++++++++ 7 files changed, 1264 insertions(+), 6 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/processing.py b/sagemaker-core/src/sagemaker/core/processing.py index 65dadeda33..736eebf01c 100644 --- a/sagemaker-core/src/sagemaker/core/processing.py +++ b/sagemaker-core/src/sagemaker/core/processing.py @@ -487,6 +487,9 @@ def _normalize_outputs(self, outputs=None): # If the output's s3_uri is not an s3_uri, create one. parse_result = urlparse(output.s3_output.s3_uri) if parse_result.scheme != "s3": + if getattr(self.sagemaker_session, "local_mode", False) and parse_result.scheme == "file": + normalized_outputs.append(output) + continue if _pipeline_config: s3_uri = Join( on="/", diff --git a/sagemaker-core/tests/unit/test_processing.py b/sagemaker-core/tests/unit/test_processing.py index e7dbdc17bc..dbe8d5f9ef 100644 --- a/sagemaker-core/tests/unit/test_processing.py +++ b/sagemaker-core/tests/unit/test_processing.py @@ -238,6 +238,261 @@ def test_normalize_outputs_invalid_type(self, mock_session): processor._normalize_outputs(["invalid"]) + + +class TestBugConditionFileUriReplacedInLocalMode: + """Bug condition exploration test: file:// URIs should be preserved in local mode. + + **Validates: Requirements 1.1, 1.2, 2.1, 2.2** + + EXPECTED TO FAIL on unfixed code — failure confirms the bug exists. + The bug is that _normalize_outputs() replaces file:// URIs with s3:// paths + even when the session is a LocalSession (local_mode=True). + """ + + @pytest.fixture + def local_mock_session(self): + session = Mock() + session.boto_session = Mock() + session.boto_session.region_name = "us-west-2" + session.sagemaker_client = Mock() + session.default_bucket = Mock(return_value="default-bucket") + session.default_bucket_prefix = "prefix" + session.expand_role = Mock(side_effect=lambda x: x) + session.sagemaker_config = {} + session.local_mode = True + return session + + @pytest.mark.parametrize( + "file_uri", + [ + "file:///tmp/output", + "file:///home/user/results", + "file:///data/processed", + ], + ) + def test_normalize_outputs_preserves_file_uri_in_local_mode(self, local_mock_session, file_uri): + """file:// URIs must be preserved when local_mode=True. + + On unfixed code, _normalize_outputs replaces file:// URIs with + s3://default-bucket/prefix/job-name/output/output-1, which is the bug. + """ + processor = Processor( + role="arn:aws:iam::123456789012:role/SageMakerRole", + image_uri="test-image:latest", + instance_count=1, + instance_type="ml.m5.xlarge", + sagemaker_session=local_mock_session, + ) + processor._current_job_name = "test-job" + + s3_output = ProcessingS3Output( + s3_uri=file_uri, + local_path="/opt/ml/processing/output", + s3_upload_mode="EndOfJob", + ) + outputs = [ProcessingOutput(output_name="my-output", s3_output=s3_output)] + + with patch("sagemaker.core.workflow.utilities._pipeline_config", None): + result = processor._normalize_outputs(outputs) + + assert len(result) == 1 + assert result[0].s3_output.s3_uri == file_uri, ( + f"Expected file:// URI to be preserved as '{file_uri}' in local mode, " + f"but got '{result[0].s3_output.s3_uri}'" + ) + + +class TestPreservationNonLocalFileBehavior: + """Preservation property tests: Non-local-file behavior must remain unchanged. + + **Validates: Requirements 3.1, 3.2, 3.3, 3.4** + + These tests capture baseline behavior on UNFIXED code. They MUST PASS on both + unfixed and fixed code, confirming no regressions are introduced by the fix. + """ + + @pytest.fixture + def session_local_mode_true(self): + session = Mock() + session.boto_session = Mock() + session.boto_session.region_name = "us-west-2" + session.sagemaker_client = Mock() + session.default_bucket = Mock(return_value="default-bucket") + session.default_bucket_prefix = "prefix" + session.expand_role = Mock(side_effect=lambda x: x) + session.sagemaker_config = {} + session.local_mode = True + return session + + @pytest.fixture + def session_local_mode_false(self): + session = Mock() + session.boto_session = Mock() + session.boto_session.region_name = "us-west-2" + session.sagemaker_client = Mock() + session.default_bucket = Mock(return_value="default-bucket") + session.default_bucket_prefix = "prefix" + session.expand_role = Mock(side_effect=lambda x: x) + session.sagemaker_config = {} + session.local_mode = False + return session + + def _make_processor(self, session): + processor = Processor( + role="arn:aws:iam::123456789012:role/SageMakerRole", + image_uri="test-image:latest", + instance_count=1, + instance_type="ml.m5.xlarge", + sagemaker_session=session, + ) + processor._current_job_name = "test-job" + return processor + + # --- Requirement 3.1: S3 URIs pass through unchanged regardless of local_mode --- + + @pytest.mark.parametrize( + "s3_uri,local_mode_fixture", + [ + ("s3://my-bucket/path", "session_local_mode_true"), + ("s3://my-bucket/path", "session_local_mode_false"), + ("s3://another-bucket/deep/nested/path", "session_local_mode_true"), + ("s3://another-bucket/deep/nested/path", "session_local_mode_false"), + ], + ) + def test_s3_uri_preserved_regardless_of_local_mode(self, s3_uri, local_mode_fixture, request): + """S3 URIs must pass through unchanged regardless of local_mode setting. + + **Validates: Requirements 3.1** + """ + session = request.getfixturevalue(local_mode_fixture) + processor = self._make_processor(session) + + s3_output = ProcessingS3Output( + s3_uri=s3_uri, + local_path="/opt/ml/processing/output", + s3_upload_mode="EndOfJob", + ) + outputs = [ProcessingOutput(output_name="my-output", s3_output=s3_output)] + + with patch("sagemaker.core.workflow.utilities._pipeline_config", None): + result = processor._normalize_outputs(outputs) + + assert len(result) == 1 + assert result[0].s3_output.s3_uri == s3_uri + + # --- Requirement 3.2: Non-S3 URIs with local_mode=False replaced with S3 paths --- + + @pytest.mark.parametrize( + "non_s3_uri", + [ + "/local/output/path", + "http://example.com/output", + "ftp://server/output", + ], + ) + def test_non_s3_uri_replaced_when_not_local_mode(self, non_s3_uri, session_local_mode_false): + """Non-S3 URIs in non-local sessions are replaced with auto-generated S3 paths. + + **Validates: Requirements 3.2** + """ + processor = self._make_processor(session_local_mode_false) + + s3_output = ProcessingS3Output( + s3_uri=non_s3_uri, + local_path="/opt/ml/processing/output", + s3_upload_mode="EndOfJob", + ) + outputs = [ProcessingOutput(output_name="output-1", s3_output=s3_output)] + + with patch("sagemaker.core.workflow.utilities._pipeline_config", None): + result = processor._normalize_outputs(outputs) + + assert len(result) == 1 + assert result[0].s3_output.s3_uri.startswith("s3://default-bucket/") + + # --- Requirement 3.3: Pipeline variable URIs skip normalization --- + + def test_pipeline_variable_uri_skips_normalization(self, session_local_mode_false): + """Pipeline variable URIs skip normalization entirely. + + **Validates: Requirements 3.3** + """ + processor = self._make_processor(session_local_mode_false) + + s3_output = ProcessingS3Output( + s3_uri="s3://bucket/output", + local_path="/opt/ml/processing/output", + s3_upload_mode="EndOfJob", + ) + outputs = [ProcessingOutput(output_name="output-1", s3_output=s3_output)] + + with patch("sagemaker.core.processing.is_pipeline_variable", return_value=True): + result = processor._normalize_outputs(outputs) + + assert len(result) == 1 + # Pipeline variable outputs are appended as-is without URI modification + assert result[0].s3_output.s3_uri == "s3://bucket/output" + + # --- Requirement 3.4: Non-ProcessingOutput objects raise TypeError --- + + @pytest.mark.parametrize( + "invalid_output", + [ + ["a string"], + [42], + [{"key": "value"}], + ], + ) + def test_non_processing_output_raises_type_error(self, invalid_output, session_local_mode_false): + """Non-ProcessingOutput objects must raise TypeError. + + **Validates: Requirements 3.4** + """ + processor = self._make_processor(session_local_mode_false) + + with pytest.raises(TypeError, match="must be provided as ProcessingOutput objects"): + processor._normalize_outputs(invalid_output) + + # --- Output name auto-generation --- + + def test_multiple_outputs_with_s3_uris_preserved(self, session_local_mode_false): + """Multiple outputs with S3 URIs are all preserved unchanged. + + **Validates: Requirements 3.1, 3.2** + """ + processor = self._make_processor(session_local_mode_false) + + outputs = [ + ProcessingOutput( + output_name="first-output", + s3_output=ProcessingS3Output( + s3_uri="s3://my-bucket/first", + local_path="/opt/ml/processing/output1", + s3_upload_mode="EndOfJob", + ), + ), + ProcessingOutput( + output_name="second-output", + s3_output=ProcessingS3Output( + s3_uri="s3://my-bucket/second", + local_path="/opt/ml/processing/output2", + s3_upload_mode="EndOfJob", + ), + ), + ] + + with patch("sagemaker.core.workflow.utilities._pipeline_config", None): + result = processor._normalize_outputs(outputs) + + assert len(result) == 2 + assert result[0].output_name == "first-output" + assert result[1].output_name == "second-output" + # S3 URIs should be preserved since they already have s3:// scheme + assert result[0].s3_output.s3_uri == "s3://my-bucket/first" + assert result[1].s3_output.s3_uri == "s3://my-bucket/second" + + class TestProcessorStartNew: def test_start_new_with_pipeline_session(self, mock_session): from sagemaker.core.workflow.pipeline_context import PipelineSession diff --git a/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py b/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py index bb8fdde960..3633b850e6 100644 --- a/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py +++ b/sagemaker-serve/src/sagemaker/serve/mode/local_container_mode.py @@ -3,6 +3,7 @@ from __future__ import absolute_import from pathlib import Path import logging +import os from datetime import datetime, timedelta from typing import Dict, Type import base64 @@ -10,6 +11,8 @@ import subprocess import docker +from sagemaker.core.local.utils import check_for_studio + from sagemaker.serve.model_server.tensorflow_serving.server import LocalTensorflowServing from sagemaker.serve.spec.inference_spec import InferenceSpec from sagemaker.serve.builder.schema_builder import SchemaBuilder @@ -33,6 +36,25 @@ + "Please increase container_timeout_seconds or review your inference code." ) +STUDIO_DOCKER_SOCKET_PATHS = [ + "/docker/proxy/docker.sock", + "/var/run/docker.sock", +] + + +def _get_docker_client(): + """Get a Docker client, handling SageMaker Studio's non-standard socket path.""" + if os.environ.get("DOCKER_HOST"): + return docker.from_env() + try: + if check_for_studio(): + for socket_path in STUDIO_DOCKER_SOCKET_PATHS: + if os.path.exists(socket_path): + return docker.DockerClient(base_url=f"unix://{socket_path}") + except (NotImplementedError, Exception): + pass + return docker.from_env() + class LocalContainerMode( LocalTorchServe, @@ -212,7 +234,7 @@ def _pull_image(self, image: str): # Check if Docker is available first try: - self.client = docker.from_env() + self.client = _get_docker_client() self.client.ping() # Test Docker connection except Exception as e: raise RuntimeError( diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py index 4f36b0dddb..01463729ac 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py @@ -133,7 +133,7 @@ ENDPOINT_CONFIG_ASYNC_KMS_KEY_ID_PATH, MODEL_CONTAINERS_PATH, ) -from sagemaker.serve.constants import SUPPORTED_MODEL_SERVERS, Framework +from sagemaker.serve.constants import LOCAL_MODES, SUPPORTED_MODEL_SERVERS, Framework from sagemaker.core.workflow.pipeline_context import PipelineSession, runnable_by_pipeline from sagemaker.core import fw_utils from sagemaker.core.helper.session_helper import container_def @@ -1287,7 +1287,16 @@ def _build_for_passthrough(self) -> Model: if not self.image_uri: raise ValueError("image_uri is required for pass-through cases") - self.s3_upload_path = None + self.secret_key = "" + + if not self.model_path: + self.s3_upload_path = None + else: + self.s3_upload_path = self.model_path + + if self.mode in LOCAL_MODES: + self._prepare_for_mode() + return self._create_model() def _build_default_async_inference_config(self, async_inference_config): diff --git a/sagemaker-serve/tests/unit/test_model_builder_methods.py b/sagemaker-serve/tests/unit/test_model_builder_methods.py index 6c126e4e96..400eb70047 100644 --- a/sagemaker-serve/tests/unit/test_model_builder_methods.py +++ b/sagemaker-serve/tests/unit/test_model_builder_methods.py @@ -22,6 +22,7 @@ from sagemaker.core.serializers import NumpySerializer, TorchTensorSerializer from sagemaker.core.deserializers import JSONDeserializer, TorchTensorDeserializer from sagemaker.serve.constants import Framework +from sagemaker.serve.mode.function_pointers import Mode class TestModelBuilderSimpleMethods: @@ -299,3 +300,313 @@ def test_get_client_translators_raises_on_no_deserializer(self): with patch.object(builder, '_fetch_serializer_and_deserializer_for_framework', return_value=(None, None)): with pytest.raises(ValueError, match="Cannot determine deserializer"): builder._get_client_translators() + + +class TestBuildForPassthroughLocalContainer: + """Tests for _build_for_passthrough() with LOCAL_CONTAINER mode. + + Validates: Requirements 2.1, 2.2, 3.1 + """ + + def _make_mock_session(self): + mock_session = Mock() + mock_session.boto_region_name = "us-west-2" + mock_session.config = {} + mock_session.boto_session = Mock() + mock_session.boto_session.region_name = "us-west-2" + return mock_session + + @patch.object(ModelBuilder, '_create_model') + @patch.object(ModelBuilder, '_prepare_for_mode') + def test_build_for_passthrough_initializes_secret_key( + self, mock_prepare, mock_create + ): + """Test that _build_for_passthrough initializes secret_key for LOCAL_CONTAINER mode. + + Bug 1 fix: secret_key must be set to empty string so _deploy_local_endpoint() + can pass it to LocalEndpoint.create() without raising AttributeError. + """ + mock_create.return_value = Mock() + + builder = ModelBuilder( + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.LOCAL_CONTAINER, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + + builder._build_for_passthrough() + + assert builder.secret_key == "" + + @patch.object(ModelBuilder, '_create_model') + @patch.object(ModelBuilder, '_prepare_for_mode') + def test_build_for_passthrough_calls_prepare_for_mode_local_container( + self, mock_prepare, mock_create + ): + """Test that _build_for_passthrough calls _prepare_for_mode for LOCAL_CONTAINER. + + Bug 2 fix: _prepare_for_mode() must be called so that + self.modes[str(Mode.LOCAL_CONTAINER)] contains a LocalContainerMode object, + preventing KeyError when _deploy_local_endpoint() accesses it. + """ + mock_create.return_value = Mock() + + builder = ModelBuilder( + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.LOCAL_CONTAINER, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + + builder._build_for_passthrough() + + mock_prepare.assert_called_once() + + @patch.object(ModelBuilder, '_create_model') + @patch.object(ModelBuilder, '_prepare_for_mode') + def test_build_for_passthrough_does_not_call_prepare_for_mode_sagemaker_endpoint( + self, mock_prepare, mock_create + ): + """Test that _build_for_passthrough does NOT call _prepare_for_mode for SAGEMAKER_ENDPOINT. + + Preservation: SAGEMAKER_ENDPOINT mode passthrough must continue to work without + calling _prepare_for_mode() (endpoint mode has its own preparation in _create_model). + """ + mock_create.return_value = Mock() + + builder = ModelBuilder( + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.SAGEMAKER_ENDPOINT, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + + builder._build_for_passthrough() + + mock_prepare.assert_not_called() + + +class TestBuildForPassthroughS3PathPreservation: + """Tests for _build_for_passthrough() S3 path preservation with ModelTrainer. + + Validates: Requirements 2.4, 3.5 + """ + + def _make_mock_session(self): + mock_session = Mock() + mock_session.boto_region_name = "us-west-2" + mock_session.config = {} + mock_session.boto_session = Mock() + mock_session.boto_session.region_name = "us-west-2" + return mock_session + + @patch.object(ModelBuilder, '_create_model') + @patch.object(ModelBuilder, '_prepare_for_mode') + def test_build_for_passthrough_preserves_model_path_as_s3_upload_path( + self, mock_prepare, mock_create + ): + """Test that _build_for_passthrough preserves model_path as s3_upload_path. + + Bug 4 fix: When ModelTrainer has set model_path to an S3 URI, + _build_for_passthrough() must preserve it as s3_upload_path instead of + unconditionally setting it to None. + + **Validates: Requirements 2.4** + """ + mock_create.return_value = Mock() + + builder = ModelBuilder( + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.LOCAL_CONTAINER, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + builder.model_path = "s3://bucket/model.tar.gz" + + builder._build_for_passthrough() + + assert builder.s3_upload_path == "s3://bucket/model.tar.gz" + + @patch.object(ModelBuilder, '_create_model') + @patch.object(ModelBuilder, '_prepare_for_mode') + def test_build_for_passthrough_sets_s3_upload_path_none_when_no_model_path( + self, mock_prepare, mock_create + ): + """Test that _build_for_passthrough sets s3_upload_path to None when no model_path. + + Preservation: When no model artifacts are involved (model_path is None), + s3_upload_path should still be set to None as before. + + **Validates: Requirements 3.5** + """ + mock_create.return_value = Mock() + + builder = ModelBuilder( + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.LOCAL_CONTAINER, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + builder.model_path = None + + builder._build_for_passthrough() + + assert builder.s3_upload_path is None + + +class TestGetDockerClient: + """Tests for _get_docker_client() Studio Docker client initialization. + + Validates: Requirements 2.3, 3.3 + """ + + @patch("sagemaker.serve.mode.local_container_mode.os.path.exists") + @patch("sagemaker.serve.mode.local_container_mode.check_for_studio", return_value=True) + @patch("sagemaker.serve.mode.local_container_mode.docker") + def test_pull_image_studio_uses_socket_path( + self, mock_docker, mock_check_studio, mock_path_exists + ): + """Test that in Studio, _get_docker_client uses the proxy socket path. + + Bug 3 fix: When check_for_studio() returns True and the proxy socket exists, + DockerClient should be initialized with base_url pointing to that socket. + """ + from sagemaker.serve.mode.local_container_mode import _get_docker_client + + mock_path_exists.side_effect = lambda p: p == "/docker/proxy/docker.sock" + mock_client = Mock() + mock_docker.DockerClient.return_value = mock_client + + with patch.dict("os.environ", {}, clear=True): + result = _get_docker_client() + + mock_docker.DockerClient.assert_called_once_with( + base_url="unix:///docker/proxy/docker.sock" + ) + assert result == mock_client + + @patch("sagemaker.serve.mode.local_container_mode.check_for_studio", return_value=False) + @patch("sagemaker.serve.mode.local_container_mode.docker") + def test_pull_image_non_studio_uses_from_env(self, mock_docker, mock_check_studio): + """Test that in non-Studio environments, _get_docker_client uses docker.from_env(). + + Preservation: Standard Docker environments must continue to use docker.from_env(). + """ + from sagemaker.serve.mode.local_container_mode import _get_docker_client + + mock_client = Mock() + mock_docker.from_env.return_value = mock_client + + with patch.dict("os.environ", {}, clear=True): + result = _get_docker_client() + + mock_docker.from_env.assert_called_once() + assert result == mock_client + + @patch("sagemaker.serve.mode.local_container_mode.check_for_studio") + @patch("sagemaker.serve.mode.local_container_mode.docker") + def test_pull_image_with_docker_host_env_var(self, mock_docker, mock_check_studio): + """Test that when DOCKER_HOST is set, docker.from_env() is used regardless of Studio. + + When the user explicitly sets DOCKER_HOST, we should respect that and skip + Studio detection entirely. + """ + from sagemaker.serve.mode.local_container_mode import _get_docker_client + + mock_client = Mock() + mock_docker.from_env.return_value = mock_client + + with patch.dict("os.environ", {"DOCKER_HOST": "tcp://localhost:2375"}): + result = _get_docker_client() + + mock_docker.from_env.assert_called_once() + mock_check_studio.assert_not_called() + assert result == mock_client + + +class TestPreservationNonPassthroughBehavior: + """Tests for Property 4: Preservation - Non-Passthrough and Non-LOCAL_CONTAINER Behavior. + + Verifies that the bugfix does not alter behavior for: + - Model-server-specific build methods (e.g., _build_for_torchserve) + - SAGEMAKER_ENDPOINT passthrough mode + + **Validates: Requirements 3.1, 3.2, 3.3, 3.4, 3.5** + """ + + def _make_mock_session(self): + mock_session = Mock() + mock_session.boto_region_name = "us-west-2" + mock_session.config = {} + mock_session.boto_session = Mock() + mock_session.boto_session.region_name = "us-west-2" + return mock_session + + @patch('sagemaker.serve.model_builder.ModelBuilder._create_model') + @patch('sagemaker.serve.model_builder.ModelBuilder._prepare_for_mode') + @patch('sagemaker.serve.model_builder.ModelBuilder._save_model_inference_spec') + def test_build_for_torchserve_still_calls_prepare_for_mode( + self, mock_save_spec, mock_prepare, mock_create + ): + """Test that _build_for_torchserve still calls _prepare_for_mode for LOCAL_CONTAINER. + + Preservation: The torchserve build path already calls _prepare_for_mode() + for local modes. This must remain unchanged after the passthrough bugfix. + + **Validates: Requirements 3.2** + """ + mock_create.return_value = Mock() + + builder = ModelBuilder( + model=Mock(), + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.LOCAL_CONTAINER, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + builder.model_server = Mock() + builder.env_vars = {} + builder.shared_libs = [] + builder.dependencies = {} + builder.inference_spec = None + builder.model_path = None + + builder._build_for_torchserve() + + mock_prepare.assert_called_once() + + @patch.object(ModelBuilder, '_create_model') + @patch.object(ModelBuilder, '_prepare_for_mode') + def test_build_for_passthrough_sagemaker_endpoint_unchanged( + self, mock_prepare, mock_create + ): + """Test that SAGEMAKER_ENDPOINT passthrough is fully unchanged by the bugfix. + + Preservation: SAGEMAKER_ENDPOINT passthrough must continue to: + - Initialize secret_key to empty string + - NOT call _prepare_for_mode() + - Set s3_upload_path to None when no model_path + - Return the model from _create_model() + + **Validates: Requirements 3.1, 3.5** + """ + mock_model = Mock() + mock_create.return_value = mock_model + + builder = ModelBuilder( + image_uri="123456789.dkr.ecr.us-west-2.amazonaws.com/my-image:latest", + mode=Mode.SAGEMAKER_ENDPOINT, + role_arn="arn:aws:iam::123456789012:role/TestRole", + sagemaker_session=self._make_mock_session(), + ) + builder.model_path = None + + result = builder._build_for_passthrough() + + assert builder.secret_key == "" + assert builder.s3_upload_path is None + mock_prepare.assert_not_called() + mock_create.assert_called_once() + assert result == mock_model + diff --git a/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py index f7ab97e1f6..476fa8c1fc 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py @@ -699,9 +699,10 @@ def _show_llmaj_results( # Download base model aggregate results if both models exist base_aggregate = None + base_bedrock_job_name = None if not is_single_model and base_job_name: try: - base_aggregate, _ = _download_bedrock_aggregate_json( + base_aggregate, base_bedrock_job_name = _download_bedrock_aggregate_json( pipeline_execution, base_job_name ) logger.info(f"Successfully downloaded base model aggregate results") @@ -776,9 +777,9 @@ def _show_llmaj_results( ) # Download base model per-example results if base_job_name exists - if base_job_name and bedrock_job_name: + if base_job_name and base_bedrock_job_name: try: - base_results = _download_llmaj_results_from_s3(pipeline_execution, bedrock_job_name) + base_results = _download_llmaj_results_from_s3(pipeline_execution, base_bedrock_job_name) logger.info(f"Successfully downloaded {len(base_results)} base model per-example results") except FileNotFoundError as e: s3_path = pipeline_execution.s3_output_path if pipeline_execution.s3_output_path else "unknown" diff --git a/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py b/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py index 8b038fd974..ccad436b26 100644 --- a/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py +++ b/sagemaker-train/tests/unit/train/common_utils/test_show_results_utils.py @@ -1140,3 +1140,660 @@ def test_show_results_aggregate_not_found( # Verify per-example results were still attempted # Note: This will fail because bedrock_job_name is None, but that's expected behavior # The function should log a warning and continue + + +class TestBugConditionExploration: + """Bug condition exploration tests for the base model per-example results S3 path bug. + + **Validates: Requirements 1.1, 1.4, 2.1, 2.4** + + Property 1: Fault Condition - Base Model Per-Example Results Downloaded From Wrong S3 Path + + In a two-model LLM-as-Judge evaluation where custom and base models have distinct + bedrock_job_name values, _download_llmaj_results_from_s3 MUST be called with the + base model's bedrock_job_name for the base model download, not the custom model's. + + EXPECTED: These tests FAIL on unfixed code, confirming the bug exists. + The bug causes both calls to _download_llmaj_results_from_s3 to use + "custom-bedrock-job" instead of using "base-bedrock-job" for the base model. + """ + + @pytest.mark.parametrize( + "custom_scores,base_scores,description", + [ + ( + [1.0, 0.9, 0.8, 0.7, 0.6], + [0.3, 0.4, 0.2, 0.5, 0.1], + "custom_wins_all", + ), + ( + [0.2, 0.3, 0.1, 0.4, 0.15], + [0.9, 0.8, 0.95, 0.7, 0.85], + "base_wins_all", + ), + ( + [0.9, 0.3, 0.8, 0.2, 0.7], + [0.4, 0.8, 0.3, 0.9, 0.6], + "mixed_wins", + ), + ], + ids=["custom_wins", "base_wins", "mixed"], + ) + @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics") + @patch("sagemaker.train.common_utils.show_results_utils._display_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_base_model_per_example_uses_correct_bedrock_job_name( + self, + mock_console_class, + mock_extract_job, + mock_download_aggregate, + mock_download_results, + mock_calculate_win, + mock_display_win, + mock_display_aggregate, + mock_pipeline_execution, + custom_scores, + base_scores, + description, + ): + """Assert _download_llmaj_results_from_s3 is called with 'base-bedrock-job' for base model. + + On unfixed code, this FAILS because both calls use 'custom-bedrock-job'. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + # Return distinct job names for custom and base steps + mock_extract_job.side_effect = ["custom-training-job", "base-training-job"] + + # Return distinct bedrock_job_name values for custom and base aggregates + custom_aggregate = {"results": {"Metric1": {"score": 0.8, "total_evaluations": 5}}} + base_aggregate = {"results": {"Metric1": {"score": 0.5, "total_evaluations": 5}}} + mock_download_aggregate.side_effect = [ + ("custom_agg", "custom-bedrock-job"), + ("base_agg", "base-bedrock-job"), + ] + + # Build per-example result sets keyed by bedrock_job_name + def make_results(scores): + return [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "Metric1", "result": s}] + }, + } + for s in scores + ] + + custom_results = make_results(custom_scores) + base_results = make_results(base_scores) + + # Return different results depending on which bedrock_job_name is passed + def download_side_effect(pipeline_exec, bedrock_job_name): + if bedrock_job_name == "custom-bedrock-job": + return custom_results + elif bedrock_job_name == "base-bedrock-job": + return base_results + raise ValueError(f"Unexpected bedrock_job_name: {bedrock_job_name}") + + mock_download_results.side_effect = download_side_effect + + mock_calculate_win.return_value = { + "custom_wins": 0, "base_wins": 0, "ties": 0, "total": 0, + "custom_win_rate": 0.0, "base_win_rate": 0.0, "tie_rate": 0.0, + } + + # Execute + _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0) + + # CRITICAL ASSERTION: base model download must use "base-bedrock-job" + # On unfixed code, both calls use "custom-bedrock-job" — this assertion FAILS. + assert mock_download_results.call_count == 2 + download_calls = mock_download_results.call_args_list + # First call: custom model per-example results + assert download_calls[0] == call(mock_pipeline_execution, "custom-bedrock-job"), ( + f"Expected custom download with 'custom-bedrock-job', " + f"got {download_calls[0]}" + ) + # Second call: base model per-example results — MUST use base bedrock job name + assert download_calls[1] == call(mock_pipeline_execution, "base-bedrock-job"), ( + f"BUG CONFIRMED: base model download used '{download_calls[1]}' " + f"instead of call(mock_pipeline_execution, 'base-bedrock-job'). " + f"Both downloads used the custom model's bedrock_job_name." + ) + + @pytest.mark.parametrize( + "custom_scores,base_scores,description", + [ + ( + [1.0, 0.9, 0.8, 0.7, 0.6], + [0.3, 0.4, 0.2, 0.5, 0.1], + "custom_wins_all", + ), + ( + [0.2, 0.3, 0.1, 0.4, 0.15], + [0.9, 0.8, 0.95, 0.7, 0.85], + "base_wins_all", + ), + ( + [0.9, 0.3, 0.8, 0.2, 0.7], + [0.4, 0.8, 0.3, 0.9, 0.6], + "mixed_wins", + ), + ], + ids=["custom_wins", "base_wins", "mixed"], + ) + @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics") + @patch("sagemaker.train.common_utils.show_results_utils._display_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_calculate_win_rates_receives_distinct_datasets( + self, + mock_console_class, + mock_extract_job, + mock_download_aggregate, + mock_download_results, + mock_calculate_win, + mock_display_win, + mock_display_aggregate, + mock_pipeline_execution, + custom_scores, + base_scores, + description, + ): + """Assert _calculate_win_rates receives two genuinely distinct datasets. + + On unfixed code, this FAILS because both datasets come from the same S3 path + (custom model's bedrock_job_name), so _calculate_win_rates gets identical data. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + mock_extract_job.side_effect = ["custom-training-job", "base-training-job"] + + mock_download_aggregate.side_effect = [ + ("custom_agg", "custom-bedrock-job"), + ("base_agg", "base-bedrock-job"), + ] + + def make_results(scores): + return [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "Metric1", "result": s}] + }, + } + for s in scores + ] + + custom_results = make_results(custom_scores) + base_results = make_results(base_scores) + + def download_side_effect(pipeline_exec, bedrock_job_name): + if bedrock_job_name == "custom-bedrock-job": + return custom_results + elif bedrock_job_name == "base-bedrock-job": + return base_results + raise ValueError(f"Unexpected bedrock_job_name: {bedrock_job_name}") + + mock_download_results.side_effect = download_side_effect + + mock_calculate_win.return_value = { + "custom_wins": 0, "base_wins": 0, "ties": 0, "total": 0, + "custom_win_rate": 0.0, "base_win_rate": 0.0, "tie_rate": 0.0, + } + + _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0) + + # _calculate_win_rates must receive genuinely distinct datasets + mock_calculate_win.assert_called_once() + actual_custom, actual_base = mock_calculate_win.call_args[0] + + # On unfixed code, both arguments are custom_results (identical), + # so this assertion FAILS. + assert actual_custom is not actual_base or custom_scores == base_scores, ( + "BUG CONFIRMED: _calculate_win_rates received identical objects for " + "custom and base results — both downloaded from custom model's S3 path." + ) + + # Verify the actual score values differ (stronger check) + custom_first_score = actual_custom[0]["automatedEvaluationResult"]["scores"][0]["result"] + base_first_score = actual_base[0]["automatedEvaluationResult"]["scores"][0]["result"] + assert custom_first_score == custom_scores[0], ( + f"Custom results first score should be {custom_scores[0]}, got {custom_first_score}" + ) + assert base_first_score == base_scores[0], ( + f"BUG CONFIRMED: Base results first score is {base_first_score} " + f"(same as custom {custom_scores[0]}) instead of expected {base_scores[0]}. " + f"Base per-example results were downloaded from the custom model's S3 path." + ) + + + +class TestPreservationProperty: + """Preservation property tests for _show_llmaj_results behavior. + + **Validates: Requirements 3.1, 3.2, 3.3, 3.4, 3.5, 3.6** + + Property 2: Preservation - Non-Two-Model and Non-Win-Rate Behavior Unchanged + + These tests observe and assert the current behavior of _show_llmaj_results for + code paths that are NOT affected by the two-model win-rate bug. They must PASS + on both unfixed and fixed code, confirming no regressions. + """ + + # --- Requirement 3.1: Single-model evaluation displays aggregate + per-example, no win rates --- + + @pytest.mark.parametrize( + "result_count", + [1, 3, 5, 10], + ids=["1_result", "3_results", "5_results", "10_results"], + ) + @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics") + @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_single_model_no_win_rates( + self, + mock_console_class, + mock_extract_job, + mock_display_single, + mock_download_aggregate, + mock_download_results, + mock_calculate_win, + mock_display_aggregate, + mock_pipeline_execution, + result_count, + ): + """Single-model evaluation displays aggregate and per-example results without win rates. + + Observed behavior: When only custom_job_name exists (no base_job_name), + _calculate_win_rates is never called. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + # Only custom model, no base + mock_extract_job.side_effect = ["custom-job", None] + + custom_aggregate = {"results": {"Metric1": {"score": 0.9, "total_evaluations": result_count}}} + mock_download_aggregate.return_value = (custom_aggregate, "bedrock-job-custom") + + mock_results = [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "Metric1", "result": 0.9}] + }, + } + ] * result_count + mock_download_results.return_value = mock_results + + _show_llmaj_results(mock_pipeline_execution, limit=result_count, offset=0) + + # _calculate_win_rates must NOT be called for single-model evaluations + mock_calculate_win.assert_not_called() + # Aggregate metrics should be displayed with None for base + mock_display_aggregate.assert_called_once_with(custom_aggregate, None, mock_console) + # Per-example results should be displayed + assert mock_display_single.call_count == result_count + + # --- Requirement 3.2: Base aggregate FileNotFoundError -> custom-only results --- + + @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics") + @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_base_aggregate_not_found_displays_custom_only( + self, + mock_console_class, + mock_extract_job, + mock_display_single, + mock_download_aggregate, + mock_download_results, + mock_calculate_win, + mock_display_aggregate, + mock_pipeline_execution, + ): + """When base aggregate raises FileNotFoundError, function continues with custom-only. + + Observed behavior: base_aggregate stays None, _display_aggregate_metrics is called + with custom_aggregate and None for base. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + # Two-model evaluation + mock_extract_job.side_effect = ["custom-job", "base-job"] + + custom_aggregate = {"results": {"Metric1": {"score": 0.85, "total_evaluations": 5}}} + + # Custom aggregate succeeds, base aggregate raises FileNotFoundError + mock_download_aggregate.side_effect = [ + (custom_aggregate, "bedrock-job-custom"), + FileNotFoundError("Base aggregate not found in S3"), + ] + + mock_results = [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "Metric1", "result": 0.85}] + }, + } + ] * 3 + mock_download_results.return_value = mock_results + + # Should NOT raise — function continues gracefully + _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0) + + # Aggregate displayed with None for base + mock_display_aggregate.assert_called_once_with(custom_aggregate, None, mock_console) + # Per-example results still displayed + assert mock_display_single.call_count == 3 + + # --- Requirement 3.3: Pagination with limit and offset --- + + @pytest.mark.parametrize( + "limit,offset,total,expected_display_count", + [ + (3, 2, 10, 3), + (1, 0, 5, 1), + (5, 0, 3, 3), + (20, 5, 10, 5), + (2, 8, 10, 2), + (10, 0, 10, 10), + ], + ids=[ + "limit3_offset2_total10", + "limit1_offset0_total5", + "limit5_offset0_total3", + "limit20_offset5_total10", + "limit2_offset8_total10", + "limit10_offset0_total10", + ], + ) + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_pagination_display_count( + self, + mock_console_class, + mock_extract_job, + mock_display_single, + mock_download_aggregate, + mock_download_results, + mock_pipeline_execution, + limit, + offset, + total, + expected_display_count, + ): + """Pagination calls _display_single_llmaj_evaluation the correct number of times. + + Observed behavior: display count = min(limit, total - offset) when offset < total. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + mock_extract_job.side_effect = ["custom-job", None] + mock_download_aggregate.return_value = ({"results": {}}, "bedrock-job") + + mock_results = [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "M", "result": 0.5}] + }, + } + ] * total + mock_download_results.return_value = mock_results + + _show_llmaj_results(mock_pipeline_execution, limit=limit, offset=offset) + + assert mock_display_single.call_count == expected_display_count + + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_pagination_starts_at_correct_index( + self, + mock_console_class, + mock_extract_job, + mock_display_single, + mock_download_aggregate, + mock_download_results, + mock_pipeline_execution, + ): + """Pagination with limit=3, offset=2 on 10 results starts at index 2. + + Observed behavior: _display_single_llmaj_evaluation is called with indices 2, 3, 4. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + mock_extract_job.side_effect = ["custom-job", None] + mock_download_aggregate.return_value = ({"results": {}}, "bedrock-job") + + # Create 10 distinct results so we can verify indices + mock_results = [ + { + "inputRecord": {"prompt": f"[{{'role': 'user', 'content': 'Q{i}'}}]"}, + "modelResponses": [{"response": f"['A{i}']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "M", "result": float(i) / 10}] + }, + } + for i in range(10) + ] + mock_download_results.return_value = mock_results + + _show_llmaj_results(mock_pipeline_execution, limit=3, offset=2) + + assert mock_display_single.call_count == 3 + # Verify the indices passed to _display_single_llmaj_evaluation + for call_idx, expected_i in enumerate([2, 3, 4]): + actual_call = mock_display_single.call_args_list[call_idx] + # Args: (result, index, total, console, show_explanations=...) + assert actual_call[0][0] == mock_results[expected_i], ( + f"Call {call_idx}: expected result at index {expected_i}" + ) + assert actual_call[0][1] == expected_i, ( + f"Call {call_idx}: expected index arg {expected_i}, got {actual_call[0][1]}" + ) + + # --- Requirement 3.4: show_explanations passthrough --- + + @pytest.mark.parametrize( + "show_explanations", + [True, False], + ids=["explanations_on", "explanations_off"], + ) + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_show_explanations_passthrough( + self, + mock_console_class, + mock_extract_job, + mock_display_single, + mock_download_aggregate, + mock_download_results, + mock_pipeline_execution, + show_explanations, + ): + """show_explanations value is passed through to _display_single_llmaj_evaluation. + + Observed behavior: Each call to _display_single_llmaj_evaluation receives + show_explanations as a keyword argument matching the value passed to _show_llmaj_results. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + mock_extract_job.side_effect = ["custom-job", None] + mock_download_aggregate.return_value = ({"results": {}}, "bedrock-job") + + mock_results = [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "M", "result": 0.8}] + }, + } + ] * 3 + mock_download_results.return_value = mock_results + + _show_llmaj_results( + mock_pipeline_execution, limit=3, offset=0, show_explanations=show_explanations + ) + + assert mock_display_single.call_count == 3 + for c in mock_display_single.call_args_list: + assert c[1]["show_explanations"] == show_explanations, ( + f"Expected show_explanations={show_explanations}, got {c[1]}" + ) + + # --- Requirement 3.5: Per-example FileNotFoundError -> warning + aggregate display --- + + @pytest.mark.parametrize( + "has_aggregate", + [True, False], + ids=["with_aggregate", "without_aggregate"], + ) + @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics") + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._display_single_llmaj_evaluation") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_per_example_not_found_displays_aggregate_if_available( + self, + mock_console_class, + mock_extract_job, + mock_display_single, + mock_download_aggregate, + mock_download_results, + mock_display_aggregate, + mock_pipeline_execution, + has_aggregate, + ): + """When per-example results raise FileNotFoundError, aggregate metrics still display. + + Observed behavior: custom_results stays None, _display_single_llmaj_evaluation is + never called, but _display_aggregate_metrics is called if aggregate was downloaded. + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + mock_extract_job.side_effect = ["custom-job", None] + + if has_aggregate: + custom_aggregate = {"results": {"M": {"score": 0.9, "total_evaluations": 5}}} + mock_download_aggregate.return_value = (custom_aggregate, "bedrock-job") + else: + mock_download_aggregate.side_effect = FileNotFoundError("Aggregate not found") + + # Per-example download fails + mock_download_results.side_effect = FileNotFoundError("Per-example results not found") + + # Should NOT raise + _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0) + + # No per-example results displayed + mock_display_single.assert_not_called() + + if has_aggregate: + mock_display_aggregate.assert_called_once() + else: + mock_display_aggregate.assert_not_called() + + # --- Requirement 3.6: Genuinely identical data -> 100% ties is legitimate --- + # (This is tested via _calculate_win_rates directly in TestCalculateWinRates.test_calculate_ties) + # Here we verify the integration: when both models return identical per-example data, + # _calculate_win_rates IS called and its result is displayed. + + @patch("sagemaker.train.common_utils.show_results_utils._display_aggregate_metrics") + @patch("sagemaker.train.common_utils.show_results_utils._display_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._calculate_win_rates") + @patch("sagemaker.train.common_utils.show_results_utils._download_llmaj_results_from_s3") + @patch("sagemaker.train.common_utils.show_results_utils._download_bedrock_aggregate_json") + @patch("sagemaker.train.common_utils.show_results_utils._extract_training_job_name_from_steps") + @patch("rich.console.Console") + def test_identical_data_win_rates_still_calculated( + self, + mock_console_class, + mock_extract_job, + mock_download_aggregate, + mock_download_results, + mock_calculate_win, + mock_display_win, + mock_display_aggregate, + mock_pipeline_execution, + ): + """When both models have identical per-example data, win rates are still calculated. + + Observed behavior: _calculate_win_rates is called when both custom_results and + base_results are non-None, regardless of whether the data is identical. + This preserves the legitimate 100% ties case (Requirement 3.6). + """ + mock_console = MagicMock() + mock_console_class.return_value = mock_console + + mock_extract_job.side_effect = ["custom-job", "base-job"] + + mock_download_aggregate.side_effect = [ + ({"results": {"M": {"score": 0.8}}}, "same-bedrock-job"), + ({"results": {"M": {"score": 0.8}}}, "same-bedrock-job"), + ] + + identical_results = [ + { + "inputRecord": {"prompt": "[{'role': 'user', 'content': 'Q'}]"}, + "modelResponses": [{"response": "['A']"}], + "automatedEvaluationResult": { + "scores": [{"metricName": "M", "result": 0.8}] + }, + } + ] * 5 + # Both calls return the same data (this is the current buggy behavior AND + # the legitimate case when models genuinely tie) + mock_download_results.return_value = identical_results + + win_rates = { + "custom_wins": 0, "base_wins": 0, "ties": 5, "total": 5, + "custom_win_rate": 0.0, "base_win_rate": 0.0, "tie_rate": 1.0, + } + mock_calculate_win.return_value = win_rates + + _show_llmaj_results(mock_pipeline_execution, limit=5, offset=0) + + # Win rates should still be calculated and displayed + mock_calculate_win.assert_called_once() + mock_display_win.assert_called_once_with(win_rates, mock_console) From 7a13971fb5c5c7daaaefb31fa18470ab57020dcc Mon Sep 17 00:00:00 2001 From: Gokul A Date: Thu, 5 Mar 2026 13:03:18 -0800 Subject: [PATCH 2/4] fix(model_builder): Only set s3_upload_path for S3 URIs in passthrough In _build_for_passthrough(), model_path could be a local /tmp path. Setting s3_upload_path to a local path caused CreateModel API to reject the modelDataUrl with a validation error since it requires s3:// or https:// URIs. Now only S3 URIs are assigned to s3_upload_path; local paths are handled separately by _prepare_for_mode() in LOCAL_CONTAINER mode. --- sagemaker-serve/src/sagemaker/serve/model_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py index 01463729ac..c2ba0c36eb 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py @@ -1289,10 +1289,10 @@ def _build_for_passthrough(self) -> Model: self.secret_key = "" - if not self.model_path: - self.s3_upload_path = None - else: + if self.model_path and self.model_path.startswith("s3://"): self.s3_upload_path = self.model_path + else: + self.s3_upload_path = None if self.mode in LOCAL_MODES: self._prepare_for_mode() From 78cff41736e99ff08c022dcfb847d91ef70d3a28 Mon Sep 17 00:00:00 2001 From: Gokul A Date: Thu, 5 Mar 2026 15:56:31 -0800 Subject: [PATCH 3/4] Test fixes --- .../src/sagemaker/train/evaluate/pipeline_templates.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py index b14145ff31..3848ef0d5c 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py @@ -1028,9 +1028,6 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, - "ModelPackageConfig": { - "ModelPackageGroupArn": "{{ model_package_group_arn }}" }{% if dataset_uri %}, "InputDataConfig": [ { @@ -1182,10 +1179,6 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, - "ModelPackageConfig": { - "ModelPackageGroupArn": "{{ model_package_group_arn }}", - "SourceModelPackageArn": "{{ source_model_package_arn }}" } } },{% endif %} From c1d8d1a0258bb8b248f97ef200abf3e95c4b912c Mon Sep 17 00:00:00 2001 From: Gokul A Date: Fri, 6 Mar 2026 12:03:04 -0800 Subject: [PATCH 4/4] Bug fix 3 and 4 --- .../modules/local_core/local_container.py | 20 ++- .../sagemaker/train/local/local_container.py | 20 ++- .../tests/unit/train/test_defaults.py | 170 ++++++++++++++++++ 3 files changed, 202 insertions(+), 8 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py index 52624c7216..43a5ca502d 100644 --- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py +++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py @@ -17,6 +17,7 @@ import os import re import shutil +import stat import subprocess from tempfile import TemporaryDirectory from typing import Any, Dict, List, Optional @@ -57,6 +58,17 @@ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" +def _rmtree(path): + """Remove a directory tree, handling root-owned files from Docker containers.""" + def _onerror(func, path, exc_info): + if isinstance(exc_info[1], PermissionError): + os.chmod(path, stat.S_IRWXU) + func(path) + else: + raise exc_info[1] + shutil.rmtree(path, onerror=_onerror) + + class _LocalContainer(BaseModel): """A local training job class for local mode model trainer. @@ -209,12 +221,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - shutil.rmtree(os.path.join(self.container_root, "input")) - shutil.rmtree(os.path.join(self.container_root, "shared")) + _rmtree(os.path.join(self.container_root, "input")) + _rmtree(os.path.join(self.container_root, "shared")) for host in self.hosts: - shutil.rmtree(os.path.join(self.container_root, host)) + _rmtree(os.path.join(self.container_root, host)) for folder in self._temporary_folders: - shutil.rmtree(os.path.join(self.container_root, folder)) + _rmtree(os.path.join(self.container_root, folder)) return artifacts def retrieve_artifacts( diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py index 89cf11c873..bcf84395ce 100644 --- a/sagemaker-train/src/sagemaker/train/local/local_container.py +++ b/sagemaker-train/src/sagemaker/train/local/local_container.py @@ -18,6 +18,7 @@ import os import re import shutil +import stat import subprocess from tempfile import TemporaryDirectory from typing import Any, Dict, List, Optional @@ -65,6 +66,17 @@ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" +def _rmtree(path): + """Remove a directory tree, handling root-owned files from Docker containers.""" + def _onerror(func, path, exc_info): + if isinstance(exc_info[1], PermissionError): + os.chmod(path, stat.S_IRWXU) + func(path) + else: + raise exc_info[1] + shutil.rmtree(path, onerror=_onerror) + + class _LocalContainer(BaseModel): """A local training job class for local mode model trainer. @@ -217,12 +229,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - shutil.rmtree(os.path.join(self.container_root, "input")) - shutil.rmtree(os.path.join(self.container_root, "shared")) + _rmtree(os.path.join(self.container_root, "input")) + _rmtree(os.path.join(self.container_root, "shared")) for host in self.hosts: - shutil.rmtree(os.path.join(self.container_root, host)) + _rmtree(os.path.join(self.container_root, host)) for folder in self._temporary_folders: - shutil.rmtree(os.path.join(self.container_root, folder)) + _rmtree(os.path.join(self.container_root, folder)) return artifacts def retrieve_artifacts( diff --git a/sagemaker-train/tests/unit/train/test_defaults.py b/sagemaker-train/tests/unit/train/test_defaults.py index 848e46b2c5..bf71e2247e 100644 --- a/sagemaker-train/tests/unit/train/test_defaults.py +++ b/sagemaker-train/tests/unit/train/test_defaults.py @@ -25,6 +25,7 @@ DEFAULT_MAX_RUNTIME_IN_SECONDS, ) from sagemaker.train.configs import Compute, StoppingCondition +from sagemaker.core.shapes import InstanceGroup class TestDefaultConstants: @@ -435,3 +436,172 @@ def test_uses_default_volume_size_when_not_in_document( ) assert result.volume_size_in_gb == DEFAULT_VOLUME_SIZE + + + def test_does_not_set_instance_type_when_instance_groups_configured(self): + """Test instance_type is not overwritten when instance_groups are set.""" + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=30, + ) + result = TrainDefaults.get_compute(compute=compute) + assert result.instance_type is None + + def test_does_not_set_instance_count_when_instance_groups_configured(self): + """Test instance_count is not overwritten when instance_groups are set.""" + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=30, + ) + result = TrainDefaults.get_compute(compute=compute) + assert result.instance_count is None + + def test_sets_volume_size_when_instance_groups_configured(self): + """Test volume_size_in_gb is still set when instance_groups are configured.""" + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=None, + ) + result = TrainDefaults.get_compute(compute=compute) + assert result.volume_size_in_gb == DEFAULT_VOLUME_SIZE + + def test_preserves_existing_volume_size_with_instance_groups(self): + """Test existing volume_size_in_gb is preserved when instance_groups are configured.""" + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=100, + ) + result = TrainDefaults.get_compute(compute=compute) + assert result.volume_size_in_gb == 100 + + +class TestJumpStartTrainDefaultsGetComputeHeterogeneousCluster: + """Test JumpStartTrainDefaults.get_compute with heterogeneous cluster (instance_groups).""" + + @patch("sagemaker.train.defaults.get_hub_content_and_document") + @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session") + def test_does_not_set_instance_type_when_instance_groups_configured( + self, mock_get_session, mock_get_hub_content + ): + """Test instance_type is not overwritten when instance_groups are set.""" + mock_session = MagicMock() + mock_get_session.return_value = mock_session + + mock_document = MagicMock() + mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge" + mock_document.TrainingVolumeSize = 100 + mock_get_hub_content.return_value = (None, mock_document) + + mock_config = MagicMock() + mock_config.training_config_name = None + + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=30, + ) + result = JumpStartTrainDefaults.get_compute( + jumpstart_config=mock_config, + compute=compute, + sagemaker_session=mock_session, + ) + assert result.instance_type is None + + @patch("sagemaker.train.defaults.get_hub_content_and_document") + @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session") + def test_does_not_set_instance_count_when_instance_groups_configured( + self, mock_get_session, mock_get_hub_content + ): + """Test instance_count is not overwritten when instance_groups are set.""" + mock_session = MagicMock() + mock_get_session.return_value = mock_session + + mock_document = MagicMock() + mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge" + mock_document.TrainingVolumeSize = 100 + mock_get_hub_content.return_value = (None, mock_document) + + mock_config = MagicMock() + mock_config.training_config_name = None + + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=30, + ) + result = JumpStartTrainDefaults.get_compute( + jumpstart_config=mock_config, + compute=compute, + sagemaker_session=mock_session, + ) + assert result.instance_count is None + + @patch("sagemaker.train.defaults.get_hub_content_and_document") + @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session") + def test_sets_volume_size_from_document_when_instance_groups_configured( + self, mock_get_session, mock_get_hub_content + ): + """Test volume_size_in_gb is set from document even when instance_groups are configured.""" + mock_session = MagicMock() + mock_get_session.return_value = mock_session + + mock_document = MagicMock() + mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge" + mock_document.TrainingVolumeSize = 100 + mock_get_hub_content.return_value = (None, mock_document) + + mock_config = MagicMock() + mock_config.training_config_name = None + + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=None, + ) + result = JumpStartTrainDefaults.get_compute( + jumpstart_config=mock_config, + compute=compute, + sagemaker_session=mock_session, + ) + assert result.volume_size_in_gb == 100 + + @patch("sagemaker.train.defaults.get_hub_content_and_document") + @patch("sagemaker.train.defaults.TrainDefaults.get_sagemaker_session") + def test_sets_default_volume_size_when_instance_groups_and_no_document_volume( + self, mock_get_session, mock_get_hub_content + ): + """Test DEFAULT_VOLUME_SIZE is used when instance_groups set and document has no volume.""" + mock_session = MagicMock() + mock_get_session.return_value = mock_session + + mock_document = MagicMock() + mock_document.DefaultTrainingInstanceType = "ml.p3.2xlarge" + mock_document.TrainingVolumeSize = None + mock_get_hub_content.return_value = (None, mock_document) + + mock_config = MagicMock() + mock_config.training_config_name = None + + compute = Compute( + instance_groups=[InstanceGroup(instance_type="ml.p3.2xlarge", instance_count=1, instance_group_name="group1")], + instance_type=None, + instance_count=None, + volume_size_in_gb=None, + ) + result = JumpStartTrainDefaults.get_compute( + jumpstart_config=mock_config, + compute=compute, + sagemaker_session=mock_session, + ) + assert result.volume_size_in_gb == DEFAULT_VOLUME_SIZE