From df6bde33c04acb1d02a2c77ffe8ddc43c2254ba8 Mon Sep 17 00:00:00 2001 From: bwd Date: Thu, 1 May 2025 16:55:18 +0000 Subject: [PATCH 1/7] =?UTF-8?q?=F0=9F=AA=B3providers=20dont=20return=20jso?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .devcontainer/image/stagefiles/layer.sh | 4 ++- src/evaluation_instruments/_evaluation.py | 29 +++++++++++++-------- src/evaluation_instruments/post/__init__.py | 23 +++++++++++++++- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/.devcontainer/image/stagefiles/layer.sh b/.devcontainer/image/stagefiles/layer.sh index 4024fa8..ffc8c3c 100644 --- a/.devcontainer/image/stagefiles/layer.sh +++ b/.devcontainer/image/stagefiles/layer.sh @@ -14,7 +14,9 @@ apt-get install --no-install-recommends --yes --quiet \ graphviz \ libgraphviz-dev \ pandoc \ - virtualenv + virtualenv \ + ca-certificates + # Make vim tiny default ln -sfn /usr/bin/vi /usr/bin/vim diff --git a/src/evaluation_instruments/_evaluation.py b/src/evaluation_instruments/_evaluation.py index 3944acd..c5ffe64 100644 --- a/src/evaluation_instruments/_evaluation.py +++ b/src/evaluation_instruments/_evaluation.py @@ -66,7 +66,9 @@ def __init__( self.log_enabled = log_enabled self._model_args = model_args or {} - self.capacity = TokenUsage(None, None, max_tokens) + + self.tmp_dir: Optional[Path] = None + self.capacity: TokenUsage = TokenUsage(None, None, max_tokens) logger.debug(f"Set up with {log_enabled=} and capacity {max_tokens}") @@ -142,9 +144,7 @@ def run_dataset(self, df: "pd.DataFrame", model: str = None, capacity: int = Non response, usage = self._post_fn(sample_ix, raw_output) accumulated_usage += TokenUsage(**usage) - tmp_dir = self._dump_to_temp(sample_ix=sample_ix, raw_content=raw_output) outputs[sample_ix] = response - logger.debug(f"{sample_ix}-Completed evaluation") # abort if beyond capacity @@ -152,7 +152,7 @@ def run_dataset(self, df: "pd.DataFrame", model: str = None, capacity: int = Non logger.warning(f"Aborting run after {sample_ix}. Capacity exceeded: {accumulated_usage} > {max_usage}") break - if tmp_dir is not None: + if self.tmp_dir is not None: logger.info(f"Dumped raw content to {tmp_dir}") return outputs, accumulated_usage @@ -178,20 +178,21 @@ def _dump_to_temp(self, sample_ix, raw_content) -> Optional[Path]: return None datestamp = datetime.now().strftime("%Y%m%d-%Hh") # Generate a timestamp in the format YYYYMMDD-hhmm - tmp_dir = Path(tempfile.gettempdir()) / "evaluation_logs" / f"{datestamp}" - tmp_dir.mkdir(parents=True, exist_ok=True) + + if self.tmp_dir is None: + tmp_dir = Path(tempfile.gettempdir()) / "evaluation_logs" / f"{datestamp}" + tmp_dir.mkdir(parents=True, exist_ok=True) + self.tmp_dir = tmp_dir timestamp = datetime.now().strftime("%H%M%S") # Generate a timestamp in the format hhmmss - filepath = tmp_dir / f"{sample_ix}_raw_{timestamp}.json" + filepath = self.tmp_dir / f"{sample_ix}_raw_{timestamp}.json" with open(filepath, "w") as f: # Write the raw content to the file if isinstance(raw_content, dict): json.dump(raw_content, f) else: - f.write(raw_content) - - return tmp_dir + f.write(str(raw_content)) def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, TokenUsage]: """ @@ -214,9 +215,13 @@ def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, Toke as well as the usage information from response['usage']. """ ix = 0 # assume N=1 + try: # Many providers have their own response objects, try to convert + openai_json = openai_json.dict() + except AttributeError: + pass + try: raw_content = openai_json["choices"][ix]["message"]["content"] - # Assume no nesting {} response = json.loads(raw_content[raw_content.find("{") : raw_content.find("}") + 1]) # noqa: E203 except Exception: @@ -224,6 +229,8 @@ def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, Toke response = {} usage = openai_json.get("usage", {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}) + + self._dump_to_temp(sample_ix=sample_ix, raw_content=openai_json) return response, usage diff --git a/src/evaluation_instruments/post/__init__.py b/src/evaluation_instruments/post/__init__.py index ca43f8e..75d1094 100644 --- a/src/evaluation_instruments/post/__init__.py +++ b/src/evaluation_instruments/post/__init__.py @@ -2,10 +2,31 @@ def frame_from_evals(full_output: dict, criteria_outputs: list[str] = None) -> pd.DataFrame: + """ + Convert the full output of the evaluation into a DataFrame. + + Useful for when the evaluation returns multiple outputs per criteria, such as a score plus evidence. + + Parameters + ---------- + full_output : dict + The full output of the evaluation, typically a dictionary with keys as the primary keys and values as dictionaries + containing the criteria and their respective outputs. + criteria_outputs : list[str], optional + The names of the outputs to include in the DataFrame. If not provided, defaults to ["evidence", "score"]. + Specify [] or a single valued list to return a DataFrame with only one level, the criteria names. + + Returns + ------- + pd.DataFrame + _description_ + """ if not full_output: return pd.DataFrame() - criteria_outputs = criteria_outputs or ["evidence", "score"] + criteria_outputs = criteria_outputs if criteria_outputs is not None else ["evidence", "score"] + if len(criteria_outputs) < 2: + return pd.DataFrame.from_dict(full_output, orient="index") reformatted = {} for pk, criteria in full_output.items(): From b806870778dda0848721dfddb7590ed4bb1df042 Mon Sep 17 00:00:00 2001 From: bwd Date: Thu, 1 May 2025 17:31:58 +0000 Subject: [PATCH 2/7] test and log --- changelog/5.fix.rst | 2 ++ tests/test_evaluation.py | 77 +++++++++++++++++++++++++++++++--------- tests/test_post.py | 24 +++++++++++++ 3 files changed, 87 insertions(+), 16 deletions(-) create mode 100644 changelog/5.fix.rst diff --git a/changelog/5.fix.rst b/changelog/5.fix.rst new file mode 100644 index 0000000..6a4e64c --- /dev/null +++ b/changelog/5.fix.rst @@ -0,0 +1,2 @@ +Update default postproccessing to handle the objects returned by more providers. +Update the results transform to handle single score objects. diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 51f1627..6b292b0 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -7,24 +7,54 @@ from evaluation_instruments._evaluation import Evaluation from evaluation_instruments.model import TokenUsage - -def fake_completion(prompt, **kwargs): - """Fake completion function to simulate OpenAI API response.""" +def example_dict(): return { - "choices": [{"message": {"content": json.dumps({"result": "success"})}}], - "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, - } + "choices": [{"message": {"content": json.dumps({"result": "success"})}}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, + } + +class CompletionObj: + def __init__(self, response_dict): + self.response_dict = response_dict + def dict(self): + return self.response_dict + + def json(self): + return json.dumps(self.response_dict) + +@pytest.fixture +def sample_evaluation_obj(): + """ + Create a basic evaluation instance with mock functions + Returns an object with a dict method, like Openai + """ + prep_fn = MagicMock(return_value="test prompt") + completion_fn = MagicMock( + return_value=CompletionObj(example_dict()) + ) + post_fn = MagicMock( + return_value=({"result": "success"}, {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}) + ) + + return Evaluation( + prep_fn=prep_fn, + completion_fn=completion_fn, + post_process_fn=post_fn, + log_enabled=False, + model_args={}, + max_tokens=1000, + ) @pytest.fixture def sample_evaluation(): - """Create a basic evaluation instance with mock functions.""" + """ + Create a basic evaluation instance with mock functions. + Directly returns a dictionary + """ prep_fn = MagicMock(return_value="test prompt") completion_fn = MagicMock( - return_value={ - "choices": [{"message": {"content": '{"result": "success"}'}}], - "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, - } + return_value=example_dict() ) post_fn = MagicMock( return_value=({"result": "success"}, {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}) @@ -42,10 +72,7 @@ def sample_evaluation(): def static_completion(prompt, **kwargs): """Fake completion function to simulate OpenAI API response.""" - return { - "choices": [{"message": {"content": json.dumps({"result": "success"})}}], - "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, - } + return example_dict() def static_prep(sample): @@ -115,7 +142,7 @@ def test_toggle_logging(self, initial_state, sample_evaluation): assert result == (not initial_state) def test_run_dataset(self, sample_evaluation): - """Test the run_dataset method processes samples correctly.""" + """Test that run_object returns the expected output.""" # Create a mock DataFrame df = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]}) @@ -132,6 +159,24 @@ def test_run_dataset(self, sample_evaluation): assert all(v == {"result": "success"} for v in outputs.values()) assert usage == TokenUsage(20, 10, 30) + def test_run_dataset(self, sample_evaluation_obj): + """Test that run_object returns the expected output.""" + # Create a mock DataFrame + df = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]}) + + # Run the dataset + outputs, usage = sample_evaluation_obj.run_dataset(df) + + # Check that prep_fn, completion_fn, and post_fn were called for each sample + assert sample_evaluation_obj._prep_fn.call_count == 2 + assert sample_evaluation_obj._completion_fn.call_count == 2 + assert sample_evaluation_obj._post_fn.call_count == 2 + + # Check outputs and usage + assert len(outputs) == 2 + assert all(v == {"result": "success"} for v in outputs.values()) + assert usage == TokenUsage(20, 10, 30) + def test_run_dataset_empty(self, sample_evaluation, caplog): """Test the run_dataset method processes samples correctly.""" diff --git a/tests/test_post.py b/tests/test_post.py index b0fe789..a72994d 100644 --- a/tests/test_post.py +++ b/tests/test_post.py @@ -89,3 +89,27 @@ def test_frame_from_evals_single_item(): # Verify exact match with expected DataFrame assert_frame_equal(result_df, expected_df) + +def test_frame_from_evals_single_score_multiple_items(): + """Test frame_from_evals with multiple items but only single score values (no evidence).""" + # Sample with single score per criteria, multiple items + eval_output = { + "sample1": { + "criteria1": 4, + "criteria2": 3, + }, + "sample2": { + "criteria1": 5, + "criteria2": 2, + }, + } + expected_df = pd.DataFrame( + {"criteria1": [4, 5], "criteria2": [3, 2]}, + index=["sample1", "sample2"] + ) + + # Act + result_df = frame_from_evals(eval_output, []) + + # Verify exact match with expected DataFrame + assert_frame_equal(result_df, expected_df) From 28134e4baa871c9f3e707316e18884d4c719fcac Mon Sep 17 00:00:00 2001 From: bwd Date: Thu, 1 May 2025 17:38:12 +0000 Subject: [PATCH 3/7] have completion_fn use all kwargs --- src/evaluation_instruments/_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluation_instruments/_evaluation.py b/src/evaluation_instruments/_evaluation.py index c5ffe64..c383ccb 100644 --- a/src/evaluation_instruments/_evaluation.py +++ b/src/evaluation_instruments/_evaluation.py @@ -139,7 +139,7 @@ def run_dataset(self, df: "pd.DataFrame", model: str = None, capacity: int = Non prompt = self.prep_fn(sample) # Delegate - raw_output = self.completion_fn(model, prompt, **self._model_args) + raw_output = self.completion_fn(model=model, messages=prompt, **self._model_args) response, usage = self._post_fn(sample_ix, raw_output) accumulated_usage += TokenUsage(**usage) From 4fbe26d6597cfca82ce880df51fae8e3d13b84a7 Mon Sep 17 00:00:00 2001 From: bwd Date: Thu, 1 May 2025 17:48:20 +0000 Subject: [PATCH 4/7] bad fragment type --- changelog/{5.fix.rst => 7.bugfix.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename changelog/{5.fix.rst => 7.bugfix.rst} (100%) diff --git a/changelog/5.fix.rst b/changelog/7.bugfix.rst similarity index 100% rename from changelog/5.fix.rst rename to changelog/7.bugfix.rst From 41d08caaef2198161f4d7c5589aab0e23d1ba87d Mon Sep 17 00:00:00 2001 From: bwd Date: Thu, 1 May 2025 18:22:12 +0000 Subject: [PATCH 5/7] more robust handling --- src/evaluation_instruments/_evaluation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/evaluation_instruments/_evaluation.py b/src/evaluation_instruments/_evaluation.py index c383ccb..6950fce 100644 --- a/src/evaluation_instruments/_evaluation.py +++ b/src/evaluation_instruments/_evaluation.py @@ -216,10 +216,13 @@ def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, Toke """ ix = 0 # assume N=1 try: # Many providers have their own response objects, try to convert - openai_json = openai_json.dict() + openai_json = openai_json.json() except AttributeError: pass + if isinstance(openai_json, str): + openai_json = json.loads(openai_json) + try: raw_content = openai_json["choices"][ix]["message"]["content"] # Assume no nesting {} From 7aefe6487357a552b9608beb72ec2e67be2190bb Mon Sep 17 00:00:00 2001 From: bwd Date: Fri, 2 May 2025 14:27:39 +0000 Subject: [PATCH 6/7] =?UTF-8?q?=F0=9F=93=A6automate=20a=20path=20to=20keep?= =?UTF-8?q?=20keys=20in=20secrets=20&=20env?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .devcontainer/devcontainer.json | 2 +- .devcontainer/docker-compose.yml | 3 +++ .devcontainer/image/stagefiles/layer.sh | 8 ++++++++ .devcontainer/image/stagefiles/python/requirements.txt | 1 + .devcontainer/post-create.sh | 1 + 5 files changed, 14 insertions(+), 1 deletion(-) create mode 100755 .devcontainer/post-create.sh diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 5975fb6..548d5dd 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -44,7 +44,7 @@ // Uncomment the next line if you want to keep your containers running after VS Code shuts down. // "shutdownAction": "none", // The next line to run commands after the container is created, it auto-installs precommit hooks and the library - // "postCreateCommand": "echo \"\\nif [ -f /home/seismo/workspace/.devcontainer/.bashrc ]; then\\n . /home/seismo/workspace/.devcontainer/.bashrc;\\nfi\" >> $HOME/.bashrc", + "postCreateCommand": "/home/seismo/workspace/.devcontainer/post-create.sh", // Uncomment the next line to have VS Code connect as an existing non-root user in the container. See // https://aka.ms/vscode-remote/containers/non-root for details on adding a non-root user if none exist. //"remoteUser": "seismo" diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 93b536b..5433bf8 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -11,6 +11,9 @@ services: # Update this to wherever you want VS Code to mount the folder of your project - ..:/home/seismo/workspace + # Mount certificates read-only + - /usr/local/share/ca-certificates:/usr/local/share/ca-certificates:ro + # [From Extension] Forwards the local Docker socket to the container. - /var/run/docker.sock:/var/run/docker.sock diff --git a/.devcontainer/image/stagefiles/layer.sh b/.devcontainer/image/stagefiles/layer.sh index ffc8c3c..15dbeab 100644 --- a/.devcontainer/image/stagefiles/layer.sh +++ b/.devcontainer/image/stagefiles/layer.sh @@ -26,3 +26,11 @@ apt-get autoremove -y apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Append to seismo user's .bashrc to source secrets/.env if it exists +echo ' +# Source secrets/.env if it exists +if [ -f "$HOME/workspace/secrets/.env" ]; then + source "$HOME/workspace/secrets/.env" +fi +' >> /home/seismo/.bashrc diff --git a/.devcontainer/image/stagefiles/python/requirements.txt b/.devcontainer/image/stagefiles/python/requirements.txt index 5d077d6..5ff268d 100644 --- a/.devcontainer/image/stagefiles/python/requirements.txt +++ b/.devcontainer/image/stagefiles/python/requirements.txt @@ -1,4 +1,5 @@ #seismometer #litellm==1.65.4 +pandas==2.2.3 jupyterlab==4.4.1 pre-commit==4.2.0 diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh new file mode 100755 index 0000000..8cbd569 --- /dev/null +++ b/.devcontainer/post-create.sh @@ -0,0 +1 @@ +pip install -e . From e463c0b8e0b0d7bf5491bac0290038eef0355814 Mon Sep 17 00:00:00 2001 From: bwd Date: Fri, 2 May 2025 16:53:08 +0000 Subject: [PATCH 7/7] use pathlib, os-agnostic --- changelog/7.bugfix.rst | 1 + .../prep/data_handler.py | 14 ++-- tests/test_prep.py | 64 ++++++++----------- 3 files changed, 35 insertions(+), 44 deletions(-) diff --git a/changelog/7.bugfix.rst b/changelog/7.bugfix.rst index 6a4e64c..33aa613 100644 --- a/changelog/7.bugfix.rst +++ b/changelog/7.bugfix.rst @@ -1,2 +1,3 @@ Update default postproccessing to handle the objects returned by more providers. Update the results transform to handle single score objects. +Update decorator to use pathlib not os diff --git a/src/evaluation_instruments/prep/data_handler.py b/src/evaluation_instruments/prep/data_handler.py index 4c5c47d..3a5891a 100644 --- a/src/evaluation_instruments/prep/data_handler.py +++ b/src/evaluation_instruments/prep/data_handler.py @@ -1,8 +1,8 @@ import json import logging -import os from functools import wraps from typing import Callable, Optional +from pathlib import Path logger = logging.getLogger("evaluation") @@ -38,14 +38,18 @@ def wrapped(sample: "namedtuple"): """ # Get the file path from the namedtuple using the key - filename = getattr(sample, namedtuple_key) + ".json" + filename = Path(getattr(sample, namedtuple_key)) + if not filename: + return fn(filename) + + filename = filename.with_suffix(".json") if data_path: - filename = os.path.join(data_path, filename) + filename = Path(data_path) / filename - if not filename or not os.path.isfile(filename): + if not filename.is_file(): raw_json = {} else: # Open the file and read its contents - with open(filename, "r") as file: + with filename.open("r") as file: raw_json = json.load(file) # Call the original function with the file contents diff --git a/tests/test_prep.py b/tests/test_prep.py index c6f4b7b..44d0145 100644 --- a/tests/test_prep.py +++ b/tests/test_prep.py @@ -1,6 +1,7 @@ import json from collections import namedtuple from unittest.mock import mock_open, patch +from pathlib import PureWindowsPath, PurePosixPath import pytest @@ -10,63 +11,48 @@ class TestJsonFromColumnDecorator: """Tests for the json_from_column decorator functionality.""" - def test_basic_functionality(self): - """Test basic json_from_column decorator functionality.""" - - # Create a test function to decorate - @undertest.json_from_column(namedtuple_key="file_id") - def test_fn(json_data): - return json_data + @pytest.mark.parametrize("data_path",[ + "data", + "subdir/data", + "subdir\\data", + PurePosixPath("subdir/data"), + PureWindowsPath("subdir\\data"), + ]) + def test_with_data_path(self, data_path, tmp_path): + """Test json_from_column with a data_path.""" + expected_data = {"testkey": "value"} + expected_file = (tmp_path / data_path).with_suffix(".json") + (expected_file.parent).mkdir(parents=True, exist_ok=True) + with expected_file.open("w") as f: + f.write(json.dumps(expected_data)) - # Create a mock sample with file_id + # Create a mock sample Sample = namedtuple("Sample", ["file_id"]) - sample = Sample(file_id="test_id") - - # Mock the open function to return a specific JSON - test_json = {"key": "value"} - m = mock_open(read_data=json.dumps(test_json)) + sample = Sample(file_id=data_path) - with patch("builtins.open", m), patch("os.path.isfile", return_value=True): - result = test_fn(sample) - assert result == test_json - m.assert_called_once_with("test_id.json", "r") - - def test_with_data_path(self): - """Test json_from_column with a data_path.""" + # Act # Create a test function with data path - @undertest.json_from_column(namedtuple_key="file_id", data_path="/data") + @undertest.json_from_column(namedtuple_key="file_id", data_path=tmp_path) def test_fn(json_data): return json_data + result = test_fn(sample) - # Create a mock sample - Sample = namedtuple("Sample", ["file_id"]) - sample = Sample(file_id="test_id") - - # Mock the open function - test_json = {"key": "value"} - m = mock_open(read_data=json.dumps(test_json)) - - with patch("builtins.open", m), patch("os.path.isfile", return_value=True), patch( - "os.path.join", return_value="/data/test_id.json" - ): - result = test_fn(sample) - assert result == test_json - m.assert_called_once_with("/data/test_id.json", "r") + # Assert + assert result == {"testkey": "value"} def test_file_not_found(self): """Test json_from_column when the file is not found.""" - @undertest.json_from_column(namedtuple_key="file_id") + @undertest.json_from_column(namedtuple_key="file_id", data_path="nonexistent_path") def test_fn(json_data): return json_data Sample = namedtuple("Sample", ["file_id"]) sample = Sample(file_id="nonexistent_id") - with patch("os.path.isfile", return_value=False): - result = test_fn(sample) - assert result == {} + result = test_fn(sample) + assert result == {} def test_missing_key(self): """Test json_from_column raises ValueError if namedtuple_key is not provided."""