epic-open-source · diehlbw · May 2, 2025 · May 1, 2025 · May 1, 2025 · May 1, 2025
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -44,7 +44,7 @@
     // Uncomment the next line if you want to keep your containers running after VS Code shuts down.
     // "shutdownAction": "none",
     // The next line to run commands after the container is created, it auto-installs precommit hooks and the library
-    // "postCreateCommand": "echo \"\\nif [ -f /home/seismo/workspace/.devcontainer/.bashrc ]; then\\n  . /home/seismo/workspace/.devcontainer/.bashrc;\\nfi\" >> $HOME/.bashrc",
+    "postCreateCommand": "/home/seismo/workspace/.devcontainer/post-create.sh",
     // Uncomment the next line to have VS Code connect as an existing non-root user in the container. See
     // https://aka.ms/vscode-remote/containers/non-root for details on adding a non-root user if none exist.
     //"remoteUser": "seismo"

diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
@@ -11,6 +11,9 @@ services:
       # Update this to wherever you want VS Code to mount the folder of your project
       - ..:/home/seismo/workspace
 
+      # Mount certificates read-only
+      - /usr/local/share/ca-certificates:/usr/local/share/ca-certificates:ro
+
       # [From Extension] Forwards the local Docker socket to the container.
       - /var/run/docker.sock:/var/run/docker.sock
 

diff --git a/.devcontainer/image/stagefiles/layer.sh b/.devcontainer/image/stagefiles/layer.sh
@@ -14,7 +14,9 @@ apt-get install --no-install-recommends --yes --quiet \
     graphviz \
     libgraphviz-dev \
     pandoc \
-    virtualenv
+    virtualenv \
+    ca-certificates
+
 
 # Make vim tiny default
 ln -sfn /usr/bin/vi /usr/bin/vim
@@ -24,3 +26,11 @@ apt-get autoremove -y
 apt-get clean
 
 rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Append to seismo user's .bashrc to source secrets/.env if it exists
+echo '
+# Export every variable defined in secrets/.env (if present) so child processes can read them
+if [ -f "$HOME/workspace/secrets/.env" ]; then
+    set -a; source "$HOME/workspace/secrets/.env"; set +a
+fi
+' >> /home/seismo/.bashrc
diff --git a/.devcontainer/image/stagefiles/python/requirements.txt b/.devcontainer/image/stagefiles/python/requirements.txt
@@ -1,4 +1,5 @@
 #seismometer
 #litellm==1.65.4
+pandas==2.2.3
 jupyterlab==4.4.1
 pre-commit==4.2.0
diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Runs once after the dev container is created (devcontainer.json postCreateCommand).
+set -euo pipefail
+# Install the library in editable mode from the repository root, whatever the caller's cwd.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+pip install -e .
diff --git a/changelog/7.bugfix.rst b/changelog/7.bugfix.rst
@@ -0,0 +1,3 @@
+Update default postprocessing to handle the objects returned by more providers.
+Update the results transform to handle single score objects.
+Update decorator to use pathlib instead of os.
diff --git a/src/evaluation_instruments/_evaluation.py b/src/evaluation_instruments/_evaluation.py
@@ -66,7 +66,9 @@ def __init__(
 
         self.log_enabled = log_enabled
         self._model_args = model_args or {}
-        self.capacity = TokenUsage(None, None, max_tokens)
+
+        self.tmp_dir: Optional[Path] = None
+        self.capacity: TokenUsage = TokenUsage(None, None, max_tokens)
 
         logger.debug(f"Set up with {log_enabled=} and capacity {max_tokens}")
 
@@ -137,22 +139,20 @@ def run_dataset(self, df: "pd.DataFrame", model: str = None, capacity: int = Non
             prompt = self.prep_fn(sample)
 
             # Delegate
-            raw_output = self.completion_fn(model, prompt, **self._model_args)
+            raw_output = self.completion_fn(model=model, messages=prompt, **self._model_args)
 
             response, usage = self._post_fn(sample_ix, raw_output)
             accumulated_usage += TokenUsage(**usage)
 
-            tmp_dir = self._dump_to_temp(sample_ix=sample_ix, raw_content=raw_output)
             outputs[sample_ix] = response
-
             logger.debug(f"{sample_ix}-Completed evaluation")
 
             # abort if beyond capacity
             if accumulated_usage > max_usage:
                 logger.warning(f"Aborting run after {sample_ix}. Capacity exceeded: {accumulated_usage} > {max_usage}")
                 break
 
-        if tmp_dir is not None:
-            logger.info(f"Dumped raw content to {tmp_dir}")
+        if self.tmp_dir is not None:  # set lazily by _dump_to_temp on the first dump
+            logger.info(f"Dumped raw content to {self.tmp_dir}")
 
         return outputs, accumulated_usage
@@ -178,20 +178,21 @@ def _dump_to_temp(self, sample_ix, raw_content) -> Optional[Path]:
             return None
 
-        datestamp = datetime.now().strftime("%Y%m%d-%Hh")  # Generate a timestamp in the format YYYYMMDD-hhmm
+        datestamp = datetime.now().strftime("%Y%m%d-%Hh")  # Hour-resolution stamp, e.g. 20250501-14h
-        tmp_dir = Path(tempfile.gettempdir()) / "evaluation_logs" / f"{datestamp}"
-        tmp_dir.mkdir(parents=True, exist_ok=True)
+
+        if self.tmp_dir is None:
+            tmp_dir = Path(tempfile.gettempdir()) / "evaluation_logs" / f"{datestamp}"
+            tmp_dir.mkdir(parents=True, exist_ok=True)
+            self.tmp_dir = tmp_dir
 
         timestamp = datetime.now().strftime("%H%M%S")  # Generate a timestamp in the format hhmmss
-        filepath = tmp_dir / f"{sample_ix}_raw_{timestamp}.json"
+        filepath = self.tmp_dir / f"{sample_ix}_raw_{timestamp}.json"
 
         with open(filepath, "w") as f:
             # Write the raw content to the file
             if isinstance(raw_content, dict):
                 json.dump(raw_content, f)
             else:
-                f.write(raw_content)
-
-        return tmp_dir
+                f.write(str(raw_content))
 
     def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, TokenUsage]:
         """
@@ -214,16 +215,25 @@ def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, Toke
             as well as the usage information from response['usage'].
         """
         ix = 0  # assume N=1
+        try:  # Many providers return their own response objects; try converting via .json()
+            openai_json = openai_json.json()
+        except AttributeError:
+            pass  # no .json() method (e.g. already a dict); use the value as-is
+
+        if isinstance(openai_json, str):
+            openai_json = json.loads(openai_json)
+
         try:
             raw_content = openai_json["choices"][ix]["message"]["content"]
-
             # Assume no nesting {}
             response = json.loads(raw_content[raw_content.find("{") : raw_content.find("}") + 1])  # noqa: E203
         except Exception:
             logger.info(f"Failed to parse {sample_ix} response content as JSON.")
             response = {}
 
         usage = openai_json.get("usage", {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0})
+
+        self._dump_to_temp(sample_ix=sample_ix, raw_content=openai_json)
         return response, usage
 
 

diff --git a/src/evaluation_instruments/post/__init__.py b/src/evaluation_instruments/post/__init__.py
@@ -2,10 +2,31 @@
 
 
 def frame_from_evals(full_output: dict, criteria_outputs: list[str] = None) -> pd.DataFrame:
+    """
+    Convert the full output of the evaluation into a DataFrame.
+
+    Useful for when the evaluation returns multiple outputs per criteria, such as a score plus evidence.
+
+    Parameters
+    ----------
+    full_output : dict
+        The full output of the evaluation, typically a dictionary with keys as the primary keys and values as dictionaries
+        containing the criteria and their respective outputs.
+    criteria_outputs : list[str], optional
+        The names of the outputs to include in the DataFrame. If not provided, defaults to ["evidence", "score"].
+        Specify [] or a single valued list to return a DataFrame with only one level, the criteria names.
+
+    Returns
+    -------
+    pd.DataFrame
+        The evaluation outputs as a table; an empty DataFrame if ``full_output`` is empty.
+    """
     if not full_output:
         return pd.DataFrame()
 
-    criteria_outputs = criteria_outputs or ["evidence", "score"]
+    criteria_outputs = criteria_outputs if criteria_outputs is not None else ["evidence", "score"]
+    if len(criteria_outputs) < 2:
+        return pd.DataFrame.from_dict(full_output, orient="index")
 
     reformatted = {}
     for pk, criteria in full_output.items():

diff --git a/src/evaluation_instruments/prep/data_handler.py b/src/evaluation_instruments/prep/data_handler.py
@@ -1,8 +1,8 @@
 import json
 import logging
-import os
 from functools import wraps
 from typing import Callable, Optional
+from pathlib import Path
 
 logger = logging.getLogger("evaluation")
 
@@ -38,14 +38,18 @@ def wrapped(sample: "namedtuple"):
             """
 
             # Get the file path from the namedtuple using the key
-            filename = getattr(sample, namedtuple_key) + ".json"
+            # Check the raw value: a Path is always truthy, even Path(""), so test before wrapping
+            key_value = getattr(sample, namedtuple_key)
+            if not key_value:
+                return fn({})  # no file to read; same empty payload as a missing file
+            filename = Path(f"{key_value}.json")  # append, so dotted keys such as "a.b" keep their full name
             if data_path:
-                filename = os.path.join(data_path, filename)
+                filename = Path(data_path) / filename
 
-            if not filename or not os.path.isfile(filename):
+            if not filename.is_file():
                 raw_json = {}
             else:  # Open the file and read its contents
-                with open(filename, "r") as file:
+                with filename.open("r") as file:
                     raw_json = json.load(file)
 
             # Call the original function with the file contents

diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
@@ -7,24 +7,54 @@
 from evaluation_instruments._evaluation import Evaluation
 from evaluation_instruments.model import TokenUsage
 
-
-def fake_completion(prompt, **kwargs):
-    """Fake completion function to simulate OpenAI API response."""
+def example_dict():
     return {
-        "choices": [{"message": {"content": json.dumps({"result": "success"})}}],
-        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
-    }
+            "choices": [{"message": {"content": json.dumps({"result": "success"})}}],
+            "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
+        }
+
+class CompletionObj:
+    def __init__(self, response_dict):
+        self.response_dict = response_dict
 
+    def dict(self):
+        return self.response_dict
+
+    def json(self):
+        return json.dumps(self.response_dict)
+
+@pytest.fixture
+def sample_evaluation_obj():
+    """
+    Create a basic evaluation instance with mock functions
+    Returns an object with a dict method, like Openai
+    """
+    prep_fn = MagicMock(return_value="test prompt")
+    completion_fn = MagicMock(
+        return_value=CompletionObj(example_dict())
+    )
+    post_fn = MagicMock(
+        return_value=({"result": "success"}, {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15})
+    )
+
+    return Evaluation(
+        prep_fn=prep_fn,
+        completion_fn=completion_fn,
+        post_process_fn=post_fn,
+        log_enabled=False,
+        model_args={},
+        max_tokens=1000,
+    )
 
 @pytest.fixture
 def sample_evaluation():
-    """Create a basic evaluation instance with mock functions."""
+    """
+    Create a basic evaluation instance with mock functions.
+    Directly returns a dictionary
+    """
     prep_fn = MagicMock(return_value="test prompt")
     completion_fn = MagicMock(
-        return_value={
-            "choices": [{"message": {"content": '{"result": "success"}'}}],
-            "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
-        }
+        return_value=example_dict()
     )
     post_fn = MagicMock(
         return_value=({"result": "success"}, {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15})
@@ -42,10 +72,7 @@ def sample_evaluation():
 
 def static_completion(prompt, **kwargs):
     """Fake completion function to simulate OpenAI API response."""
-    return {
-        "choices": [{"message": {"content": json.dumps({"result": "success"})}}],
-        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
-    }
+    return example_dict()
 
 
 def static_prep(sample):
@@ -115,7 +142,7 @@ def test_toggle_logging(self, initial_state, sample_evaluation):
         assert result == (not initial_state)
 
     def test_run_dataset(self, sample_evaluation):
-        """Test the run_dataset method processes samples correctly."""
+        """Test run_dataset when the completion function returns a plain dict."""
         # Create a mock DataFrame
         df = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]})
 
@@ -132,6 +159,24 @@ def test_run_dataset(self, sample_evaluation):
         assert all(v == {"result": "success"} for v in outputs.values())
         assert usage == TokenUsage(20, 10, 30)
 
+    def test_run_dataset_obj(self, sample_evaluation_obj):
+        """Test run_dataset when the completion function returns a response object."""
+        # Create a mock DataFrame
+        df = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]})
+
+        # Run the dataset
+        outputs, usage = sample_evaluation_obj.run_dataset(df)
+
+        # Check that prep_fn, completion_fn, and post_fn were called for each sample
+        assert sample_evaluation_obj._prep_fn.call_count == 2
+        assert sample_evaluation_obj._completion_fn.call_count == 2
+        assert sample_evaluation_obj._post_fn.call_count == 2
+
+        # Check outputs and usage
+        assert len(outputs) == 2
+        assert all(v == {"result": "success"} for v in outputs.values())
+        assert usage == TokenUsage(20, 10, 30)
+
     def test_run_dataset_empty(self, sample_evaluation, caplog):
         """Test the run_dataset method processes samples correctly."""
 

diff --git a/tests/test_post.py b/tests/test_post.py
@@ -89,3 +89,27 @@ def test_frame_from_evals_single_item():
 
     # Verify exact match with expected DataFrame
     assert_frame_equal(result_df, expected_df)
+
+def test_frame_from_evals_single_score_multiple_items():
+    """Test frame_from_evals with multiple items but only single score values (no evidence)."""
+    # Sample with single score per criteria, multiple items
+    eval_output = {
+        "sample1": {
+            "criteria1": 4,
+            "criteria2": 3,
+        },
+        "sample2": {
+            "criteria1": 5,
+            "criteria2": 2,
+        },
+    }
+    expected_df = pd.DataFrame(
+        {"criteria1": [4, 5], "criteria2": [3, 2]},
+        index=["sample1", "sample2"]
+    )
+
+    # Act
+    result_df = frame_from_evals(eval_output, [])
+
+    # Verify exact match with expected DataFrame
+    assert_frame_equal(result_df, expected_df)