Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
// Uncomment the next line if you want to keep your containers running after VS Code shuts down.
// "shutdownAction": "none",
// The next line runs commands after the container is created; it auto-installs the pre-commit hooks and the library.
// "postCreateCommand": "echo \"\\nif [ -f /home/seismo/workspace/.devcontainer/.bashrc ]; then\\n . /home/seismo/workspace/.devcontainer/.bashrc;\\nfi\" >> $HOME/.bashrc",
"postCreateCommand": "/home/seismo/workspace/.devcontainer/post-create.sh",
// Uncomment the next line to have VS Code connect as an existing non-root user in the container. See
// https://aka.ms/vscode-remote/containers/non-root for details on adding a non-root user if none exist.
//"remoteUser": "seismo"
Expand Down
3 changes: 3 additions & 0 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ services:
# Update this to wherever you want VS Code to mount the folder of your project
- ..:/home/seismo/workspace

# Mount certificates read-only
- /usr/local/share/ca-certificates:/usr/local/share/ca-certificates:ro

# [From Extension] Forwards the local Docker socket to the container.
- /var/run/docker.sock:/var/run/docker.sock

Expand Down
12 changes: 11 additions & 1 deletion .devcontainer/image/stagefiles/layer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ apt-get install --no-install-recommends --yes --quiet \
graphviz \
libgraphviz-dev \
pandoc \
virtualenv
virtualenv \
ca-certificates


# Make vim tiny default
ln -sfn /usr/bin/vi /usr/bin/vim
Expand All @@ -24,3 +26,11 @@ apt-get autoremove -y
apt-get clean

rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Append to seismo user's .bashrc to source secrets/.env if it exists
echo '
# Source secrets/.env if it exists
if [ -f "$HOME/workspace/secrets/.env" ]; then
source "$HOME/workspace/secrets/.env"
fi
' >> /home/seismo/.bashrc
1 change: 1 addition & 0 deletions .devcontainer/image/stagefiles/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#seismometer
#litellm==1.65.4
pandas==2.2.3
jupyterlab==4.4.1
pre-commit==4.2.0
1 change: 1 addition & 0 deletions .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pip install -e .
3 changes: 3 additions & 0 deletions changelog/7.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Update default postprocessing to handle the objects returned by more providers.
Update the results transform to handle single score objects.
Update decorator to use pathlib instead of os.
34 changes: 22 additions & 12 deletions src/evaluation_instruments/_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ def __init__(

self.log_enabled = log_enabled
self._model_args = model_args or {}
self.capacity = TokenUsage(None, None, max_tokens)

self.tmp_dir: Optional[Path] = None
self.capacity: TokenUsage = TokenUsage(None, None, max_tokens)

logger.debug(f"Set up with {log_enabled=} and capacity {max_tokens}")

Expand Down Expand Up @@ -137,22 +139,20 @@ def run_dataset(self, df: "pd.DataFrame", model: str = None, capacity: int = Non
prompt = self.prep_fn(sample)

# Delegate
raw_output = self.completion_fn(model, prompt, **self._model_args)
raw_output = self.completion_fn(model=model, messages=prompt, **self._model_args)

response, usage = self._post_fn(sample_ix, raw_output)
accumulated_usage += TokenUsage(**usage)

tmp_dir = self._dump_to_temp(sample_ix=sample_ix, raw_content=raw_output)
outputs[sample_ix] = response

logger.debug(f"{sample_ix}-Completed evaluation")

# abort if beyond capacity
if accumulated_usage > max_usage:
logger.warning(f"Aborting run after {sample_ix}. Capacity exceeded: {accumulated_usage} > {max_usage}")
break

if tmp_dir is not None:
if self.tmp_dir is not None:
logger.info(f"Dumped raw content to {tmp_dir}")

return outputs, accumulated_usage
Expand All @@ -178,20 +178,21 @@ def _dump_to_temp(self, sample_ix, raw_content) -> Optional[Path]:
return None

datestamp = datetime.now().strftime("%Y%m%d-%Hh") # Generate a datestamp in the format YYYYMMDD-HHh (hour granularity; used as the log directory name)
tmp_dir = Path(tempfile.gettempdir()) / "evaluation_logs" / f"{datestamp}"
tmp_dir.mkdir(parents=True, exist_ok=True)

if self.tmp_dir is None:
tmp_dir = Path(tempfile.gettempdir()) / "evaluation_logs" / f"{datestamp}"
tmp_dir.mkdir(parents=True, exist_ok=True)
self.tmp_dir = tmp_dir

timestamp = datetime.now().strftime("%H%M%S") # Generate a timestamp in the format hhmmss
filepath = tmp_dir / f"{sample_ix}_raw_{timestamp}.json"
filepath = self.tmp_dir / f"{sample_ix}_raw_{timestamp}.json"

with open(filepath, "w") as f:
# Write the raw content to the file
if isinstance(raw_content, dict):
json.dump(raw_content, f)
else:
f.write(raw_content)

return tmp_dir
f.write(str(raw_content))

def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, TokenUsage]:
"""
Expand All @@ -214,16 +215,25 @@ def post_process_default(self, sample_ix, openai_json: dict) -> tuple[dict, Toke
as well as the usage information from response['usage'].
"""
ix = 0 # assume N=1
try: # Many providers have their own response objects, try to convert
openai_json = openai_json.json()
except AttributeError:
pass

if isinstance(openai_json, str):
openai_json = json.loads(openai_json)

try:
raw_content = openai_json["choices"][ix]["message"]["content"]

# Assume no nesting {}
response = json.loads(raw_content[raw_content.find("{") : raw_content.find("}") + 1]) # noqa: E203
except Exception:
logger.info(f"Failed to parse {sample_ix} response content as JSON.")
response = {}

usage = openai_json.get("usage", {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0})

self._dump_to_temp(sample_ix=sample_ix, raw_content=openai_json)
return response, usage


Expand Down
23 changes: 22 additions & 1 deletion src/evaluation_instruments/post/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,31 @@


def frame_from_evals(full_output: dict, criteria_outputs: list[str] = None) -> pd.DataFrame:
"""
Convert the full output of the evaluation into a DataFrame.

Useful for when the evaluation returns multiple outputs per criteria, such as a score plus evidence.

Parameters
----------
full_output : dict
The full output of the evaluation, typically a dictionary with keys as the primary keys and values as dictionaries
containing the criteria and their respective outputs.
criteria_outputs : list[str], optional
The names of the outputs to include in the DataFrame. If not provided, defaults to ["evidence", "score"].
Specify [] or a single valued list to return a DataFrame with only one level, the criteria names.

Returns
-------
pd.DataFrame
The evaluation outputs as a DataFrame indexed by the primary keys of ``full_output``.
"""
if not full_output:
return pd.DataFrame()

criteria_outputs = criteria_outputs or ["evidence", "score"]
criteria_outputs = criteria_outputs if criteria_outputs is not None else ["evidence", "score"]
if len(criteria_outputs) < 2:
return pd.DataFrame.from_dict(full_output, orient="index")

reformatted = {}
for pk, criteria in full_output.items():
Expand Down
14 changes: 9 additions & 5 deletions src/evaluation_instruments/prep/data_handler.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging
import os
from functools import wraps
from typing import Callable, Optional
from pathlib import Path

logger = logging.getLogger("evaluation")

Expand Down Expand Up @@ -38,14 +38,18 @@ def wrapped(sample: "namedtuple"):
"""

# Get the file path from the namedtuple using the key
filename = getattr(sample, namedtuple_key) + ".json"
filename = Path(getattr(sample, namedtuple_key))
if not filename:
return fn(filename)

filename = filename.with_suffix(".json")
if data_path:
filename = os.path.join(data_path, filename)
filename = Path(data_path) / filename

if not filename or not os.path.isfile(filename):
if not filename.is_file():
raw_json = {}
else: # Open the file and read its contents
with open(filename, "r") as file:
with filename.open("r") as file:
raw_json = json.load(file)

# Call the original function with the file contents
Expand Down
77 changes: 61 additions & 16 deletions tests/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,54 @@
from evaluation_instruments._evaluation import Evaluation
from evaluation_instruments.model import TokenUsage


def fake_completion(prompt, **kwargs):
"""Fake completion function to simulate OpenAI API response."""
def example_dict():
return {
"choices": [{"message": {"content": json.dumps({"result": "success"})}}],
"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
}
"choices": [{"message": {"content": json.dumps({"result": "success"})}}],
"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
}

class CompletionObj:
    """Minimal stand-in for a provider response object wrapping a response dict.

    Lets the tests mimic completion functions that return an object rather
    than a plain dictionary.
    """

    def __init__(self, response_dict):
        # Stored by reference; no copy is made.
        self.response_dict = response_dict

    def dict(self):
        """Return the wrapped response dictionary itself (not a copy)."""
        return self.response_dict

    def json(self):
        """Return the wrapped response serialized as a JSON string."""
        return json.dumps(self.response_dict)

@pytest.fixture
def sample_evaluation_obj():
    """
    Build an Evaluation wired with mocks whose completion returns an object.

    The mocked completion function hands back a ``CompletionObj`` (exposing
    ``dict()`` and ``json()``) instead of a plain dictionary, like OpenAI.
    """
    canned_usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}

    mock_prep = MagicMock(return_value="test prompt")
    mock_completion = MagicMock(return_value=CompletionObj(example_dict()))
    # Post-processing is mocked too: it always yields this fixed (response, usage) pair.
    mock_post = MagicMock(return_value=({"result": "success"}, canned_usage))

    evaluation = Evaluation(
        prep_fn=mock_prep,
        completion_fn=mock_completion,
        post_process_fn=mock_post,
        log_enabled=False,
        model_args={},
        max_tokens=1000,
    )
    return evaluation

@pytest.fixture
def sample_evaluation():
"""Create a basic evaluation instance with mock functions."""
"""
Create a basic evaluation instance with mock functions.
Directly returns a dictionary
"""
prep_fn = MagicMock(return_value="test prompt")
completion_fn = MagicMock(
return_value={
"choices": [{"message": {"content": '{"result": "success"}'}}],
"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
}
return_value=example_dict()
)
post_fn = MagicMock(
return_value=({"result": "success"}, {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15})
Expand All @@ -42,10 +72,7 @@ def sample_evaluation():

def static_completion(prompt, **kwargs):
"""Fake completion function to simulate OpenAI API response."""
return {
"choices": [{"message": {"content": json.dumps({"result": "success"})}}],
"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
}
return example_dict()


def static_prep(sample):
Expand Down Expand Up @@ -115,7 +142,7 @@ def test_toggle_logging(self, initial_state, sample_evaluation):
assert result == (not initial_state)

def test_run_dataset(self, sample_evaluation):
"""Test the run_dataset method processes samples correctly."""
"""Test that run_object returns the expected output."""
# Create a mock DataFrame
df = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]})

Expand All @@ -132,6 +159,24 @@ def test_run_dataset(self, sample_evaluation):
assert all(v == {"result": "success"} for v in outputs.values())
assert usage == TokenUsage(20, 10, 30)

def test_run_dataset_obj(self, sample_evaluation_obj):
    """Test that run_dataset returns the expected output when completions are response objects."""
    # NOTE: deliberately not named ``test_run_dataset``. A second method with
    # that name in the same class body would replace the first definition, so
    # only one of the two tests would ever be collected and run.

    # Create a mock DataFrame
    df = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]})

    # Run the dataset
    outputs, usage = sample_evaluation_obj.run_dataset(df)

    # Check that prep_fn, completion_fn, and post_fn were called for each sample
    assert sample_evaluation_obj._prep_fn.call_count == 2
    assert sample_evaluation_obj._completion_fn.call_count == 2
    assert sample_evaluation_obj._post_fn.call_count == 2

    # Check outputs and usage (two samples, each reporting 10/5/15 tokens)
    assert len(outputs) == 2
    assert all(v == {"result": "success"} for v in outputs.values())
    assert usage == TokenUsage(20, 10, 30)

def test_run_dataset_empty(self, sample_evaluation, caplog):
"""Test the run_dataset method processes samples correctly."""

Expand Down
24 changes: 24 additions & 0 deletions tests/test_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,27 @@ def test_frame_from_evals_single_item():

# Verify exact match with expected DataFrame
assert_frame_equal(result_df, expected_df)

def test_frame_from_evals_single_score_multiple_items():
    """Check frame_from_evals on several items whose criteria hold bare scores (no evidence)."""
    # Arrange: each criterion maps straight to a single score value
    scores_by_sample = {
        "sample1": {"criteria1": 4, "criteria2": 3},
        "sample2": {"criteria1": 5, "criteria2": 2},
    }
    wanted = pd.DataFrame(
        {"criteria1": [4, 5], "criteria2": [3, 2]},
        index=["sample1", "sample2"],
    )

    # Act: an empty criteria_outputs list requests the single-level layout
    actual = frame_from_evals(scores_by_sample, [])

    # Assert: exact match with the expected DataFrame
    assert_frame_equal(actual, wanted)
Loading