agenteval-sample/agent_eval/verifier_agent.py at main · prabdeb/agenteval-sample · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from typing import Optional
import logging

import numpy as np

from autogen_agentchat.agents import AssistantAgent
from autogen_core.models import ChatCompletionClient

from .criterion import Criterion
from .quantification import Quantification

class CriticSummarizerAgent(AssistantAgent):
    """
    An agent for summarizing criteria that are generated by running the CriticAgent multiple times.

    The class only supplies a default system message, description and agent name;
    everything else is forwarded unchanged to ``AssistantAgent``.
    """

    # Prompt used when the caller does not pass ``system_message``.
    # The literal opens with exactly three quotes so the prompt does not start
    # with a stray, unbalanced '"' character.
    DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You summarize and create the FINAL SET OF BEST POSSIBLE CRITERIA based on the criteria generated by the critic agent in multiple runs.
    Convert the evaluation criteria into a list where each item is a criteria which consists of the following dictionary as follows
    {"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key}
    Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description.
    Output just the criteria string you have created, no code."""

    # Agent description used when the caller does not pass ``description``.
    DEFAULT_DESCRIPTION = "An AI agent for summarizing the criteria generated by a critic agent by running the critic agent multiple times."

    def __init__(
        self,
        model_client: ChatCompletionClient,
        name: str = "critic_summarizer",
        system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
        description: Optional[str] = DEFAULT_DESCRIPTION,
        **kwargs,
    ):
        """
        Args:
            model_client (ChatCompletionClient): The model client for ChatCompletion inference.
            name (str): agent name.
            system_message (str): system message for the ChatCompletion inference.
                Pass a different value to reprogram the agent.
            description (str): The description of the agent.
            **kwargs (dict): Additional keyword arguments forwarded as-is to
                ``autogen_agentchat.agents.AssistantAgent.__init__``.
        """
        super().__init__(
            name=name,
            system_message=system_message,
            description=description,
            model_client=model_client,
            **kwargs,
        )

class Verify:
    """
    Filter summarized criteria by how consistently they were quantified.

    For every entry of ``sampled_quantifications`` the results of each criterion
    are collected across all values of that entry's dict, and their coefficient
    of variation (CV = standard deviation / mean) is computed. The per-entry CVs
    are then averaged per criterion, and only criteria whose average CV is
    strictly below a threshold are kept.
    """

    def __init__(
        self,
        summarized_criteria: "list[Criterion]",
        sampled_quantifications: "list[dict[int, list[Quantification]]]",
        logger: logging.Logger = logging.getLogger(__name__),
    ):
        """
        Args:
            summarized_criteria (list[Criterion]): A list of summarized criteria.
            sampled_quantifications (list[dict[int, list[Quantification]]]): One dict per
                sample; each dict maps an integer key (presumably the seed/run index —
                confirm against the caller) to the quantifications produced for it.
            logger (logging.Logger): Logger used to report the computed CV scores.
                Defaults to this module's logger.
        """
        self.summarized_criteria = summarized_criteria
        self.sampled_quantifications = sampled_quantifications
        self.logger = logger

    def _convert_categorical_to_numerical(self):
        """
        Convert the categorical values in the quantifications to numerical values.

        Each quantification's ``results`` attribute is replaced in place by the
        value returned from the matching criterion's
        ``convert_categorical_to_numerical``; the conversion is therefore applied
        again if this method is called a second time on the same objects.

        Raises:
            IndexError: If a quantification names a criterion that is not in
                ``summarized_criteria``.
        """
        # Index the criteria once instead of scanning the list for every
        # quantification. ``setdefault`` keeps the first criterion when names
        # repeat, which is the one a linear first-match search would pick.
        criteria_by_name = {}
        for criterion in self.summarized_criteria:
            criteria_by_name.setdefault(criterion.name, criterion)

        for quantification in self.sampled_quantifications:
            for quantifications in quantification.values():
                for q in quantifications:
                    try:
                        criterion = criteria_by_name[q.name]
                    except KeyError:
                        # IndexError is what the previous first-match lookup
                        # raised; keep the type, but say what went wrong.
                        raise IndexError(
                            f"No summarized criterion named {q.name!r} for quantification"
                        ) from None
                    q.results = criterion.convert_categorical_to_numerical(q.results)

    def _compute_cv_score(self, values: list[float]) -> float:
        """
        Compute the coefficient of variation (CV) score.

        Args:
            values (list[float]): A list of values.
        Returns:
            float: ``std(values) / mean(values)``; ``0.0`` when the mean is zero
                (the ratio is undefined there) and NaN when ``values`` is empty.
        """
        if np.size(values) == 0:
            # Same result NumPy would produce for an empty input, but without
            # the "mean of empty slice" runtime warnings.
            return float("nan")
        mean = np.mean(values)
        if mean == 0:
            return 0.0
        return float(np.std(values) / mean)

    def _compute_cv_score_each_sample_per_criterion_per_seed(self) -> dict[str, list[float]]:
        """
        Compute the coefficient of variation (CV) score per sample per criterion.

        Returns:
            dict[str, list[float]]: A dictionary where the keys are the criterion names and
                the values are lists of CV scores, one per sample in which the criterion
                was quantified.
        """
        cv_scores_each_sample_per_criterion = {}
        for quantification in self.sampled_quantifications:
            # Gather, for this sample, every result recorded for each criterion.
            results_by_criterion = {}
            for quantifications in quantification.values():
                for q in quantifications:
                    results_by_criterion.setdefault(q.name, []).append(q.results)
            for criterion_name, criterion_values in results_by_criterion.items():
                cv_scores_each_sample_per_criterion.setdefault(criterion_name, []).append(
                    self._compute_cv_score(criterion_values)
                )
        # The "%s" placeholder is required: without it the scores are passed as
        # a formatting argument but never rendered into the message.
        self.logger.debug("CV scores per sample per criterion: %s", cv_scores_each_sample_per_criterion)
        return cv_scores_each_sample_per_criterion

    def _compute_cv_score_per_criterion_per_sample(self) -> dict[str, float]:
        """
        Compute the coefficient of variation (CV) score per criterion, averaged over samples.

        Returns:
            dict[str, float]: A dictionary where the keys are the criterion names and the
                values are the mean of that criterion's per-sample CV scores.
        """
        cv_scores_each_sample_per_criterion = self._compute_cv_score_each_sample_per_criterion_per_seed()
        cv_scores_per_criterion = {
            criterion_name: float(np.mean(criterion_values))
            for criterion_name, criterion_values in cv_scores_each_sample_per_criterion.items()
        }
        self.logger.info("CV scores per criterion: %s", cv_scores_per_criterion)
        return cv_scores_per_criterion

    def _filter_final_criteria(self, cv_scores: dict[str, float], threshold: float = 0.5) -> "list[Criterion]":
        """
        Filter the final criteria based on the threshold.

        Args:
            cv_scores (dict[str, float]): The CV scores.
            threshold (float): The threshold to filter the final criteria. Default is 0.5.
        Returns:
            list[Criterion]: The criteria whose CV score is strictly below ``threshold``, in
                their original order. Criteria without a CV score are excluded (with a
                warning), as are criteria whose score is NaN.
        """
        final_criteria = []
        for criterion in self.summarized_criteria:
            score = cv_scores.get(criterion.name)
            if score is None:
                # A criterion that was never quantified cannot be shown to be
                # stable, so leave it out instead of failing with a KeyError.
                self.logger.warning(
                    "No CV score available for criterion %r; excluding it.", criterion.name
                )
                continue
            if score < threshold:
                final_criteria.append(criterion)
        return final_criteria

    def verify(self, threshold: float = 0.5) -> "list[Criterion]":
        """
        Verify the criteria.

        Converts the quantification results to numbers (mutating the
        quantification objects in place), computes the average CV score per
        criterion and keeps the criteria whose score is below ``threshold``.

        Args:
            threshold (float): The threshold to filter the final criteria. Default is 0.5.
        Returns:
            list[Criterion]: A list of final criteria.
        """
        self._convert_categorical_to_numerical()
        cv_scores = self._compute_cv_score_per_criterion_per_sample()
        return self._filter_final_criteria(cv_scores, threshold)