|
28 | 28 | }, |
29 | 29 | { |
30 | 30 | "cell_type": "code", |
31 | | - "execution_count": null, |
| 31 | + "execution_count": 1, |
32 | 32 | "id": "fb6539a1", |
33 | 33 | "metadata": {}, |
34 | 34 | "outputs": [], |
|
60 | 60 | }, |
61 | 61 | { |
62 | 62 | "cell_type": "code", |
63 | | - "execution_count": null, |
| 63 | + "execution_count": 2, |
64 | 64 | "id": "edaee0f2", |
65 | 65 | "metadata": {}, |
66 | 66 | "outputs": [], |
|
99 | 99 | }, |
100 | 100 | { |
101 | 101 | "cell_type": "code", |
102 | | - "execution_count": null, |
| 102 | + "execution_count": 3, |
103 | 103 | "id": "604d7a61", |
104 | 104 | "metadata": {}, |
105 | | - "outputs": [], |
| 105 | + "outputs": [ |
| 106 | + { |
| 107 | + "name": "stderr", |
| 108 | + "output_type": "stream", |
| 109 | + "text": [ |
| 110 | + "DEBUG:evaluation:Set up with log_enabled=True and capacity 10000\n" |
| 111 | + ] |
| 112 | + } |
| 113 | + ], |
106 | 114 | "source": [ |
107 | 115 | "from draft_appeal_prompt import to_prompt\n", |
108 | 116 | "import evaluation_instruments as ev\n", |
|
146 | 154 | }, |
147 | 155 | { |
148 | 156 | "cell_type": "code", |
149 | | - "execution_count": null, |
| 157 | + "execution_count": 4, |
150 | 158 | "id": "4db685fe", |
151 | 159 | "metadata": {}, |
152 | | - "outputs": [], |
| 160 | + "outputs": [ |
| 161 | + { |
| 162 | + "name": "stderr", |
| 163 | + "output_type": "stream", |
| 164 | + "text": [ |
| 165 | + "DEBUG:evaluation:000-Completed evaluation\n", |
| 166 | + "INFO:evaluation:Dumped raw content to None\n" |
| 167 | + ] |
| 168 | + } |
| 169 | + ], |
153 | 170 | "source": [ |
154 | 171 | "output = evaluator.run_dataset(input_df, model='gpt-4o-mini')" |
155 | 172 | ] |
|
164 | 181 | }, |
165 | 182 | { |
166 | 183 | "cell_type": "code", |
167 | | - "execution_count": null, |
| 184 | + "execution_count": 5, |
168 | 185 | "id": "7b248b1e-9ef8-4add-b057-d06de4f07f39", |
169 | 186 | "metadata": {}, |
170 | | - "outputs": [], |
| 187 | + "outputs": [ |
| 188 | + { |
| 189 | + "data": { |
| 190 | + "text/html": [ |
| 191 | + "<div>\n", |
| 192 | + "<style scoped>\n", |
| 193 | + " .dataframe tbody tr th:only-of-type {\n", |
| 194 | + " vertical-align: middle;\n", |
| 195 | + " }\n", |
| 196 | + "\n", |
| 197 | + " .dataframe tbody tr th {\n", |
| 198 | + " vertical-align: top;\n", |
| 199 | + " }\n", |
| 200 | + "\n", |
| 201 | + " .dataframe thead th {\n", |
| 202 | + " text-align: right;\n", |
| 203 | + " }\n", |
| 204 | + "</style>\n", |
| 205 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 206 | + " <thead>\n", |
| 207 | + " <tr style=\"text-align: right;\">\n", |
| 208 | + " <th></th>\n", |
| 209 | + " <th>TextQuality</th>\n", |
| 210 | + " <th>MedicalTerminology</th>\n", |
| 211 | + " <th>Grammar</th>\n", |
| 212 | + " <th>TextFormat</th>\n", |
| 213 | + " <th>Tone</th>\n", |
| 214 | + " <th>References</th>\n", |
| 215 | + " <th>RelevantReferences</th>\n", |
| 216 | + " <th>MedicalNecessity</th>\n", |
| 217 | + " <th>FalseReasoning</th>\n", |
| 218 | + " <th>Opposition</th>\n", |
| 219 | + " <th>FactualAccuracy</th>\n", |
| 220 | + " </tr>\n", |
| 221 | + " </thead>\n", |
| 222 | + " <tbody>\n", |
| 223 | + " <tr>\n", |
| 224 | + " <th>000</th>\n", |
| 225 | + " <td>4</td>\n", |
| 226 | + " <td>5</td>\n", |
| 227 | + " <td>5</td>\n", |
| 228 | + " <td>4</td>\n", |
| 229 | + " <td>5</td>\n", |
| 230 | + " <td>5</td>\n", |
| 231 | + " <td>5</td>\n", |
| 232 | + " <td>5</td>\n", |
| 233 | + " <td>5</td>\n", |
| 234 | + " <td>5</td>\n", |
| 235 | + " <td>5</td>\n", |
| 236 | + " </tr>\n", |
| 237 | + " </tbody>\n", |
| 238 | + "</table>\n", |
| 239 | + "</div>" |
| 240 | + ], |
| 241 | + "text/plain": [ |
| 242 | + " TextQuality MedicalTerminology Grammar TextFormat Tone References \\\n", |
| 243 | + "000 4 5 5 4 5 5 \n", |
| 244 | + "\n", |
| 245 | + " RelevantReferences MedicalNecessity FalseReasoning Opposition \\\n", |
| 246 | + "000 5 5 5 5 \n", |
| 247 | + "\n", |
| 248 | + " FactualAccuracy \n", |
| 249 | + "000 5 " |
| 250 | + ] |
| 251 | + }, |
| 252 | + "execution_count": 5, |
| 253 | + "metadata": {}, |
| 254 | + "output_type": "execute_result" |
| 255 | + } |
| 256 | + ], |
171 | 257 | "source": [ |
172 | 258 | "grades = ev.frame_from_evals(output[0])\n", |
173 | 259 | "grades.xs('score', axis=1, level=1)" |
|
178 | 264 | "execution_count": null, |
179 | 265 | "id": "ec27374a", |
180 | 266 | "metadata": {}, |
181 | | - "outputs": [], |
| 267 | + "outputs": [ |
| 268 | + { |
| 269 | + "ename": "KeyError", |
| 270 | + "evalue": "\"['evidence'] not in index\"", |
| 271 | + "output_type": "error", |
| 272 | + "traceback": [ |
| 273 | + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", |
| 274 | + "\u001b[31mKeyError\u001b[39m Traceback (most recent call last)", |
| 275 | + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m pd.option_context(\u001b[33m'\u001b[39m\u001b[33mdisplay.max_colwidth\u001b[39m\u001b[33m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m display(\u001b[43mgrades\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mMedicalNecessity\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mscore\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mevidence\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m)\n", |
| 276 | + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/local/venv/lib/python3.12/site-packages/pandas/core/frame.py:4108\u001b[39m, in \u001b[36mDataFrame.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 4106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[32m 4107\u001b[39m key = \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[32m-> \u001b[39m\u001b[32m4108\u001b[39m indexer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcolumns\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[32m1\u001b[39m]\n\u001b[32m 4110\u001b[39m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[32m 4111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) == \u001b[38;5;28mbool\u001b[39m:\n", |
| 277 | + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/local/venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:6200\u001b[39m, in \u001b[36mIndex._get_indexer_strict\u001b[39m\u001b[34m(self, key, axis_name)\u001b[39m\n\u001b[32m 6197\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 6198\u001b[39m keyarr, indexer, new_indexer = \u001b[38;5;28mself\u001b[39m._reindex_non_unique(keyarr)\n\u001b[32m-> \u001b[39m\u001b[32m6200\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6202\u001b[39m keyarr = \u001b[38;5;28mself\u001b[39m.take(indexer)\n\u001b[32m 6203\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[32m 6204\u001b[39m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", |
| 278 | + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/local/venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:6252\u001b[39m, in \u001b[36mIndex._raise_if_missing\u001b[39m\u001b[34m(self, key, indexer, axis_name)\u001b[39m\n\u001b[32m 6249\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m]\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 6251\u001b[39m not_found = \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask.nonzero()[\u001b[32m0\u001b[39m]].unique())\n\u001b[32m-> \u001b[39m\u001b[32m6252\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not in index\u001b[39m\u001b[33m\"\u001b[39m)\n", |
| 279 | + "\u001b[31mKeyError\u001b[39m: \"['evidence'] not in index\"" |
| 280 | + ] |
| 281 | + } |
| 282 | + ], |
182 | 283 | "source": [ |
183 | 284 | "with pd.option_context('display.max_colwidth', None):\n", |
184 | | - " display(grades['MedicalNecessity'][['score','evidence']])" |
| 285 | + " display(grades['MedicalNecessity'][['score','explanation']])" |
185 | 286 | ] |
186 | 287 | } |
187 | 288 | ], |
188 | 289 | "metadata": { |
189 | 290 | "kernelspec": { |
190 | | - "display_name": ".venv", |
| 291 | + "display_name": "Python 3 (ipykernel)", |
191 | 292 | "language": "python", |
192 | 293 | "name": "python3" |
193 | 294 | }, |
|
0 commit comments