-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessResults.py
More file actions
152 lines (134 loc) · 6.02 KB
/
processResults.py
File metadata and controls
152 lines (134 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import pathlib, os, json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
def loadResults(xp_path):
    """Load all job results from an experimaestro experiment directory.

    Each sub-directory of ``xp_path`` is treated as one job. A job's
    ``params.json`` is mandatory; ``history.json``, evaluation files and
    inference files are optional and missing/corrupt ones are skipped
    with a message rather than aborting the whole load.

    xp_path: str or pathlib.Path, path to the jobs to load.
    Returns: pandas.DataFrame with one row per job.
    """
    # Path() is idempotent, so no need for an explicit type check
    xp_path = Path(xp_path)
    jobs = os.listdir(xp_path)
    results = []
    for job in tqdm(jobs):
        jobPath = xp_path / job
        job_data = {"path": jobPath, "hash": str(job)}
        with open(jobPath / "params.json") as json_file:
            params = json.load(json_file)
        # flatten the experimaestro parameter structure into job_data
        job_data.update(params["objects"][0]["fields"])
        # backward-compatible defaults for params added in later runs
        job_data.setdefault("with_context", False)
        job_data.setdefault("dataset_name", "WebNLG")
        job_data.setdefault("extraction_method", "after_context")
        # Optional training history. Note: the previous version assigned
        # an undefined `history` variable when loading failed (NameError);
        # now the key is simply omitted on failure.
        hist_path = jobPath / "history.json"
        if hist_path.exists():
            try:
                with open(hist_path) as json_file:
                    job_data["history"] = json.load(json_file)
            except (OSError, json.JSONDecodeError):
                print(f"error loading {hist_path}")
        files = os.listdir(jobPath)
        # Most recent evaluation file (lexicographic order, so timestamped
        # names sort chronologically)
        eval_files = sorted(f for f in files if "evaluation" in f.lower())
        if not eval_files:
            job_data["Eval"] = None
        else:
            eval_f = str(jobPath / eval_files[-1])
            try:
                with open(eval_f) as json_file:
                    job_data["Eval"] = json.load(json_file)
            except (OSError, json.JSONDecodeError):
                # corrupt eval file: record the failure explicitly instead
                # of leaving the "Eval" key absent for this row
                print(f"error loading {eval_f}")
                job_data["Eval"] = None
            job_data["date"] = os.path.getmtime(eval_f)
        # Most recent inference file (by modification time); stored as a
        # full path — consumers should use it as-is
        inf_files = [f for f in files if "inference" in f.lower()]
        if not inf_files:
            job_data["inference"] = None
        else:
            inf_files.sort(key=lambda f: os.path.getmtime(jobPath / f), reverse=True)
            job_data["inference"] = jobPath / inf_files[0]
        results.append(job_data)
    return pd.DataFrame(results)
def get_inference_res(results,
                      model_name,
                      layer,
                      dataset_name=None,
                      with_context = False,
                      extraction_method="in_context",
                      verbose=False):
    """Return the parsed inference JSON of the first job matching the filters.

    results: pandas.DataFrame as produced by loadResults.
    model_name / layer / with_context / extraction_method: exact-match filters.
    dataset_name: optional extra filter, applied only when truthy.
    verbose: print how many rows matched and which file is loaded.
    Returns: the decoded JSON object, or None when no row matches.
    """
    results = results[(results["model_name"] == model_name)
                      & (results["layer"] == layer)
                      & (results["with_context"] == with_context)
                      & (results["extraction_method"] == extraction_method)
                      ]
    if dataset_name:
        results = results[results["dataset_name"] == dataset_name]
    if verbose: print(f"found {len(results)} results for layer {layer} of {model_name} {'with' if with_context else 'without'} context {'on ' + dataset_name if dataset_name else ''} with method {extraction_method}.")
    if len(results) == 0:
        return None
    res = results.iloc[0].to_dict()
    # "inference" already holds the full file path (see loadResults); the
    # former `res["path"] / res["inference"]` join duplicated the job
    # directory whenever the experiment path was relative.
    inference_file = Path(res["inference"])
    if verbose: print(f"got inference files: {inference_file}")
    with open(inference_file) as json_file:
        inference = json.load(json_file)
    return inference
def get_taskVec(results, model_name, layer,
                dataset_name=None,
                with_context = False,
                extraction_method="in_context",
                verbose:bool=True):
    """Locate the task-vector checkpoint of the first matching evaluated job.

    Filters `results` on model/layer/context/extraction-method and keeps
    only rows with a non-null "Eval", then searches that job's directory
    for a ``*.pth`` file whose name contains "task".

    Returns the checkpoint path, None when no row matches, and raises
    FileNotFoundError when a row matches but holds no task-vector file.
    """
    mask = (
        (results["model_name"] == model_name)
        & (results["layer"] == layer)
        & (results["with_context"] == with_context)
        & (results["extraction_method"] == extraction_method)
        & (results["Eval"].notna())
    )
    filtered = results[mask]
    if dataset_name:
        filtered = filtered[filtered["dataset_name"] == dataset_name]
    if verbose:
        print(f"found {len(filtered)} jobs for layer {layer} of {model_name} {'with' if with_context else 'without'} context {'on ' + dataset_name if dataset_name else ''} with method {extraction_method}.")
    if len(filtered) == 0:
        return None
    first = filtered.iloc[0].to_dict()
    job_dir = first["path"]
    candidates = [
        name for name in os.listdir(job_dir)
        if name.endswith(".pth") and "task" in name.lower()
    ]
    if verbose:
        print(f"found {candidates} ")
    if not candidates:
        if verbose:
            print("No taskVec file found")
        raise FileNotFoundError(f"No taskVec file found in {first['path']}")
    return job_dir / candidates[0]
def get_linear_model(results, model_name, layer, dataset_name=None, with_context = False, verbose:bool=True):
    """Locate the linear-probe checkpoint of the first matching job.

    Filters `results` on model/layer/context (and optionally dataset),
    then searches the job directory for a ``*.pth`` file whose name
    contains "linear" or "extractor".

    Returns the checkpoint path, or None when no row or no file matches.
    """
    selection = results[
        (results["model_name"] == model_name)
        & (results["layer"] == layer)
        & (results["with_context"] == with_context)
    ]
    if dataset_name:
        selection = selection[selection["dataset_name"] == dataset_name]
    if verbose:
        print(f"found {len(selection)} jobs for layer {layer} of {model_name} {'with' if with_context else 'without'} context {'on ' + dataset_name if dataset_name else ''}")
    if len(selection) == 0:
        return None
    row = selection.iloc[0].to_dict()
    job_dir = row["path"]
    pth_files = [
        name
        for name in os.listdir(job_dir)
        if name.endswith(".pth") and ("linear" in name.lower() or "extractor" in name.lower())
    ]
    if verbose:
        print(f"found {pth_files} ")
    if not pth_files:
        if verbose:
            print("No linear model pth found")
        return None
    return job_dir / pth_files[0]