DriftAnalysis/load.py at main · isneslab/DriftAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
load.py
~~~~~~~~~~~

This module loads in data from a given file and adds
functionality to manipulate them to the required
format for other modules. Each dataset requires its own loading function.

"""
import datetime
import numpy as np
import json
import os
import pandas as pd
from tqdm import tqdm
import pickle
import util

from sklearn.feature_extraction import DictVectorizer

def load_file(filename):
    """Helper function to load json files

    Arg:
        filename (str): File path of json file to be loaded

    Returns:
        (json object): json file object

    """
    with open(filename, 'r') as f:
        data = json.load(f)

    return data

def get_family_name(labels):
    """Helper function for getting family names"""
    for label in labels:
        if label.startswith('FAM'):
            family = label.split('FAM:')[1].split('|')[0]
            return family


def load_transcend(X, y, meta_info, meta_family):
    """Loading function for transcend dataset

    Args:
        X (str): File path of X
        y (str): File path of y
        meta_info (str): File path of meta info file containing timestamps and md5
        meta_family (str): File path of tsv file containing family labels

    Returns:
        (scipy.sparse.csr_matrix, np.ndarray, np.ndarray, np.ndarray, np.ndarray):
            Array of predictors X
            Array of predictors y
            Array of time stamps
            Array of family labels
            Array of feature names
            Array of md5
    """
    print("Loading Transcend dataset, this can take up to 3 minutes...")

    # Load in X and convert to numpy ndarray
    X = load_file(X)
    vec = DictVectorizer()
    X = vec.fit_transform(X).astype("float32")
    feature_names = vec.get_feature_names_out()
    print("X loaded")

    # Load in y
    y = load_file(y)
    y = np.asarray(y).flatten()
    print("y loaded")

    # Load in time
    meta_file = load_file(meta_info)
    t = [n['dex_date'] for n in meta_file]
    t = [datetime.datetime.strptime(n, '%Y-%m-%dT%H:%M:%S') if "T" in n
             else datetime.datetime.strptime(n, '%Y-%m-%d %H:%M:%S') for n in t]
    t = np.array(t)
    print("Timestamps loaded")

    # Get md5 for each sample
    md5 = [n['md5'].upper() for n in meta_file]

    # Read family meta info file as dataframe
    print("Loading family labels")
    family_pd = pd.read_csv(meta_family, delimiter='\t')

    # Splice out samples with family labels
    family_pd = family_pd.loc[~family_pd['families'].isnull() & ~(family_pd['families'] == '[]')]

    # Find family label from md5
    if os.path.exists("pkl_files/family_labels.pkl"):
        with open("pkl_files/family_labels.pkl","rb") as file:
            data = pickle.load(file)
        index_with_families = data[0]
        f = data[1]

    else:
        index_with_families = []
        f = []

        family_pd['md5'] = family_pd['md5'].str.upper()
        malware_with_labels = family_pd.loc[family_pd["md5"].isin(md5)]

        for idx, md5_sample in tqdm(enumerate(md5), total=len(md5)):
            if y[idx] == 0:
                f.append('GOODWARE')
                index_with_families.append(idx)
            elif md5_sample in list(malware_with_labels['md5']):
                labels = malware_with_labels['families'].loc[malware_with_labels['md5'] == md5_sample].to_list()
                labels = labels[-1].split(',')

                if get_family_name(labels):
                    family = get_family_name(labels).upper()
                    f.append(family.upper())
                    index_with_families.append(idx)
            else:
                pass
        with open("pkl_files/family_labels.pkl","wb") as file:
            pickle.dump([index_with_families,f],file)

    print("Family labels loaded")

    # Feature reduction
    X, feature_names = util.feature_reduction(X, y, feature_names, "pkl_files/feature_index_1000_before_greyware.pkl", feature_size=1000)

    y = y[index_with_families]
    X = X[index_with_families]
    t = t[index_with_families]
    f = np.array(f)
    md5 = np.array(md5)[index_with_families]
    print("Finished loading Transcend dataset")
    return X, y, t, f, feature_names, md5