-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdfmaxreducer.py
More file actions
84 lines (70 loc) · 2.87 KB
/
dfmaxreducer.py
File metadata and controls
84 lines (70 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
# Author handle for this module.
__AUTHOR__ = 'Kirgsn'
class Maxreducer:
    """
    Shrink the dtypes of a pandas dataframe in order to save memory.

    The reducer holds a conversion table: a dict that maps a kind of data
    ('int', 'uint', 'float') to a list of numpy dtypes ordered from smallest
    to biggest. Each column is cast to the first dtype of its kind whose
    value range contains the column's minimum and maximum.

    Note: candidates are chosen by value range only. Casting floats to a
    narrower float type (e.g. float16) can therefore lose precision.
    """
    memory_scale_factor = 1024**2  # memory in MB

    def __init__(self, conv_table=None):
        """
        :param conv_table: dict with np.dtypes-strings as keys and lists of
            numpy dtypes (smallest first) as values. Keys containing 'int'
            are treated as integer kinds, all other keys as float kinds.
            If None, a default table for 'int', 'uint' and 'float' is used.
        """
        if conv_table is None:
            conv_table = {
                'int': [np.int8, np.int16, np.int32, np.int64],
                'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
                'float': [np.float16, np.float32],
            }
        self.conversion_table = conv_table

    def _type_candidates(self, k):
        """Yield (dtype, limits) pairs for the conversion table entry k.

        :param k: key into the conversion table
        :return: generator of tuples (dtype, np.iinfo or np.finfo object)
        """
        # integer kinds are described by iinfo, everything else by finfo
        info_of = np.iinfo if 'int' in k else np.finfo
        for c in self.conversion_table[k]:
            yield c, info_of(c)

    def reduce(self, df, verbose=False):
        """Takes a dataframe and returns it with all data transformed to the
        smallest necessary types.

        Columns are processed independently through joblib (n_jobs=-1) and
        concatenated again in their original order.

        :param df: pandas dataframe
        :param verbose: If True, outputs more information
        :return: pandas dataframe with reduced data types
        """
        # pd.concat raises on an empty list, so there is nothing to do here
        if len(df.columns) == 0:
            return df.copy()
        ret_list = Parallel(n_jobs=-1)(
            delayed(self._reduce)(df[c], c, verbose) for c in df.columns)
        return pd.concat(ret_list, axis=1)

    def _reduce(self, s, colname, verbose):
        """Reduce a single column to the smallest fitting dtype.

        The column is always returned, either converted or unchanged, so
        that no data gets lost when the results are concatenated.

        :param s: pandas series holding the column data
        :param colname: column name, only used for messages
        :param verbose: If True, outputs more information
        :return: pandas series, converted if a smaller dtype fits
        """
        # skip NaNs
        if s.isnull().any():
            if verbose:
                print(colname, 'has NaNs - Skip..')
            return s
        # min/max of an empty column are NaN and would match no candidate
        if s.empty:
            if verbose:
                print(colname, 'is empty - Skip..')
            return s

        # detect kind of type
        coltype = s.dtype
        # np.issubdtype can't interpret pandas extension dtypes (category,
        # nullable ints, ...), so those are left untouched
        if not isinstance(coltype, np.dtype):
            print(colname, 'is', coltype, '- Skip..')
            return s
        if np.issubdtype(coltype, np.integer):
            col_min, col_max = s.min(), s.max()
            conv_key = 'int' if col_min < 0 else 'uint'
        elif np.issubdtype(coltype, np.floating):
            col_min, col_max = s.min(), s.max()
            conv_key = 'float'
        elif coltype == 'object':
            # a category only pays off if values repeat often enough
            n_unique = s.nunique()
            if (n_unique / s.size) <= 0.5:
                print('convert', colname, 'to category')
                return s.astype('category')
            print(colname, 'has', n_unique, 'values - Skip..')
            return s
        else:
            print(colname, 'is', coltype, '- Skip..')
            return s

        # find right candidate
        for cand, cand_info in self._type_candidates(conv_key):
            if col_max <= cand_info.max and col_min >= cand_info.min:
                if verbose:
                    print('convert', colname, 'to', str(cand))
                return s.astype(cand)

        # reaching this code is bad. Probably there are inf, or other high
        # numbs. Keep the column as it is instead of losing its data.
        print(("WARNING: {} "
               "doesn't fit the grid with \nmax: {} "
               "and \nmin: {}").format(colname, col_max, col_min))
        print('Keeping its original dtype..')
        return s