-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathJsonGenerator.py
More file actions
126 lines (92 loc) · 4.98 KB
/
JsonGenerator.py
File metadata and controls
126 lines (92 loc) · 4.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Author: Anderrow
# JSON Generator using Polars and openpyxl
# https://github.com/anderrow
import time
import polars as pl
import json, os, re , glob
start_time = time.perf_counter()
# Path to the Excel file
source_folder = "../SOURCE FILES/"
excel_files = glob.glob(os.path.join(source_folder, "*.csv")) #All .csv files in the SOURCE FILES folder
dfs = {}
# -------------------------- READ ALL EXCEL FILES -------------------------- #
for path in excel_files:
# Use the filename (without extension) as the "sheet name"
sheet_name = os.path.splitext(os.path.basename(path))[0]
try:
#Read each sheet individually using Polars
df = pl.read_csv(path)
# Check if the sheet is empty or headers are invalid (None)
if df.height == 0 or df.columns is None or any(col is None for col in df.columns):
print(f"⚠️ File '{sheet_name}' has invalid or missing headers, skipping...")
continue
# Store the DataFrame in the dictionary using the sheet name as key
dfs[sheet_name] = df
print(f"🔍 File '{sheet_name}' read successfully with {df.height} rows and {df.width} columns.")
except Exception as e:
print(f"⚠️ Error processing sheet '{sheet_name}': {e}")
print(f"current time {time.perf_counter() - start_time:.2f} seconds")
# ---------------------------- FILTER DATAFRAMES ---------------------------- #
# Initialize an empty dict for filtered DataFrames
df_filtered_dict = {}
# Each item in the dictionary has a sheet name and a DataFrame
for sheet_name, df in dfs.items():
# Replace nulls in the 3rd column with empty string and cast to UTF8 (convert to a unicode string)
df = df.with_columns(
pl.col(df.columns[2]).fill_null("").cast(pl.Utf8)
)
# Filter out rows containing "Undefined" (case-insensitive)
df_filtered = df.filter(
~pl.col(df.columns[2]).str.contains(r"(?i).*Undefined")
)
# Filter out rows containing 'Unkown message' (yes, typo included)
df_filtered2 = df_filtered.filter(
~pl.col(df.columns[2]).str.contains(r"(?i)Unkown message")
)
# Select columns 2 to 8
df_filtered_subset = df_filtered2.select(df_filtered2.columns[1:9])
# Remove rows where the second column is empty (No message found)
df_cleaned = df_filtered_subset.filter(
pl.col(df_filtered_subset.columns[1]) != ""
)
df_filtered_dict[sheet_name] = df_cleaned
print("=" * 50)
print(f"Sheet: {sheet_name}")
print(f"Original length: {len(df)}")
print(f"Filtered length: {len(df_cleaned)}")
print(f"Data has been reduced by a {100 - len(df_cleaned) * 100 / len(df):.2f}%")
print("=" * 50)
# --------------------------------END OF FILTER DATAFRAMES--------------------------------#
# -------------------------- VALIDATE COLUMN NAMES -------------------------- #
column_name_pattern = r'^[a-z]{2}-[A-Z]{2}$'
for sheet_name, df in df_filtered_dict.items():
for i in range(1, df.shape[1]): # iterate over columns except the first one. Shape returns a tuple with dimensions: [0] = number of rows, [1] = number of columns
column_name = df.columns[i]
if not re.match(column_name_pattern, column_name):
raise ValueError(
f"\n❌ Column name '{column_name}' of {sheet_name} does not match format [a-z][a-z]-[A-Z][A-Z]"
)
#---------------------------END OF VALIDATE COLUMN NAMES -----------------------#
# --------------------------- GENERATE JSON FILES --------------------------- #
for sheet_name, df in df_filtered_dict.items():
output_folder = f"../JSON FILES/{sheet_name}"
os.makedirs(output_folder, exist_ok=True)
print(f"\n📁 {sheet_name}")
print("**" * len((f"*✅ JSON file created:{sheet_name.replace(os.sep, '/')}*")))
for i in range(1, df.shape[1]):
column_name = df.columns[i] # Keep the name of the column with index i
column_name_simple = re.sub(r"^([a-zA-Z]+)-.*", r"\1", column_name)
# Generate the name of the file based on the name of the Excel sheet
json_filename = os.path.join(output_folder, f"{column_name_simple}.json")
# Convert the DataFrame to a dictionary with Column1 as the key and Column2 as the value
keys = df.select(df.columns[0]).to_series().to_list()
values = df.select(column_name).to_series().to_list()
df_to_json = dict(zip(keys, values))
with open(json_filename, 'w', encoding="utf-8") as json_file:
json.dump(df_to_json, json_file)
print(f"*✅ JSON file created: {json_filename.replace(os.sep, '/')} *")
print("*" * len((f"*✅ JSON file created: {json_filename.replace(os.sep, '/')} *")))
# --------------------------------END OF GENERATE JSON FILES--------------------------------#
end_time = time.perf_counter()
elapsed = end_time - start_time
print(f"\n Total execution time: {elapsed:.2f} seconds")