-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfix_hebrew_truncation.py
More file actions
286 lines (233 loc) · 10.7 KB
/
fix_hebrew_truncation.py
File metadata and controls
286 lines (233 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/env python3
"""
Fix Hebrew Text Truncation in COMPLETE_TRANSIT_PIPELINE.ipynb
This script adds a fix_truncated_hebrew() function that repairs truncated Hebrew text
in the area and location columns.
Issue: Hebrew text is being truncated, losing the final letter:
- 'גלעין' becomes 'גלעי'
- 'טבעת פנימית' becomes 'טבעת פנימי'
"""
import json
import re
# Source text of the two helper functions that main() splices into the
# notebook cell, directly before ``def check_hebrew_truncation(``.
#
# This is a normal (non-raw) string, so every backslash meant for the
# notebook is doubled here: ``\\b`` below becomes ``\b`` in the inserted code.
#
# NOTE(review): the template is written with a four-space base indent, on the
# assumption that the target cell defines its helpers one level deep --
# confirm against the notebook. The nesting *inside* the template must be
# kept intact, otherwise the inserted code is not valid Python.
FIX_FUNCTION = '''    def fix_truncated_hebrew(text):
        """
        Fix truncated Hebrew text by restoring the missing final letter.

        Common truncations:
        - 'גלעי' -> 'גלעין' (Core)
        - 'טבעת פנימי' -> 'טבעת פנימית' (Inner Ring)
        - 'טבעת חיצוני' -> 'טבעת חיצונית' (Outer Ring)
        - 'טבעת תיכונ' -> 'טבעת תיכונה' (Middle Ring)
        - Any word ending with 'י' that should end with 'ית' (feminine adjective)

        Args:
            text: Hebrew text string that may be truncated

        Returns:
            Fixed Hebrew text with proper final letters
        """
        if not isinstance(text, str) or not text:
            return text

        # Known truncation patterns and their fixes
        # Format: (truncated_pattern, correct_replacement)
        fixes = [
            # Exact matches (most specific)
            (r'^גלעי$', 'גלעין'), # Core (exact word)
            (r'^טבעת פנימי$', 'טבעת פנימית'), # Inner Ring
            (r'^טבעת חיצוני$', 'טבעת חיצונית'), # Outer Ring
            (r'^טבעת תיכונ$', 'טבעת תיכונה'), # Middle Ring
            (r'^טבע$', 'טבעת'), # Ring (if severely truncated)
            # Word-level fixes (match within larger strings)
            (r'\\bגלעי\\b', 'גלעין'), # Core (as word)
            (r'\\bטבעת פנימי\\b', 'טבעת פנימית'), # Inner Ring (as phrase)
            (r'\\bטבעת חיצוני\\b', 'טבעת חיצונית'), # Outer Ring (as phrase)
            (r'\\bטבעת תיכונ\\b', 'טבעת תיכונה'), # Middle Ring (as phrase)
            # General pattern: Hebrew word ending with 'י' should end with 'ית' (feminine)
            # Only apply if the word is likely an adjective (e.g., after 'טבעת')
            (r'(טבעת\\s+\\S*?)י(?=\\s|$)', r'\\1ית'), # Any adjective after טבעת ending in י
        ]

        fixed_text = text.strip()

        # Apply each fix pattern
        for pattern, replacement in fixes:
            fixed_text = re.sub(pattern, replacement, fixed_text)

        # If text changed, log it
        if fixed_text != text.strip():
            # This will be visible during notebook execution
            pass # Logging will be done by caller

        return fixed_text

    def fix_truncated_hebrew_in_gdf(gdf, columns):
        """
        Apply Hebrew text fixes to specified columns in a GeoDataFrame.

        Args:
            gdf: GeoDataFrame to fix
            columns: List of column names to fix

        Returns:
            Number of values fixed
        """
        fixes_count = 0
        for col in columns:
            if col not in gdf.columns:
                continue
            # Apply fix to non-null string values
            original_values = gdf[col].copy()
            gdf[col] = gdf[col].apply(
                lambda x: fix_truncated_hebrew(x) if pd.notna(x) and isinstance(x, str) else x
            )
            # Count how many were fixed
            changed = (original_values != gdf[col]) & original_values.notna()
            if changed.any():
                fixes_count += changed.sum()
                print(f" ✓ Fixed {changed.sum()} truncated values in column '{col}'")
                # Show examples of fixes
                for idx in gdf[changed].index[:3]: # Show first 3 examples
                    old_val = original_values.loc[idx]
                    new_val = gdf.loc[idx, col]
                    print(f" '{old_val}' -> '{new_val}'")
        return fixes_count
'''
# Source text of the "apply" step that main() splices into the notebook cell
# ahead of the "✓ Step 2.3 complete!" line. It expects ``gdf_demand``, ``pd``
# and ``fix_truncated_hebrew`` to exist in the notebook at that point.
#
# main() uses the phrase "Fixing any truncated Hebrew text" from the first
# print() below to detect that this snippet is already present, so keep that
# message unchanged.
#
# This is a normal (non-raw) string: ``\\n`` below becomes ``\n`` in the
# inserted code.
#
# NOTE(review): the four-space base indent is an assumption about how deep the
# target cell's statements sit -- confirm against the notebook. The nesting
# inside the snippet must be kept intact for it to be valid Python.
APPLY_FIX_CODE = '''
    # ============================================================================
    # FIX: Repair truncated Hebrew text in area and location columns
    # ============================================================================
    print("\\n Fixing any truncated Hebrew text...")

    # Fix area column (string values)
    if 'area' in gdf_demand.columns:
        original_areas = gdf_demand['area'].copy()
        gdf_demand['area'] = gdf_demand['area'].apply(
            lambda x: fix_truncated_hebrew(x) if pd.notna(x) else x
        )
        area_fixed = ((original_areas != gdf_demand['area']) & original_areas.notna()).sum()
        if area_fixed > 0:
            print(f" ✓ Fixed {area_fixed} truncated values in 'area' column")
            # Show examples
            for idx in gdf_demand[(original_areas != gdf_demand['area']) & original_areas.notna()].index[:3]:
                print(f" '{original_areas.loc[idx]}' -> '{gdf_demand.loc[idx, 'area']}'")

    # Fix location column (list values - fix each element)
    if 'location' in gdf_demand.columns:
        location_fixed = 0
        for idx in gdf_demand.index:
            loc_val = gdf_demand.loc[idx, 'location']
            if isinstance(loc_val, list):
                fixed_loc = [fix_truncated_hebrew(item) if isinstance(item, str) else item
                             for item in loc_val]
                if fixed_loc != loc_val:
                    location_fixed += 1
                    if location_fixed <= 3: # Show first 3 examples
                        print(f" '{loc_val}' -> '{fixed_loc}'")
                    gdf_demand.at[idx, 'location'] = fixed_loc
        if location_fixed > 0:
            print(f" ✓ Fixed {location_fixed} truncated values in 'location' column")

    print(" ✓ Hebrew text fix complete")
'''
def _leading_whitespace(line):
    """Return the whitespace prefix of *line* ('' for an unindented line)."""
    return line[:len(line) - len(line.lstrip())]


def _strip_function_defs(lines, names):
    """Return *lines* without the definitions of the functions in *names*.

    A definition runs from its ``def name(`` line up to (not including) the
    next non-blank line indented no deeper than that ``def``. Blank lines
    inside or directly after a removed definition are dropped with it.
    """
    prefixes = tuple(f'def {name}(' for name in names)
    kept = []
    skip_indent = None  # indent width of the def being removed, if any
    for line in lines:
        stripped = line.strip()
        indent = len(_leading_whitespace(line))
        if skip_indent is not None:
            if not stripped or indent > skip_indent:
                continue  # still inside (or trailing) the removed function
            skip_indent = None
        if stripped.startswith(prefixes):
            skip_indent = indent
            continue
        kept.append(line)
    return kept


def _reindent(template, anchor_line):
    """Return *template* split into lines, indented like *anchor_line*.

    The template's own common indentation is removed first, so the inserted
    code lines up with the statement it is placed in front of regardless of
    how the template string happens to be indented.
    """
    import textwrap  # local import: only this helper needs it

    prefix = _leading_whitespace(anchor_line)
    return textwrap.indent(textwrap.dedent(template), prefix).split('\n')


def _find_apply_index(lines):
    """Return the index of the line to insert the apply step before, or None."""
    for i, line in enumerate(lines):
        if '✓ Step 2.3 complete!' in line:
            return i
    print("WARNING: Could not find '✓ Step 2.3 complete!' - searching for alternative insertion point")
    # Fallback: the diagnostic header that follows the summary section.
    for i, line in enumerate(lines):
        if 'Sample data verification:' in line:
            for j in range(i, len(lines)):
                if 'DIAGNOSTIC - After Step 2.3:' in lines[j]:
                    return j
            break  # only the first summary section is considered
    return None


def main():
    """
    Patch COMPLETE_TRANSIT_PIPELINE.ipynb with the Hebrew truncation fix.

    Steps:
    1. Load the notebook from the current working directory.
    2. Pick the first code cell containing both 'def check_hebrew_truncation'
       and 'Step 2.3'.
    3. Remove any earlier copy of fix_truncated_hebrew() and
       fix_truncated_hebrew_in_gdf(), then insert FIX_FUNCTION directly
       before 'def check_hebrew_truncation('.
    4. Unless the cell already contains it, insert APPLY_FIX_CODE before the
       '✓ Step 2.3 complete!' line (or the diagnostic fallback line).
    5. Write the notebook back in place.

    Running the script again on an already patched notebook leaves the cell
    unchanged.

    Returns:
        True if the notebook was updated and saved, False if it could not be
        loaded or a required anchor line was not found (an ERROR message is
        printed in that case and the file is left untouched).
    """
    notebook_path = 'COMPLETE_TRANSIT_PIPELINE.ipynb'
    helper_names = ('fix_truncated_hebrew', 'fix_truncated_hebrew_in_gdf')

    print("Loading COMPLETE_TRANSIT_PIPELINE.ipynb...")
    try:
        with open(notebook_path, 'r', encoding='utf-8') as f:
            nb = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"ERROR: Could not load {notebook_path}: {exc}")
        return False
    print(f"Notebook has {len(nb['cells'])} cells")

    # Find Cell 30 (the one with helper functions and tagging logic)
    target_cell_idx = None
    for i, cell in enumerate(nb['cells']):
        if cell.get('cell_type') != 'code':
            continue
        source = ''.join(cell.get('source', []))
        if 'def check_hebrew_truncation' in source and 'Step 2.3' in source:
            target_cell_idx = i
            print(f"Found target cell at index {i}")
            break
    if target_cell_idx is None:
        print("ERROR: Could not find Cell 30 with helper functions")
        return False

    lines = ''.join(nb['cells'][target_cell_idx]['source']).split('\n')

    # Drop both helpers from a previous run so they are replaced, not duplicated.
    remaining = _strip_function_defs(lines, helper_names)
    if len(remaining) != len(lines):
        print("Fix function already exists - updating it...")
        lines = remaining

    # Find where to insert the fix function (before check_hebrew_truncation)
    insert_idx = next(
        (i for i, line in enumerate(lines) if 'def check_hebrew_truncation(' in line),
        None,
    )
    if insert_idx is None:
        print("ERROR: Could not find insertion point")
        return False
    print(f"Will insert fix function before line {insert_idx}")
    lines[insert_idx:insert_idx] = _reindent(FIX_FUNCTION, lines[insert_idx])

    # Apply step: look for an insertion point only if it is not there already.
    if any('Fixing any truncated Hebrew text' in line for line in lines):
        print("Fix application code already exists")
    else:
        apply_idx = _find_apply_index(lines)
        if apply_idx is None:
            print("ERROR: Could not find where to apply the fix")
            return False
        print(f"Will insert fix application before line {apply_idx}")
        lines[apply_idx:apply_idx] = _reindent(APPLY_FIX_CODE, lines[apply_idx])
        print("Inserted fix application code")

    # Jupyter stores a cell as a list of lines, each ending with \n except
    # the last one.
    nb['cells'][target_cell_idx]['source'] = (
        [line + '\n' for line in lines[:-1]] + lines[-1:]
    )

    # Save the notebook
    print("Saving modified notebook...")
    with open(notebook_path, 'w', encoding='utf-8') as f:
        json.dump(nb, f, ensure_ascii=False, indent=1)

    print("✓ Successfully updated COMPLETE_TRANSIT_PIPELINE.ipynb")
    print("\nChanges made:")
    print(" 1. Added fix_truncated_hebrew() function")
    print(" 2. Added fix_truncated_hebrew_in_gdf() function")
    print(" 3. Applied fix to area and location columns after tagging")
    print("\nThe fix will:")
    print(" - Repair 'גלעי' -> 'גלעין'")
    print(" - Repair 'טבעת פנימי' -> 'טבעת פנימית'")
    print(" - Repair other similar truncations")
    return True
# Script entry point: run the patch and report the outcome to the shell
# (exit status 0 when main() returns a truthy value, 1 otherwise).
if __name__ == '__main__':
    import sys

    exit_code = 0 if main() else 1
    sys.exit(exit_code)