-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpostsxml_sql_extract.py
More file actions
32 lines (27 loc) · 1.12 KB
/
postsxml_sql_extract.py
File metadata and controls
32 lines (27 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
import html
import os
import sys
# Matches one <code>...</code> element that sits on a single line and whose
# content starts with SELECT and later contains FROM. Matching is
# case-sensitive and non-greedy, and '.' does not cross a line break.
SELECT_FROM_PATTERN = re.compile(r'<code>SELECT\s.*?FROM\s.*?</code>')

# NOTE(review): in the reviewed copy of this file the two string literals that
# use this token were split across physical lines (a syntax error), which looks
# like an extraction artifact. They are reconstructed here as a newline
# character. The original token may instead have been an escaped line break
# such as the '&#xA;' entity -- confirm against the upstream file and adjust
# this single constant if so.
LINE_BREAK = '\n'

# A SQL '--' comment: everything from '--' up to and including the next
# LINE_BREAK token.
SQL_LINE_COMMENT_PATTERN = re.compile('--.*?' + re.escape(LINE_BREAK))

# A running total is printed every time this many commands have been written.
PROGRESS_INTERVAL = 10000

DEFAULT_OUTPUT_PATH = 'sqlcommands.txt'


def extract_sql_commands(line):
    """Return the cleaned SELECT ... FROM snippets found in *line*.

    Each ``<code>SELECT ... FROM ...</code>`` match has its ``--`` comments
    removed, remaining LINE_BREAK tokens replaced by spaces, HTML entities
    decoded, and the surrounding ``<code>`` tags stripped.

    Args:
        line: One line of input text.

    Returns:
        A list of cleaned SQL strings, in the order they appear in *line*;
        empty when nothing matches.
    """
    commands = []
    for match in SELECT_FROM_PATTERN.findall(line):
        without_comments = SQL_LINE_COMMENT_PATTERN.sub('', match)
        flattened = without_comments.replace(LINE_BREAK, ' ')
        # Unescape twice so doubly-encoded entities ('&amp;lt;' -> '&lt;'
        # -> '<') are fully decoded.
        decoded = html.unescape(html.unescape(flattened))
        commands.append(decoded.replace('<code>', '').replace('</code>', ''))
    return commands


def main(argv=None, output_path=DEFAULT_OUTPUT_PATH):
    """Extract SQL snippets from the file named in *argv* into *output_path*.

    Args:
        argv: Argument vector; ``argv[1]`` is the input path. Defaults to
            ``sys.argv``.
        output_path: File that receives one cleaned command per line. It is
            created or truncated.

    Returns:
        Process exit status: 0 on success, 1 when the path argument is
        missing or does not exist.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print("Please provide a path as a command line argument.",
              file=sys.stderr)
        return 1

    path = argv[1]
    if not os.path.exists(path):
        print(f"The path '{path}' does not exist.", file=sys.stderr)
        return 1

    counter = 0
    # Decoding happens while iterating the file, so a try/except around the
    # per-line work can never see a UnicodeDecodeError. errors='replace' keeps
    # the run going past undecodable bytes instead of aborting with a
    # traceback; output for valid UTF-8 input is unaffected.
    with open(path, 'r', encoding='utf-8', errors='replace') as f, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in f:
            for command in extract_sql_commands(line):
                counter += 1
                if counter % PROGRESS_INTERVAL == 0:
                    print(counter)
                outfile.write(command + '\n')
    return 0


if __name__ == "__main__":
    sys.exit(main())