-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtmlToText.py
More file actions
94 lines (89 loc) · 2.98 KB
/
htmlToText.py
File metadata and controls
94 lines (89 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# attention, l'ordre des sous-fonctions est important
from textFct import *
from htmlFct import *
import loggerFct as log
def fromTable(text):
    """Turn HTML table markup into tab/newline separated text.

    Closing cell and row tags are dropped (a closing header cell leaves a
    ':' behind), a row opening becomes a newline and every remaining cell
    opening becomes a tab.

    text: the HTML string to convert.
    Returns the converted string.
    """
    # Order matters: the combined '<tr><td>' / '<tr><th>' markers must be
    # consumed before the bare '<td>' / '<th>' ones.
    substitutions = (
        ('</td>', ""),
        ('</th>', ':'),
        ('</tr>', ""),
        ('<tr><td>', '\n'),
        ('<tr><th>', '\n'),
        ('<td>', '\t'),
        ('<th>', '\t'),
    )
    for marker, plain in substitutions:
        text = text.replace(marker, plain)
    return text
def _imageAttribute(tag, name):
    """Return the value of attribute `name` read from `tag`, or "" if absent.

    tag: the inside of an <img> tag, without the leading '<img ' and
        without the closing '>'.
    name: attribute name, e.g. 'src' or 'alt'.

    The character right after 'name=' is taken as the value delimiter,
    whatever it is. A value whose closing delimiter is missing runs to the
    end of the tag.
    """
    marker = name + '='
    pos = tag.find(marker)
    if pos < 0:
        return ""
    start = pos + len(marker) + 1  # skip 'name=' and the opening delimiter
    delimiter = tag[start - 1:start]
    end = tag.find(delimiter, start) if delimiter else -1
    if end < 0:
        # unterminated value: stop at the end of the tag instead of letting
        # a -1 index silently chop the last character
        end = len(tag)
    return tag[start:end]


def fromImage(text):
    """Replace every <img ...> tag by 'src (alt)' on a line of its own.

    The text is split on '<img '; each piece after the first starts with
    the attributes of one image. The tag is replaced by its src value,
    followed by ' (alt)' when an alt value exists, then a space and the
    text that followed the tag. Pieces are joined back with newlines.

    text: the HTML string to convert.
    Returns the converted string.

    Attributes are read from the tag only (up to its first '>'). A tag with
    no closing '>' is treated as running to the end of its piece.
    """
    chunks = text.split('<img ')
    # chunks[0] is the text before the first image and is left untouched
    for pos in range(1, len(chunks)):
        chunk = chunks[pos]
        tagEnd = chunk.find('>')
        if tagEnd < 0:
            # tag never closed: the whole piece is the tag, nothing follows
            tag, rest = chunk, ""
        else:
            tag, rest = chunk[:tagEnd], chunk[tagEnd + 1:]
        src = _imageAttribute(tag, 'src')
        alt = _imageAttribute(tag, 'alt')
        if alt:
            # inverse of the notation produced by htmlFromText
            src = src + ' (' + alt + ')'
        chunks[pos] = src + " " + rest
    return '\n'.join(chunks)
def fromLink(text):
    """Rewrite every `<a href='url'>title</a>` anchor as `url (title)`.

    The text is split on '</a>'; every piece but the last ends where an
    anchor was closed, so the anchor is the last '<a ' of the piece. The
    pieces are joined back with a space, then spaces touching a tag
    (' <' and '> ') are removed from the whole text.

    text: the HTML string to convert.
    Returns the converted string.

    Edge cases:
    - an empty or missing href yields the title alone;
    - a stray '</a>' with no opening '<a ' before it, or an opening tag
      that is never closed by '>', leaves the piece unchanged.
    """
    chunks = text.split('</a>')
    # the last chunk is what follows the final '</a>': it closes no anchor
    for pos in range(len(chunks) - 1):
        chunk = chunks[pos]
        start = chunk.rfind('<a ')
        if start < 0:
            continue  # stray '</a>'
        tagEnd = chunk.find('>', start)
        if tagEnd < 0:
            continue  # opening tag never closed
        link = ""
        # only look for href inside the opening tag, never in the title
        hrefPos = chunk.find('href=', start, tagEnd)
        if hrefPos >= 0:
            valueStart = hrefPos + 6  # skip 'href=' and the opening delimiter
            delimiter = chunk[valueStart - 1:valueStart]
            # search from valueStart itself so that an empty href='' is
            # closed by the very next character
            valueEnd = chunk.find(delimiter, valueStart) if delimiter else -1
            if valueEnd >= 0:
                # the tag ends at the first '>' after the href value
                closing = chunk.find('>', valueEnd)
                if closing >= 0:
                    link = chunk[valueStart:valueEnd]
                    tagEnd = closing
        title = chunk[tagEnd + 1:]
        if link:
            # inverse of the notation produced by htmlFromText
            title = link + ' (' + title + ')'
        chunks[pos] = chunk[:start] + title
    text = " ".join(chunks)
    text = text.replace(' <', '<')
    text = text.replace('> ', '>')
    return text
def fromHtml(text):
    """Convert an HTML string to this module's plain-text notation.

    The passes run in a fixed order and each works on the output of the
    previous one: cleanHtml, definition lists, links, container tags,
    tables, images, the tagHtml pairs, separators, leftover closing tags,
    line-level tags, inline tags, whitespace around newlines, then shape.

    cleanHtml, shape, listTagsContainer, tagHtml and listTagsIntern are
    defined outside this block; their content is not visible here.

    text: the HTML string to convert.
    Returns the converted string.
    """
    # tags replaced by a fixed piece of text
    blankTags = (
        ('<hr/>', '\n**\n'), ('<hr>', '\n**\n'),
        ('<br>', '\n'), ('<br/>', '\n'),
        ('<dt>', '\n'),
        ('<xmp>', '\nCode\n'), ('</xmp>', '\n/\n'),
        ('<figure>', '\nFigure\n'), ('</figure>', '\n/\n'),
    )
    # tags whose closing form is simply dropped
    closingOnly = ('li', 'dd', 'tr', 'th', 'td', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # tags whose opening and closing forms both become a newline
    lineTags = ('p', 'caption', 'figcaption')

    text = cleanHtml(text)
    text = fromLink(text.replace('</dt><dd>', ': '))

    # containers: both tags vanish, the content stays
    for name in listTagsContainer:
        text = text.replace('</' + name + '>', "").replace('<' + name + '>', "")
    text = fromImage(fromTable(text))

    # tags
    for html, perso in tagHtml:
        text = text.replace(html.strip(), perso)
    for html, perso in blankTags:
        text = text.replace(html, perso)
    for name in closingOnly:
        text = text.replace('</' + name + '>', "")

    # lines: adjacent paragraphs share a single newline
    text = text.replace('</p><p>', '\n')
    for name in lineTags:
        text = text.replace('</' + name + '>', '\n').replace('<' + name + '>', '\n')

    # sentences: inline tags become a space
    for name in listTagsIntern:
        text = text.replace('</' + name + '>', " ").replace('<' + name + '>', " ")

    # drop the space left on either side of a newline
    text = text.replace(' \n', '\n').replace('\n ', '\n')
    return shape(text)