python-modules/htmlToText.py at master · deborah-powers/python-modules · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# caution: the order in which the sub-functions are applied matters
from textFct import *
from htmlFct import *
import loggerFct as log

def fromTable (text):
	"""Turn table markup into plain text: one row per line, cells separated by tabs.

	text: html string in which the cell and row tags carry no attributes.
	Returns the string with closing </td> and </tr> removed, </th> turned into ':',
	a row opening ('<tr><td>' or '<tr><th>') turned into a newline and every other
	<td> / <th> turned into a tab. Any other markup is left as it is.
	"""
	# applied strictly in this order: the '<tr><td>' and '<tr><th>' pairs have to be
	# consumed before the bare '<td>' and '<th>' are turned into tabs
	substitutions =(
		('</td>', ""),
		('</th>', ':'),
		('</tr>', ""),
		('<tr><td>', '\n'),
		('<tr><th>', '\n'),
		('<td>', '\t'),
		('<th>', '\t'),
	)
	for markup, plain in substitutions:
		text = text.replace (markup, plain)
	return text

def _imageAttribute (tag, name):
	"""Return the value of the attribute `name` inside `tag`, or "" when it is absent.

	tag: the inside of an <img> tag, without the leading '<img ' and the closing '>'.
	name: attribute name, for instance 'src' or 'alt'.
	Quoted values (single or double quotes) are returned without their quotes,
	unquoted values run up to the next whitespace.
	"""
	marker = name +'='
	pos = tag.find (marker)
	while pos >=0:
		valueStart = pos + len (marker)
		# a real attribute name starts the tag or follows whitespace; this skips
		# 'data-src=' as well as text such as 'salt=' inside another value
		if pos ==0 or tag[pos -1].isspace():
			value = tag[valueStart:]
			if value[:1] in ('"', "'"):
				end = value.find (value[0], 1)
				# unterminated quote: take the rest of the tag instead of slicing with -1
				if end <0: end = len (value)
				return value[1:end]
			end =0
			while end < len (value) and not value[end].isspace(): end +=1
			return value[:end]
		pos = tag.find (marker, valueStart)
	return ""

def fromImage (text):
	"""Replace every <img ...> tag by its source, followed by its alt text in parentheses.

	text: html string.
	Returns the text in which each image tag became 'src (alt) ' (or 'src ' when
	there is no alt text), each image starting on a new line. Text without any
	'<img ' is returned unchanged.
	"""
	textList = text.split ('<img ')
	# textList[0] is the text before the first image, every later piece starts inside a tag
	for t in range (1, len (textList)):
		piece = textList[t]
		f = piece.find ('>')
		# a tag that is never closed takes the whole piece, no trailing text is left
		if f <0: f = len (piece)
		tag = piece[:f]
		src = _imageAttribute (tag, 'src')
		alt = _imageAttribute (tag, 'alt')
		# per the original note, this is the reverse of what htmlFromText does
		if alt: src = src +' ('+ alt +')'
		textList[t] = src +" "+ piece[f+1:]
	return '\n'.join (textList)

def _splitAnchor (segment, opening):
	"""Parse the opening <a> tag that starts at index `opening` of `segment`.

	Returns (link, titleStart): the href value ("" when the tag has none) and the
	index of the first character after the '>' closing the tag. titleStart is -1
	when the tag is never closed.
	"""
	closing = segment.find ('>', opening)
	if closing <0: return "", -1
	# only look for href inside the tag itself, and only as a whole attribute name
	hrefPos = segment.find ('href=', opening, closing)
	while hrefPos >=0 and not segment[hrefPos -1].isspace():
		hrefPos = segment.find ('href=', hrefPos +5, closing)
	if hrefPos <0: return "", closing +1
	valueStart = hrefPos +5
	quote = segment[valueStart:valueStart +1]
	if quote in ('"', "'"):
		valueStart +=1
		# search from the first character of the value, so that href="" is handled
		valueEnd = segment.find (quote, valueStart)
		# a quoted value may itself contain '>': the tag really closes after the closing quote
		quotedClosing = -1
		if valueEnd >=0: quotedClosing = segment.find ('>', valueEnd)
		if quotedClosing >=0: return segment[valueStart:valueEnd], quotedClosing +1
		# unterminated quote: fall back on the first '>' found
		return segment[valueStart:closing], closing +1
	# unquoted value: it runs up to the next whitespace or to the end of the tag
	valueEnd = valueStart
	while valueEnd < closing and not segment[valueEnd].isspace(): valueEnd +=1
	return segment[valueStart:valueEnd], closing +1

def fromLink (text):
	"""Replace every <a ...>title</a> by 'link (title)'.

	text: html string.
	Returns the text in which each anchor became its href followed by its title in
	parentheses. An anchor without href (or with an empty one) is reduced to its
	title. A '</a>' that has no opening tag before it is simply dropped. Each
	'</a>' is replaced by a space, then the spaces touching a tag are removed.
	"""
	textList = text.split ('</a>')
	# every piece but the last one ends where an anchor was closed
	for t in range (len (textList) -1):
		segment = textList[t]
		opening = max (segment.rfind ('<a '), segment.rfind ('<a>'))
		# stray '</a>': nothing to rewrite in this piece
		if opening <0: continue
		link, titleStart = _splitAnchor (segment, opening)
		# opening tag never closed: leave the piece as it is rather than guess
		if titleStart <0: continue
		title = segment[titleStart:]
		# per the original note, this is the reverse of what htmlFromText does
		if link: textList[t] = segment[:opening] + link +' ('+ title +')'
		else: textList[t] = segment[:opening] + title
	text = " ".join (textList)
	text = text.replace (' <','<')
	text = text.replace ('> ','>')
	return text

def fromHtml (text):
	"""Convert an html string into the plain-text notation of this module.

	text: html string.
	Returns the converted text, after a final pass through shape().
	The steps below are order-dependent: links, tables and images are rewritten
	before the generic tag replacements run.
	NOTE(review): cleanHtml, shape, tagHtml, listTagsContainer and listTagsIntern are
	not defined in this file, presumably they come from the textFct / htmlFct
	star-imports - confirm.
	"""
	text = cleanHtml (text)
	text = text.replace ('</dt><dd>', ': ')
	text = fromLink (text)
	# containers: both the closing and the opening form are dropped
	for container in listTagsContainer:
		for markup in ('</'+ container +'>', '<'+ container +'>'):
			text = text.replace (markup, "")
	text = fromTable (text)
	text = fromImage (text)
	# tags that have a counterpart in the plain-text notation
	for html, perso in tagHtml:
		text = text.replace (html.strip(), perso)
	# standalone tags and block delimiters
	blankTags =(
		('<hr/>', '\n**\n'), ('<hr>', '\n**\n'),
		('<br>', '\n'), ('<br/>', '\n'),
		('<dt>', '\n'),
		('<xmp>', '\nCode\n'), ('</xmp>', '\n/\n'),
		('<figure>', '\nFigure\n'), ('</figure>', '\n/\n'),
	)
	for markup, plain in blankTags:
		text = text.replace (markup, plain)
	# closing tags that leave nothing behind
	for name in ('li', 'dd', 'tr', 'th', 'td', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
		text = text.replace ('</'+ name +'>', "")
	# lines: adjacent paragraphs share a single newline, every remaining
	# paragraph or caption boundary becomes a newline of its own
	text = text.replace ('</p><p>', '\n')
	for name in ('p', 'caption', 'figcaption'):
		for markup in ('</'+ name +'>', '<'+ name +'>'):
			text = text.replace (markup, '\n')
	# sentences: inline tags are replaced by a space
	for name in listTagsIntern:
		for markup in ('</'+ name +'>', '<'+ name +'>'):
			text = text.replace (markup, " ")
	# no space left around a line break
	text = text.replace (' \n', '\n')
	text = text.replace ('\n ', '\n')
	return shape (text)