python-modules/htmlFct.py at master · deborah-powers/python-modules · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# attention, l'ordre des sous-fonctions est important
from os import sep
from urllib import request as urlRequest
from PIL import Image
from io import BytesIO
import base64
from fileLocal import pathRoot
from textFct import *
import loggerFct as log

# tag names grouped as "containers" (presumably block-level wrappers; confirm against callers)
listTagsContainer =(
	'ul', 'ol', 'dl', 'table',
	'nav', 'div', 'fieldset', 'form',
	'figure', 'math', 'section', 'article',
	'body', 'header', 'footer', 'main',
)
# inline / inner tag names; cleanHtmlForWritting pads them with spaces
listTagsIntern =(
	'i', 'b', 'em', 'span', 'strong',
	'thead', 'tbody',
)
# pairs of (html fragment, plain-text marker)
# presumably used to translate between html and the project's text markup; confirm against callers
tagHtml =(
	# headings
	('\n<h1>', '\n== '),
	('\n<h2>', '\n** '),
	('\n<h3>', '\n-- '),
	('\n<h4>', '\n__ '),
	('\n<h5>', '\n## '),
	('\n<h6>', '\n++ '),
	# horizontal rules
	("\n<hr class='h1'/>\n", '\n\n==\n\n'),
	("\n<hr class='h2'/>\n", '\n\n**\n\n'),
	("\n<hr class='h3'/>\n", '\n\n--\n\n'),
	("\n<hr>\n", '\n\n**\n\n'),
	("\n<hr/>\n", '\n\n**\n\n'),
	# figures, code and list items
	('\n<figure>', '\nFig\n'),
	('\n<xmp>!', '\nCode\n'),
	('\n<xmp>: ', '\nCode: '),
	('\n<li>', '\n\t'),
)

def findTitleFromUrl (link):
	""" Build a human-readable title from the last component of an url or file path.

	link: the url or path, with '/' or '\\' as separator.
	Returns the last path component, without its extension (when the
	extension, dot included, is shorter than 11 characters), without 'www.',
	and with the usual url separators replaced by single spaces.
	An empty link gives an empty title.
	"""
	# an empty link has no last character to inspect
	if not link: return ""
	# drop one trailing separator so that the last component is not empty
	if link.endswith (('/', '\\')): link = link[:-1]
	# keep what follows the last separator; '/' takes precedence over '\\'
	pos =0
	if '/' in link: pos =1+ link.rfind ('/')
	elif '\\' in link: pos =1+ link.rfind ('\\')
	title = link[pos:]
	# remove a short trailing extension such as .html or .jpg
	if '.' in title:
		pos = title.rfind ('.')
		if len (title) -pos <11: title = title[:pos]
	title = title.replace ('www.',"")
	# turn the url punctuation into spaces, then squeeze repeated spaces
	urlWords =( '-', '_', '.', '?', '#', '%20' )
	for word in urlWords: title = title.replace (word, " ")
	while "  " in title: title = title.replace ("  "," ")
	return title

def imageFromBase64One (imgStr):
	""" Decode a base64 string and open the decoded bytes as a PIL image.

	imgStr: the base64 payload, passed as is to base64.b64decode.
	Returns the object produced by Image.open on an in-memory buffer.
	"""
	rawBytes = base64.b64decode (imgStr)
	return Image.open (BytesIO (rawBytes))

def imgToB64Local (imgPath):
	""" Convert a local image file to a base64 data uri.

	imgPath: path of the image; its last three characters are used both as
	the save format ('jpg' is saved as 'jpeg') and in the uri's media type.
	Returns 'data:image/<ext>;base64,<payload>', or "" when pathRoot is not
	part of the path (pathRoot comes from fileLocal; presumably the folder
	the script is allowed to read, to be confirmed).
	The image is converted to RGB before being re-encoded.
	"""
	if sep == '\\':
		# on windows, normalise the separators and the case of the known prefixes
		imgPath = imgPath.replace ('/', sep)
		imgPath = imgPath.replace ('users\\lenovo', 'Users\\LENOVO')
		imgPath = imgPath.replace ('c:\\', 'C:\\')
	if pathRoot not in imgPath: return ""
	extension = imgPath[-3:]
	# the context manager releases the file handle once the pixels have been copied
	with Image.open (imgPath) as imageOriginal:
		imageRgb = imageOriginal.convert ('RGB')
	buff = BytesIO()
	if extension == 'jpg': imageRgb.save (buff, format='jpeg')
	else: imageRgb.save (buff, format=extension)
	# base64 output is pure ascii, so decoding gives the same text as stripping the b'...' repr
	imgStr = base64.b64encode (buff.getvalue()).decode ('ascii')
	return 'data:image/' + extension + ';base64,' + imgStr

def imgToB64Web (imgPath):
	""" Download an image and return it as a base64 data uri.

	imgPath: the url of the image; its last three characters are used as the
	image subtype of the uri's media type.
	Returns 'data:image/<ext>;base64,<payload>'.
	Errors raised by urlopen are not caught here.
	"""
	# a browser-like user agent is sent with the request
	myRequest = urlRequest.Request (imgPath, headers={ 'User-Agent': 'Mozilla/5.0' })
	# the context manager closes the connection as soon as the body has been read
	with urlRequest.urlopen (myRequest) as response:
		tmpByte = response.read()
	# base64 output is pure ascii, so decoding gives the same text as stripping the b'...' repr
	imgStr = base64.b64encode (tmpByte).decode ('ascii')
	return 'data:image/' + imgPath[-3:] + ';base64,' + imgStr

def imgToB64 (text):
	""" Replace the image sources found in an html text by base64 data uris.

	text: the html text. Every quoted src= value ending with .bmp, .png, .gif
	or .jpg is converted, with imgToB64Web when it starts with 'http' and with
	imgToB64Local otherwise. The other src= values are left untouched, as are
	values without a closing quote and a src= at the very end of the text.
	Returns the modified text.
	"""
	if 'src=' not in text: return text
	imgExtensions =( '.bmp', '.png', '.gif', '.jpg' )
	textList = text.split ('src=')
	for t in range (1, len (textList)):
		segment = textList[t]
		# the text ended with src=, there is no value to read
		if not segment: continue
		# the first character is the opening quote, look for the matching closing one
		quote = segment[0]
		f= segment.find (quote, 2)
		if f<0: continue
		imgStr = segment[1:f]
		# compare whole extensions: a short value such as src="x" must not pass as an image
		if not imgStr.endswith (imgExtensions): continue
		# NOTE(review): imgToB64Local returns "" for a path outside pathRoot, which empties the src here; confirm this is intended
		if imgStr.startswith ('http'): imgStr = imgToB64Web (imgStr)
		else: imgStr = imgToB64Local (imgStr)
		textList[t] = quote + imgStr + segment[f:]
	return 'src='.join (textList)

def imgToB64One (imageName):
	""" Convert one image file to a base64 data uri.

	imageName: path of the image; its last three characters are used both as
	the save format ('jpg' is saved as 'jpeg') and in the uri's media type.
	Returns 'data:image/<ext>;base64,<payload>'.
	The image is converted to RGB before being re-encoded.
	"""
	extension = imageName[-3:]
	# the context manager releases the file handle once the pixels have been copied
	with Image.open (imageName) as imageOriginal:
		imageRgb = imageOriginal.convert ('RGB')
	buff = BytesIO()
	if extension == 'jpg': imageRgb.save (buff, format='jpeg')
	else: imageRgb.save (buff, format=extension)
	# base64 output is pure ascii, so decoding gives the same text as stripping the b'...' repr
	imgStr = base64.b64encode (buff.getvalue()).decode ('ascii')
	return 'data:image/' + extension + ';base64,' + imgStr

def imgToB64_va (text):
	""" Replace the local image sources found in an html text by base64 data uris.

	text: the html text. A quoted src= value is converted with imgToB64One when
	it ends with .bmp, .png, .gif or .jpg and contains pathRoot once its '/'
	are replaced by the os separator. Sources starting with 'http', values
	without a closing quote and a src= at the very end of the text are left
	untouched.
	Returns the modified text.
	"""
	if 'src=' not in text: return text
	imgExtensions =( '.bmp', '.png', '.gif', '.jpg' )
	textList = text.split ('src=')
	for t in range (1, len (textList)):
		segment = textList[t]
		# the text ended with src=, there is no value to read
		if not segment: continue
		# web images are skipped here rather than masked as scr=, so a literal scr= in the text is never rewritten
		if segment[1:5] == 'http': continue
		# the first character is the opening quote, look for the matching closing one
		f= segment.find (segment[0], 2)
		if f<0: continue
		imageName = segment[1:f]
		# compare whole extensions: a short value such as src="x" must not pass as an image
		if not imageName.endswith (imgExtensions): continue
		imageName = imageName.replace ('/', sep)
		if pathRoot not in imageName: continue
		imgStr = imgToB64One (imageName)
		textList[t] = segment[0] + imgStr + segment[f:]
	return 'src='.join (textList)

def cleanHtmlForWritting (text):
	""" Clean an html text and normalise the spaces around its inline tags.

	text: the html text. It is first passed to cleanBasic when it contains
	'</xmp>', to cleanHtml otherwise (both come from textFct).
	Returns the text with a space put before the opening and after the closing
	inline tags, repeated spaces squeezed, the space between two tags and
	before '.', ',' and ')' or after '(' removed, and a space restored between
	a closing inline tag and the inline tag that directly follows it.
	"""
	cleaner = cleanBasic if '</xmp>' in text else cleanHtml
	text = cleaner (text)
	# detach the inline tags from the neighbouring words
	for tagName in listTagsIntern:
		opening = '<'+ tagName +'>'
		closing = '</'+ tagName +'>'
		text = text.replace (opening, ' '+ opening)
		text = text.replace (closing, closing +' ')
	text = text.replace ('</a>', '</a> ').replace ("<a ", " <a ")
	# squeeze the spaces, then remove the ones that became useless
	while "  " in text: text = text.replace ("  ", " ")
	text = text.replace ('> <', '><')
	for mark in ( '.', ',', ')' ): text = text.replace (" "+ mark, mark)
	text = text.replace ("( ", '(')
	# two adjacent inline elements keep one space between them
	inlineTags =( 'i', 'b', 'em', 'span', 'strong', 'a')
	for closingName in inlineTags:
		for openingName in inlineTags:
			glued = '</'+ closingName + '><'+ openingName +'>'
			spaced = '</'+ closingName + '> <'+ openingName +'>'
			text = text.replace (glued, spaced)
	return text