-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert-html.py
More file actions
72 lines (65 loc) · 2.69 KB
/
convert-html.py
File metadata and controls
72 lines (65 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import pypandoc
from fileCls import File
# Source/destination pair for an HTML -> PPTX conversion. Both names are
# reassigned a few lines below, so these two values are overwritten before
# anything reads them.
fileSrc = 'C:\\Users\\deborah.powers\\Desktop\\rgaa-certif\\rgaa diaporama.html'
fileDst = 'C:\\Users\\deborah.powers\\Desktop\\rgaa présentation conv.pptx'
# Alternative PDF input; not referenced anywhere else in this file.
fileSrcEpsiloon = 'C:\\Users\\LENOVO\\Desktop\\magasine epsiloon 54.pdf'
# Effective input (PDF) and output (HTML) paths: these are the values the
# converter functions below see when they read fileSrc / fileDst.
fileSrc = 'C:\\Users\\LENOVO\\Desktop\\okilele tapuscrit.pdf'
fileDst = 'C:\\Users\\LENOVO\\Desktop\\okilele tapuscrit.html'
# Runs at import time: wrap the source path in the project's File class
# (imported from fileCls) and call its fromPdf() method.
# NOTE(review): what fromPdf() reads or writes is not visible from this file.
fileTxt = File (fileSrc)
fileTxt.fromPdf()
def pdfToHtmlWex():
    """Convert the PDF at the module-level `fileSrc` to HTML with `pdf2htmlEX`.

    The command is passed to `subprocess.call` as an argument list instead of
    a concatenated shell string: `fileSrc` contains spaces, and with a shell
    string the path would be split into several arguments. Avoiding the shell
    also means no character of the path is interpreted as shell syntax.

    The exit status of the tool is not checked, and nothing is returned.

    Raises:
        FileNotFoundError: if the `pdf2htmlEX` executable cannot be found.
    """
    import subprocess

    # One list element per argument: the path reaches the tool untouched.
    subprocess.call(['pdf2htmlEX', fileSrc])
def pdfToHtmlWspirePdf():
    """Convert the PDF at `fileSrc` to an HTML file at `fileDst` via `PdfDocument`.

    NOTE(review): `PdfDocument` and `FileFormat` are not imported in this
    file. The imports recorded in the earlier version of this docstring were:

        from spire.filePdf.common import *
        from spire.pdf import *

    A star-import is only legal at module level, so these cannot simply be
    moved into the function body. Confirm the correct module path and import
    the two names explicitly before calling this function.

    Arguments given to `SetPdfToHtmlOptions`, in call order:

    useEmbeddedSvg (bool): whether to convert the PDF to an SVG image file
        and embed it in the HTML code.
    useEmbededImg (bool): whether to embed image data in the HTML file.
        Only effective when useEmbeddedSvg is false.
    maxPageOneFile (int): maximum number of pages in one HTML file.
        Only effective when useEmbeddedSvg is false.
    useHighQualityEmbeddedSvg (bool): whether to use high-quality SVG image
        embedding in the HTML. Only effective when useEmbeddedSvg is true.
    """
    pdf = PdfDocument()
    pdf.LoadFromFile(fileSrc)
    # Embedded SVG is requested (first argument), so according to the notes
    # above the second and third arguments have no effect for this call.
    pdf.ConvertOptions.SetPdfToHtmlOptions(True, True, 1, False)
    pdf.SaveToFile(fileDst, FileFormat.HTML)
    # Release the document once the HTML file has been written.
    pdf.Dispose()
def pdfToTextWcodecs(pdfPath=None):
    """Print the first 200 bytes of a file after an ISO-8859-1 -> UTF-8 round trip.

    The file is read as raw bytes, decoded with the first encoding of
    `encodingList` that works, re-encoded to UTF-8, and the first 200 bytes of
    that result are printed (as a `bytes` repr, e.g. ``b'%PDF-1.4...'``).

    Args:
        pdfPath: path of the file to read. When omitted, the module-level
            `fileSrc` is used. NOTE(review): the previous version read an
            undefined global `pdfPath`; falling back to `fileSrc` is assumed
            to be the intent -- confirm.

    Returns:
        None. The preview is only printed.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    import codecs

    if pdfPath is None:
        pdfPath = fileSrc
    # The context manager closes the handle even if reading fails.
    with open(pdfPath, 'rb') as textBrut:
        tmpByte = textBrut.read()

    # Both names are aliases of the same codec, which maps every byte value,
    # so the strict pass cannot fail; the lenient pass is kept as a safety net
    # should other encodings be added to this list.
    encodingList = ('ISO-8859-1', 'ISO8859-1')
    text = ""
    for errors in ('strict', 'ignore'):
        for encoding in encodingList:
            try:
                # Use the loop variable (it used to be ignored in favour of a
                # hard-coded 'ISO8859-1').
                decoded = codecs.decode(tmpByte, encoding=encoding, errors=errors)
            except UnicodeDecodeError:
                continue
            text = decoded.encode('utf-8', errors='ignore')
            break
        if text:
            break
    print(text[:200])
"""
output = pypandoc.convert_file (fileSrc, 'pptx', outputfile=fileDst)
output = pypandoc.convert_file (fileSrc, 'plain', outputfile=fileDst)
fileSrc = 'C:\\Users\\deborah.powers\\Desktop\\bdc.pdf'
fileDst = 'C:\\Users\\deborah.powers\\Desktop\\fichier.html'
output = pypandoc.convert_file (fileSrc, 'html', outputfile=fileDst)
fileSrc = 'C:\\Users\\deborah.powers\\Desktop\\Portail Outils de gestion de projet avec ANCT.odt'
fileDst = 'C:\\Users\\deborah.powers\\Desktop\\gestion de projet.html'
output = pypandoc.convert_file (fileSrc, 'html', outputfile=fileDst)
"""