-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocx.go
More file actions
108 lines (82 loc) · 2.55 KB
/
docx.go
File metadata and controls
108 lines (82 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
package textract
import (
	"encoding/xml"
	"strings"
)
/***************************************************************************
XML structure based on public information and reverse engineering
***************************************************************************/
// Not sure if the XML structure is really as simple as below, but based on
// limited reverse-engineering done so far, this appears to be the case.
// Will need to enhance testing over time to confirm.
// Docx_Doc is the xml.Unmarshal target for the root element of the XML
// handled by docXML2Text.
type Docx_Doc struct {
// XMLName requires the root element to be named "document".
XMLName xml.Name `xml:"document"`
// Bodies collects every "body" child element of the root.
Bodies []Docx_Body `xml:"body"`
}
// Docx_Body maps one "body" element and the paragraphs directly inside it.
type Docx_Body struct {
// XMLName requires the element to be named "body".
XMLName xml.Name `xml:"body"`
// Paragraphs collects every "p" child element, in document order.
Paragraphs []Docx_Paragraph `xml:"p"`
}
// Docx_Paragraph maps one "p" element and the runs directly inside it.
type Docx_Paragraph struct {
// XMLName requires the element to be named "p".
XMLName xml.Name `xml:"p"`
// Runs collects every "r" child element, in document order.
Runs []Docx_Run `xml:"r"`
}
// Docx_Run maps one "r" element. Only its "t" child is captured; no other
// child element of the run has a field here.
type Docx_Run struct {
// XMLName requires the element to be named "r".
XMLName xml.Name `xml:"r"`
// Text receives the character data of the run's "t" child element.
// NOTE(review): a single string cannot represent a run that holds more
// than one "t" element - confirm how such input should be handled.
Text string `xml:"t"`
}
/***************************************************************************
Data structure for various data parsed from this document type
***************************************************************************/
// DocxParser extracts plain text from .docx files.
type DocxParser struct {
// Content holds the archive members stored by readFile, i.e. those
// accepted by filter. retrieveTextFromFile converts them to text.
Content []MemberFileContent
}
/***************************************************************************
Functions required for the document interface
***************************************************************************/
// extension reports the file-name suffix, including the leading dot, that
// this parser is registered for.
func (d *DocxParser) extension() string {
	const suffix = ".docx"
	return suffix
}
// trueType reports the MIME type string associated with this parser's
// files. It is the ZIP type, presumably because a .docx file is a ZIP
// archive (readFile extracts members from it).
func (d *DocxParser) trueType() string {
	const mime = "application/zip"
	return mime
}
// filter reports whether the archive member named identifier should be
// kept. Only the exact name "word/document.xml" is accepted; the comparison
// is case-sensitive.
func (d *DocxParser) filter(identifier string) bool {
	switch identifier {
	case "word/document.xml":
		return true
	default:
		return false
	}
}
// readFile loads into d.Content the archive members of the file at path
// that are accepted by d.filter.
//
// An error from ExtractArchiveContent is returned unchanged and leaves
// d.Content untouched. A nil list returned without an error is treated as
// "no matching members": d.Content is cleared and nil is returned.
func (d *DocxParser) readFile(path string) error {
	list, err := ExtractArchiveContent(path, d.filter)
	if err != nil {
		return err
	}
	if list == nil {
		// Guard the dereference below; without it a (nil, nil) result
		// would panic.
		d.Content = nil
		return nil
	}
	d.Content = *list
	return nil
}
// retrieveTextFromFile converts every archive member in d.Content to plain
// text with docXML2Text and returns the pieces joined in slice order, with
// nothing inserted between them.
//
// The first conversion error stops processing and is returned unchanged
// together with an empty string. An empty d.Content yields "" and nil.
func (d *DocxParser) retrieveTextFromFile() (string, error) {
	var combined string
	for idx := range d.Content {
		member := d.Content[idx]
		piece, convErr := d.docXML2Text(member.Identifier, member.Data)
		if convErr != nil {
			return "", convErr
		}
		combined = combined + piece
	}
	return combined, nil
}
// docXML2Text parses byteData as the XML of a .docx document part and
// returns its plain text.
//
// The text of all runs in a paragraph is concatenated with no separator,
// and a single "\n" is written after every paragraph, including paragraphs
// with no runs. Which elements count as bodies, paragraphs and runs is
// determined by the struct tags on the Docx_* types.
//
// identifier is the archive member name; it is currently unused.
//
// Any error from xml.Unmarshal is returned unchanged with an empty string.
func (d *DocxParser) docXML2Text(identifier string, byteData []byte) (string, error) {
	var doc Docx_Doc
	if err := xml.Unmarshal(byteData, &doc); err != nil {
		return "", err
	}

	// A strings.Builder avoids re-copying the accumulated text on every
	// append, which repeated string concatenation would do for each run.
	var sb strings.Builder

	// Each document probably only has one (1) body, but keep iterating
	// through all of them until that is confirmed.
	for _, body := range doc.Bodies {
		for _, para := range body.Paragraphs {
			for _, run := range para.Runs {
				sb.WriteString(run.Text)
			}
			sb.WriteByte('\n')
		}
	}
	return sb.String(), nil
}