-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCASManager.cpp
More file actions
282 lines (272 loc) · 11.1 KB
/
CASManager.cpp
File metadata and controls
282 lines (272 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/**
Project: libtpc
File name: CASManager.cpp
@author valerio
@version 1.0 7/28/17.
*/
#include "cas-generators/pdf2tpcas/PdfInfo.h"
#include "cas-generators/Stream2Tpcas.h"
#include "cas-generators/xml2tpcas/ReadXml2Stream.h"
#include "CASManager.h"
#include <boost/filesystem/path.hpp>
#include <podofo/podofo.h>
#include <boost/filesystem/operations.hpp>
#include <boost/regex.hpp>
#include <memory>
#include <regex>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <uima/xmideserializer.hpp>
#include "uima/xmiwriter.hpp"
#include "Utils.h"
using namespace tpc::cas;
using namespace std;
void CASManager::convert_raw_file_to_cas1(const string& file_path, FileType type, const string& out_dir,
bool use_parent_dir_as_outname)
{
// get rid of overwhelming messages from podofo library
PoDoFo::PdfError::EnableDebug(false);
PoDoFo::PdfError::EnableLogging(false);
string file_name_no_ext = boost::filesystem::path(file_path).filename().string();
string file_name_tpcas;
size_t extPos = file_name_no_ext.rfind('.');
if (extPos != std::string::npos) {
file_name_no_ext.erase(extPos);
}
file_name_tpcas = file_name_no_ext + ".tpcas";
string foutname;
string fimageoutname;
if (use_parent_dir_as_outname) {
foutname = out_dir + "/" + boost::filesystem::path(file_path).parent_path().filename().string();
fimageoutname = foutname + "/images/" + boost::filesystem::path(file_path).parent_path().filename().string();
} else {
foutname = out_dir + "/" + boost::filesystem::path(file_path).stem().string();
fimageoutname = foutname + "/images/" + boost::filesystem::path(file_path).stem().string();
}
boost::filesystem::create_directories(foutname + "/images");
foutname.append("/" + file_name_tpcas);
stringstream sout;
switch (type) {
case FileType::pdf:
try {
PdfInfo myInfo(file_path, fimageoutname);
myInfo.StreamAll(sout);
const char *descriptor = PDF2TPCAS_DESCRIPTOR.c_str();
Stream2Tpcas stp(sout, foutname, descriptor);
stp.processInputStream();
} catch (PoDoFo::PdfError &e) {
cerr << "Error: An error occurred during processing the pdf file." << endl << e.GetError() << endl
<< file_path << endl;
e.PrintErrorMsg();
}
break;
case FileType::xml:
ReadXml2Stream rs(file_path.c_str());
std::stringstream sout;
rs.GetStream(sout);
const char *descriptor = XML2TPCAS_DESCRIPTOR.c_str();
Stream2Tpcas stp(sout, foutname, descriptor);
stp.processInputStream();
break;
}
}
int CASManager::convert_cas1_to_cas2(const string &file_path, const std::string &out_dir)
{
string foutname = out_dir + "/" + boost::filesystem::path(file_path).parent_path().filename().string();
string temp_dir_path = boost::filesystem::temp_directory_path().string();
string tpcasfile = Utils::decompress_gzip(file_path, temp_dir_path);
try {
/* Create/link up to a UIMACPP resource manager instance (singleton) */
(void) uima::ResourceManager::createInstance("TPCAS2LINDEXAE");
uima::ErrorInfo errorInfo;
const char* descriptor = TPCAS1_2_TPCAS2_DESCRIPTOR.c_str();
uima::AnalysisEngine * pEngine
= uima::Framework::createAnalysisEngine(descriptor, errorInfo);
if (errorInfo.getErrorId() != UIMA_ERR_NONE) {
std::cerr << std::endl
<< " Error string : "
<< uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId())
<< std::endl
<< " UIMACPP Error info:" << std::endl
<< errorInfo << std::endl;
exit((int) errorInfo.getErrorId());
}
uima::TyErrorId utErrorId; // Variable to store UIMACPP return codes
/* Get a new CAS */
uima::CAS* cas = pEngine->newCAS();
if (cas == nullptr) {
std::cerr << "pEngine->newCAS() failed." << std::endl;
exit(1);
}
/* process input / cas */
try {
/* initialize from an xmicas */
XMLCh* native = XMLString::transcode(tpcasfile.c_str());
LocalFileInputSource fileIS(native);
XMLString::release(&native);
uima::XmiDeserializer::deserialize(fileIS, *cas, true);
std::string filename(tpcasfile);
string filehash = Utils::gettpfnvHash(*cas);
/* process the CAS */
auto text = Utils::getFulltext(*cas);
if (text.length() > 0) {
((uima::AnalysisEngine *) pEngine)->process(*cas);
} else {
cout << "Skip file." << endl;
}
} catch (uima::Exception e) {
uima::ErrorInfo errInfo = e.getErrorInfo();
std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl;
std::cerr << errInfo << std::endl;
}
/* call collectionProcessComplete */
utErrorId = pEngine->collectionProcessComplete();
/* Free annotator */
utErrorId = pEngine->destroy();
delete cas;
delete pEngine;
std::remove(tpcasfile.c_str()); //delete uncompressed temp casfile
return 1;
} catch (uima::Exception e) {
std::cerr << "Exception: " << e << std::endl;
return 0;
}
}
/**
 * Serialize a CAS as XMI into the file "<outfn>_seg_<num>".
 *
 * Terminates the process with exit code 99 if the output file cannot be opened.
 *
 * @param outCas CAS to serialize
 * @param num segment number appended to the output file name
 * @param outfn prefix of the output file name
 */
void CASManager::writeXmi(uima::CAS & outCas, int num, std::string outfn) {
    // Build the output file name: <outfn>_seg_<num>
    std::stringstream segment_number;
    segment_number << num;
    const std::string xmi_path = outfn + "_seg_" + segment_number.str();

    // Open the output xmi file in binary mode
    std::ofstream xmi_stream(xmi_path.c_str(), std::ios::out | std::ios::binary);
    if (xmi_stream.fail()) {
        std::cerr << "Error opening output xmi: " << xmi_path.c_str() << std::endl;
        exit(99);
    }

    // Serialize the CAS
    uima::XmiWriter xmi_writer(outCas, true);
    xmi_writer.write(xmi_stream);
    xmi_stream.close();
}
/**
 * Extract bibliographic information from the text of an article XML document using regular expressions.
 *
 * Line feeds are removed from the text first, then the following fields are filled:
 *  - author:    "<surname> <given-names>" of every name inside every <contrib-group>, joined with ", "
 *  - subject:   content of every <subject> element, joined with ", "
 *  - accession: "PMID <n>" from the pmid <article-id>, otherwise "PMC <n>" from the pmc <article-id>
 *  - type:      value of the first article-type="..." attribute
 *  - journal, title, abstract: content of the first <journal-title>, <article-title>, <abstract>
 *  - citation:  "V : <volume> (<issue>) pp. <fpage>-<lpage>", each part present only if found
 *  - year:      <year> of the first matching <pub-date pub-type="...">
 * Fields whose pattern does not match stay empty. The patterns match the bare tags literally, so e.g. an
 * <abstract> element carrying attributes is not recognized.
 *
 * @param xml_text the XML text of the article
 * @return a BibInfo with the extracted fields
 */
BibInfo CASManager::get_bib_info_from_xml_text(const std::string &xml_text) {
    // NOTE: '<', '>' and '/' must not be backslash-escaped in the patterns below: "\<" is not a valid C++
    // escape sequence, and "\\<" would be a word-boundary assertion in Boost.Regex rather than a literal '<'.
    const boost::regex nline("\\n");
    const string xml_text_nn = boost::regex_replace(xml_text, nline, "");
    const boost::regex trailing_comma("\\, $");
    const boost::sregex_iterator no_more_matches;

    // Returns capture group 1 of the first match of `pattern` in the flattened text, or "" if there is no
    // match. Every pattern passed in requires at least one character in group 1, so an empty result always
    // means "not found".
    const auto first_capture = [&xml_text_nn](const char *pattern) -> std::string {
        const boost::regex re(pattern);
        boost::smatch matches;
        if (boost::regex_search(xml_text_nn, matches, re)) {
            return matches[1].str();
        }
        return std::string();
    };

    // Appends "<group 1>, " for every match of `re` in the flattened text and strips the final ", ".
    const auto join_captures = [&](const boost::regex &re) -> std::string {
        std::string joined;
        for (boost::sregex_iterator hit(xml_text_nn.begin(), xml_text_nn.end(), re);
             hit != no_more_matches; ++hit) {
            joined += (*hit)[1].str();
            joined += ", ";
        }
        return boost::regex_replace(joined, trailing_comma, "");
    };

    // find author: iterate over the matches in place instead of copying the remaining text after every hit
    string author;
    const boost::regex authorregex("<contrib-group>(.+?)</contrib-group>");
    const boost::regex nameregex("<surname>(.+?)</surname>\\s+<given-names>(.+?)</given-names>");
    for (boost::sregex_iterator group(xml_text_nn.begin(), xml_text_nn.end(), authorregex);
         group != no_more_matches; ++group) {
        const auto &group_body = (*group)[1];
        for (boost::sregex_iterator name(group_body.first, group_body.second, nameregex);
             name != no_more_matches; ++name) {
            author += (*name)[1].str();
            author += " ";
            author += (*name)[2].str();
            author += ", ";
        }
    }
    author = boost::regex_replace(author, trailing_comma, "");

    // find subject
    const boost::regex subjectregex("<subject>(.+?)</subject>");
    const std::string subject = join_captures(subjectregex);

    // find accession: the PMID takes precedence over the PMC id
    std::string accession;
    const std::string pmid = first_capture("<article-id pub-id-type=\"pmid\">(\\d+?)</article-id>");
    if (!pmid.empty()) {
        accession = "PMID " + pmid;
    } else {
        const std::string pmc = first_capture("<article-id pub-id-type=\"pmc\">(\\d+?)</article-id>");
        if (!pmc.empty()) {
            accession = "PMC " + pmc;
        }
    }

    // find article type, journal, article title and abstract
    const std::string type = first_capture("article-type=\"(.+?)\"");
    const std::string journal = first_capture("<journal-title>(.+?)</journal-title>");
    const std::string title = first_capture("<article-title>(.+?)</article-title>");
    const std::string abstract = first_capture("<abstract>(.+?)</abstract>");

    // find citation
    std::string citation;
    const std::string volume = first_capture("<volume>(\\d+)</volume>");
    if (!volume.empty()) {
        citation += "V : " + volume + " ";
    }
    const std::string issue = first_capture("<issue>(\\d+)</issue>");
    if (!issue.empty()) {
        citation += "(" + issue + ") ";
    }
    const boost::regex pageregex("<fpage>(\\d+)</fpage>\\s+<lpage>(\\d+)</lpage>");
    boost::smatch page_matches;
    if (boost::regex_search(xml_text_nn, page_matches, pageregex)) {
        citation += "pp. " + page_matches[1].str() + "-" + page_matches[2].str();
    }

    // find year
    const std::string year =
            first_capture("<pub-date pub-type=\".*?\">.*?<year>(\\d+)</year>\\s+</pub-date>");

    BibInfo bibInfo = BibInfo();
    bibInfo.author = author;
    bibInfo.accession = accession;
    bibInfo.type = type;
    bibInfo.title = title;
    bibInfo.journal = journal;
    bibInfo.citation = citation;
    bibInfo.year = year;
    bibInfo.abstract = abstract;
    bibInfo.subject = subject;
    return bibInfo;
}
/**
 * Determine the corpora (categories) an article belongs to from its bibliographic information.
 *
 * For every (category, pattern) entry of PMCOA_CAT_REGEX the category is selected when the pattern matches
 * the whole subject, the whole title or the whole journal string of the article.
 * NOTE(review): regex_match requires the pattern to cover the entire string - confirm that the patterns in
 * PMCOA_CAT_REGEX are written with that in mind.
 *
 * @param bib_info bibliographic information of the article
 * @return the matching category names, in iteration order of PMCOA_CAT_REGEX; a single PMCOA_UNCLASSIFIED
 *         entry if no category matched
 */
std::vector<std::string> CASManager::classify_article_into_corpora_from_bib_file(const BibInfo &bib_info) {
    std::vector<std::string> corpora;
    for (const auto &category : PMCOA_CAT_REGEX) {
        const std::regex pattern(category.second);
        // Checked in the order subject, title, journal; stops at the first field that matches.
        const bool belongs_to_category = std::regex_match(bib_info.subject, pattern)
                                         || std::regex_match(bib_info.title, pattern)
                                         || std::regex_match(bib_info.journal, pattern);
        if (!belongs_to_category) {
            continue;
        }
        corpora.push_back(category.first);
    }
    // Articles that fit no category are filed under the catch-all corpus.
    if (corpora.empty()) {
        corpora.push_back(PMCOA_UNCLASSIFIED);
    }
    return corpora;
}