Skip to content

Commit 0a6c8ea

Browse files
committed
BibSched: new INSPIRE-HEP-names tasklet
* Adds new tasklet to fetch INSPIRE-HEP records and map INSPIRE-IDs to INSPIRE-HEP-names. Signed-off-by: Jochen Klein <j.klein@cern.ch>
1 parent 89a2df5 commit 0a6c8ea

1 file changed

Lines changed: 130 additions & 0 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# This file is part of Invenio.
2+
# Copyright (C) 2016 CERN.
3+
#
4+
# Invenio is free software; you can redistribute it and/or
5+
# modify it under the terms of the GNU General Public License as
6+
# published by the Free Software Foundation; either version 2 of the
7+
# License, or (at your option) any later version.
8+
#
9+
# Invenio is distributed in the hope that it will be useful, but
10+
# WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12+
# General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU General Public License
15+
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
16+
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
17+
18+
"""INSPIRE-HEP IDs to names mapping (Invenio Bibliographic Tasklet).
19+
20+
Harvest records on INSPIRE-HEP including MARC fields 035__9 and 035__a. Map
21+
INSPIRE-IDs to INSPIRE-HEP-names, and write the dictionary to a JSON file.
22+
23+
Output example: {"INSPIRE-12345": "john.1", ...}
24+
25+
Usage:
26+
$bibtasklet -N inspirehep-names-mapper
27+
-T bst_inspirehep_names_mapper [-a json_file
28+
[default: invenio.config.CFG_CACHEDIR/inspirehep-names-mapping.json]]
29+
"""
30+
31+
import time
32+
import urllib2
33+
import xml.etree.ElementTree as ET
34+
from os.path import join
35+
from sys import stderr
36+
37+
from invenio.bibauthority_people_utils import (
38+
export_json, UtilsError, version_file)
39+
from invenio.bibtask import write_message
40+
from invenio.config import CFG_CACHEDIR
41+
42+
# Default path of the JSON file the tasklet writes the mapping to.
INSPIREHEP_NAMES_MAPPING_FILE = join(
    CFG_CACHEDIR,
    "inspirehep-names-mapping.json",
)

# Prefix -> namespace URI map passed as ``namespaces=`` to the ElementTree
# find()/findall() calls in this module (MARC21 slim elements).
ns = {"x": "http://www.loc.gov/MARC21/slim"}
45+
46+
47+
def get_records(record_limit=250):
    """Get MARCXML record elements of the HepNames collection.

    The search result is fetched page by page: ``record_limit`` is sent as
    the ``rg`` URL parameter and a running offset (starting at 1 and advanced
    by ``record_limit`` after every page) as ``jrec``. Paging stops at the
    first response that contains no record element.

    :param int record_limit: records limit each request; must be at least 1.
        Maximum 251, except if you are a superadmin
    :return: list of MARCXML record elements or empty list
    :raises ValueError: if ``record_limit`` is smaller than 1
    :raises urllib2.URLError: if a request fails (propagated unchanged)
    :raises xml.etree.ElementTree.ParseError: if a response is not
        well-formed XML
    """
    if record_limit < 1:
        # The offset is advanced by record_limit, so a non-positive value
        # would never move past the first page and could loop forever.
        raise ValueError("record_limit must be a positive integer")

    # NOTE(review): the second search term decodes to "035__:INSPIRE" (no
    # subfield code) while the first one is "035__9:BAI" -- confirm this is
    # intended and not a typo for "035__9:INSPIRE".
    url = (
        "https://inspirehep.net/search?cc=HepNames"
        "&p=035__9%3ABAI+035__%3AINSPIRE&of=xm&ot=035&rg={0}&jrec={1}")

    records_all = []
    counter = 1  # value sent as ``jrec`` for the next request

    while True:
        req = urllib2.Request(url.format(record_limit, counter))
        response = urllib2.urlopen(req)
        try:
            page_result = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        root = ET.fromstring(page_result)
        records = root.findall(".//x:record", namespaces=ns)
        if not records:
            break

        # Extend in place instead of re-copying the accumulated list.
        records_all.extend(records)
        counter += record_limit

        # Sleep some seconds between every request not to be banned
        time.sleep(3)

    return records_all
81+
82+
83+
def get_mapping(inspire_records):
    """Get mapping INSPIRE-ID to INSPIRE-HEP-name.

    For every record, the 035 datafields are scanned: the ``a`` subfield of
    the datafield whose ``9`` subfield reads "INSPIRE" is taken as the ID,
    and the ``a`` subfield of the one whose ``9`` subfield reads "BAI" as the
    name. If a record carries several matching datafields, the last one wins.

    Records for which no ID or no name could be found are skipped, so the
    result never contains ``None`` (or empty) keys or values.

    :param list inspire_records: list of MARCXML record elements
    :return: dictionary containing the mapping, e.g.
        ``{"INSPIRE-12345": "john.1"}``
    """
    inspire_mapping = {}

    for record in inspire_records:
        inspire_id = inspire_name = None

        datafields = record.findall("x:datafield[@tag='035']", namespaces=ns)
        for datafield in datafields:
            source = datafield.find("x:subfield[@code='9']", namespaces=ns)
            value = datafield.find("x:subfield[@code='a']", namespaces=ns)
            if source is None or value is None:
                # Both the source tag ($9) and the value ($a) are required.
                continue

            if source.text == "INSPIRE":
                inspire_id = value.text
            elif source.text == "BAI":
                inspire_name = value.text

        if not (inspire_id and inspire_name):
            # Incomplete record: storing it would create a None key shared
            # by every record without an ID (exported to JSON as "null").
            continue

        inspire_mapping[inspire_id] = inspire_name

    return inspire_mapping
109+
110+
111+
def bst_inspirehep_names_mapper(json_file=INSPIREHEP_NAMES_MAPPING_FILE):
    """Tasklet entry point: export the INSPIRE-ID to name mapping as JSON.

    Fetches the HepNames records, reports how many were fetched, builds the
    mapping and, if it is not empty, writes it to ``json_file``. A
    ``urllib2.URLError`` raised anywhere in that sequence and a
    ``UtilsError`` raised during the export are not propagated; their
    ``reason`` is written to stderr through ``write_message`` instead.

    :param filepath json_file: path to JSON file containing the INSPIRE mapping
    """
    try:
        fetched = get_records()
        write_message("{0} records (HepNames) fetched.".format(len(fetched)))

        id_to_name = get_mapping(fetched)
        if not id_to_name:
            # Nothing to export; leave any existing file untouched.
            return

        # NOTE(review): presumably keeps up to 2 older versions of the
        # file before it is overwritten -- confirm against version_file.
        version_file(json_file, 2)
        try:
            export_json(id_to_name, json_file)
            write_message(
                "Mapping for INSPIRE-HEP-ids and -names exported to JSON. "
                "See '{0}'.".format(json_file))
        except UtilsError as export_error:
            write_message(export_error.reason, stderr)
    except urllib2.URLError as fetch_error:
        write_message(fetch_error.reason, stderr)

0 commit comments

Comments
 (0)