forked from iamlemec/fastpat
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_assign.py
More file actions
204 lines (170 loc) · 6.8 KB
/
parse_assign.py
File metadata and controls
204 lines (170 loc) · 6.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import re
import os
import sys
import glob
import sqlite3
import argparse
from lxml.etree import iterparse
from parse_common import clear, get_text, raw_text, ChunkInserter
from traceback import print_exc
# detect organization type

# classification codes returned by org_type()
ORG_CORP = 0
ORG_NONP = 1
ORG_INDV = 2

# Whole-word tokens that mark a name as a company. Matching is
# case-sensitive, so names are expected to be upper case already.
# 'AKTIENGELLSCHAFT' and 'ISREAL' are misspellings kept so that names which
# matched before still match; 'ISREAL' looks like a typo rather than a
# spelling seen in the data, so the correct 'ISRAEL' is listed as well.
corp_keys = ['CORP','CO','INC','LLC','LP','PLC','LTD','LIMITED','COMPANY',
             'CORPORATION','INCORPORATED','INTERNATIONAL','SYSTEMS','SA','OY',
             'CONSULTING','BANK','GMBH','KABUSHIKI','KAISHA','BV','NV','SL',
             'AKTIENGESELLSCHAFT','MASCHINENFABRIK','AB','AG','AS','SPA','HF',
             'SOCIETE','ASSOCIATES','BUSINESS','INDUSTRIES','GROUP','KK',
             'LABORATORIES','WORKS','STUDIO','TELECOM','INVESTMENTS',
             'CONSULTANTS','ELECTRONICS','TECHNOLOGIES','MICROSYSTEMS',
             'MULTIMEDIA','NETWORKS','TECHNOLOGY','PARTNERSHIP','ELECTRIC',
             'COMPONENTS','AUTOMOTIVE','INSTRUMENTS','COMMUNICATION',
             'ENTERPRISES','NETWORK','ENGINEERING','DESIGNS','SCIENCES',
             'PARTNERS','AKTIENGELLSCHAFT','VENTURE','AEROSPACE',
             'PHARMACEUTICALS','DESIGN','MEDICAL','PRODUCTS','PHARMA','ENERGY',
             'SOLUTIONS','FRANCE','ISREAL','ISRAEL','PRODUCT','PLASTICS',
             'COMMUNICATIONS','KGAA','SAS','CELLULAR','GESELLSCHAFT','SE',
             'HOLDINGS','KG','SRL','CHIMIE']
corp_re = re.compile(r'\b(' + '|'.join(corp_keys) + r')\b')

# any digit, ampersand or parenthesis
punc_re = re.compile(r'[0-9&()]')

# whole-word tokens that mark a name as a non-profit / public institution
nonp_keys = ['INSTITUTE','UNIVERSITY','HOSPITAL','FOUNDATION','COLLEGE',
             'RESEARCH','ADMINISTRATION','RECHERCHE','DEPARTMENT','TRUST',
             'ASSOCIATION','MINISTRY','LABORATORY','BOARD','OFFICE','UNIV',
             'ECOLE','SECRETARY','UNIVERSIDAD','SOCIETY','UNIVERSITEIT',
             'CENTRE','CENTER','NATIONAL','SCHOOL','INSTITUT','INSTITUTES',
             'UNIVERSITE']
nonp_re = re.compile(r'\b(' + '|'.join(nonp_keys) + r')\b')
def org_type(name):
    """Classify a party name as corporate, non-profit or individual.

    Periods and slashes are stripped first. The name is ORG_CORP when it
    contains a corporate keyword, contains a digit, '&' or parenthesis, or
    has no space at all. Otherwise it is ORG_NONP when it contains a
    non-profit keyword, and ORG_INDV when nothing matched.
    """
    cleaned = name.replace('.', '').replace('/', '')

    # the corporate tests take precedence over the non-profit keywords
    if corp_re.search(cleaned) is not None:
        return ORG_CORP
    if punc_re.search(cleaned) is not None:
        return ORG_CORP
    if ' ' not in cleaned:
        return ORG_CORP

    if nonp_re.search(cleaned) is not None:
        return ORG_NONP
    return ORG_INDV
# detect conveyance type

# classification codes returned by convey_type()
CONV_ASSIGN, CONV_LICENSE, CONV_MERGER, CONV_OTHER = range(4)

# Substrings (not whole words) that flag a conveyance text as something other
# than a plain transfer, e.g. a name/address change or a security agreement.
# convey_type() checks these first and maps any hit to CONV_OTHER.
other_keys = [
    'CHANGE',
    'SECUR',
    'CORRECT',
    'RELEASE',
    'LIEN',
    'UPDATE',
    'NUNC',
    'COLLAT',
]
other_re = re.compile('|'.join(other_keys))
def convey_type(convey):
    """Map a conveyance description to one of the CONV_* codes.

    Any text matching other_re is CONV_OTHER, regardless of what else it
    contains. Otherwise the first of the substrings 'ASSIGN', 'LICENSE',
    'MERGE' found in the text decides the code, and CONV_OTHER is returned
    when none is present.
    """
    if other_re.search(convey) is not None:
        return CONV_OTHER

    # checked in priority order
    markers = (
        ('ASSIGN', CONV_ASSIGN),
        ('LICENSE', CONV_LICENSE),
        ('MERGE', CONV_MERGER),
    )
    for marker, code in markers:
        if marker in convey:
            return code
    return CONV_OTHER
# MAIN SECTION

# parse input arguments
parser = argparse.ArgumentParser(description='USPTO patent assignment parser.')
parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse')
parser.add_argument('--db', type=str, default=None, help='database file to store to')
# NOTE: the limit is compared against the count of kept assignment records
parser.add_argument('--limit', type=int, default=None, help='only parse n patents')
args = parser.parse_args()

# sqlite3.connect(None) fails with an opaque TypeError, so reject a missing
# --db up front with a regular usage error instead
if args.db is None:
    parser.error('no database file given (use --db)')

# connect to patent db
con = sqlite3.connect(args.db)
cur = con.cursor()
cur.execute('create table if not exists assign (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)')
cur.execute('create unique index if not exists idx_assign on assign (patnum,execdate,assignor,assignee)')

# rows are handed to this inserter and flushed by its commit() at the end
chunker = ChunkInserter(con, table='assign')
def gen_patnums(patents):
    """Yield the doc-number of each 'B'-kind document-id under *patents*.

    Iterates the children of *patents*, looks at every 'document-id' element
    directly below each child, and yields its 'doc-number' text when its
    'kind' text starts with 'B' (presumably granted patents; verify against
    the USPTO schema).
    """
    for prop in patents:
        for doc_id in prop.findall('document-id'):
            kind_code = get_text(doc_id, 'kind')
            number = get_text(doc_id, 'doc-number')
            if kind_code.startswith('B'):
                yield number
# running totals updated by parse_gen3() and reported after each file:
# i = assignment records kept, o = records dropped, p = patent rows queued
i = o = p = 0
def parse_gen3(fname_in):
    """Parse one assignment XML file and queue the qualifying rows.

    Streams every 'patent-assignment' element of *fname_in*. For each one,
    the first assignor, first assignee, conveyance text, dates, location and
    the 'B'-kind patent numbers are extracted. Records without any patent
    number are skipped silently. Records where either party classifies as an
    individual, or whose conveyance classifies as CONV_OTHER, are counted in
    the global ``o`` and skipped. For the rest, one row per patent number is
    handed to ``chunker.insert`` and the globals ``i`` (records) and ``p``
    (patents) are advanced.

    Returns False as soon as ``args.limit`` is set and reached, True once the
    file has been read to the end.

    A record with no 'patent-assignors' / 'patent-assignees' section (or an
    empty one) raises from the ``[0]`` lookup; that exception is not handled
    here and ends parsing of this file.
    """
    global i, o, p

    for _, elem in iterparse(fname_in, tag='patent-assignment', events=['end'], recover=True):
        # top-level section
        record = elem.find('assignment-record')
        assignor = elem.find('patent-assignors')[0]
        assignee = elem.find('patent-assignees')[0]
        patents = elem.find('patent-properties')

        # conveyance
        convey = get_text(record, 'conveyance-text')

        # names
        assignor_name = get_text(assignor, 'name')
        assignee_name = get_text(assignee, 'name')

        # dates
        exec_sec = assignor.find('execution-date')
        recd_sec = record.find('recorded-date')
        exec_date = get_text(exec_sec, 'date') if exec_sec is not None else ''
        recd_date = get_text(recd_sec, 'date') if recd_sec is not None else ''

        # location
        assignee_country = get_text(assignee, 'country-name', default='UNITED STATES')
        assignee_state = get_text(assignee, 'state')

        # patent info
        patnums = list(gen_patnums(patents))
        npat = len(patnums)

        # free memory: nothing below touches the element again, so release it
        # here, before the skip paths, so that skipped records are freed too
        clear(elem)

        if npat == 0:
            continue

        # code names
        src_type = org_type(assignor_name)
        dst_type = org_type(assignee_name)
        ctype = convey_type(convey)

        # throw out individuals and non-transfer conveyances
        if src_type == ORG_INDV or dst_type == ORG_INDV or ctype == CONV_OTHER:
            o += 1
            continue

        # output: one row per patent; None is passed for the assignid column
        for pn in patnums:
            chunker.insert(None, pn, exec_date, recd_date, convey, assignor_name, assignee_name, assignee_state, assignee_country)

        # stats
        i += 1
        p += npat

        # logging
        if i % 1000 == 0:
            print('%4d: %40.40s -> %30.30s (%20.20s, %20.20s)' % (npat, assignor_name, assignee_name, assignee_state, assignee_country))

        # break
        if args.limit and i >= args.limit:
            return False

    return True
# collect files: no target means ./assign_files, a single directory target
# means every .xml inside it, anything else is taken as an explicit file list
if not args.target:
    file_list = sorted(glob.glob('%s/*.xml' % 'assign_files'))
elif len(args.target) == 1 and os.path.isdir(args.target[0]):
    file_list = sorted(glob.glob('%s/*.xml' % args.target[0]))
else:
    file_list = args.target

# do robust parsing: a failure in one file is reported and the run moves on
for fpath in file_list:
    # terminate on limit
    if args.limit is not None and i >= args.limit:
        print('Reached limit.')
        break

    print('Parsing %s' % os.path.basename(fpath))

    # snapshot the counters so per-file deltas can be reported
    start = (i, o, p)
    try:
        parse_gen3(fpath)
    except Exception:
        print('EXCEPTION OCCURRED!')
        print_exc()

    print('Found %d records, %d dropped, %d patents' % (i - start[0], o - start[1], p - start[2]))
    print('Total %d records, %d dropped, %d patents' % (i, o, p))
    print()

# clear out the rest
chunker.commit()
con.close()