#!/usr/bin/python
#-*- coding:utf-8 -*-
#source: holaProject/extractor.py (master branch of the dahrs/holaProject GitHub repository)

import re, codecs
import utilsString, utilsOs
from nltk import pos_tag
from nltk.corpus import stopwords


class ruleBasedExtractor():
	'''
	Rule based heuristics selecting the "reliable" job titles of a
	job + pitch file (one "job<TAB>pitch" entry per line), plus a helper
	building the job -> pitch/mission dict from a json-lines profile file.
	'''
	#signs that disqualify a job title: ampersand, slashes, brackets and punctuation
	_forbiddenSigns = re.compile(r'[&/\\()\[\]{}.,!?]')

	def __init__(self):
		pass

	##################################################################################
	#HEURISTIC FILTERS FOR GETTING RELIABLE JOB TITLES
	##################################################################################

	def reliableFilter1(self, jobAndPitchFilePath):
		'''
		Rule based filter for all information that needs to analyze
		the raw line of the candidate job title, pitch, etc.
		A line is kept when its job title (first tab-separated field):
		- has no ampersand (&), slash (/ \\), bracket or punctuation (, . ! ?) sign
		- has at most 3 tokens
		- has no acronyms

		jobAndPitchFilePath : either the path to the job + pitch file or
			an already opened file (any iterable of lines)
		Returns a dict whose keys are the lowercased kept lines and whose
		values are the number of times each line was seen.
		'''
		#if we were given a path, open it and filter the opened file
		if isinstance(jobAndPitchFilePath, (type(u''), type(''))):
			with codecs.open(jobAndPitchFilePath, 'r', encoding='utf8') as openedFile:
				return self.reliableFilter1(openedFile)
		candidatesDict = {}
		#we look at the job title file line by line (job by job)
		for line in jobAndPitchFilePath:
			job = line.split(u'\t')[0]
			#not having ampersand (&) or slash (/ \) signs or brackets (()[]{}) and punctuation (, . ! ?)
			if self._forbiddenSigns.search(job) is not None:
				continue
			#having at most 3 tokens
			if len(utilsString.naiveRegexTokenizer(job)) > 3:
				continue
			#discard the ones that have acronyms
			#NOTE(review): assumes findAcronyms returns a falsy value (None/empty) when no acronym is found - confirm
			if utilsString.findAcronyms(job):
				continue
			loweredLine = line.lower()
			candidatesDict[loweredLine] = candidatesDict.get(loweredLine, 0) + 1
		return candidatesDict


	def reliableFilter2(self, candidatesDict, lang, includeJobsWithNApitch):
		'''
		Rule based filter for all information that needs to analyze
		the lines after passing through one first filter:
		- present more than once (no hapax)
		- being the right language (en/fr in both 'job' and 'pitch')

		candidatesDict : dict of "job<TAB>pitch" line -> number of occurrences
		lang : language code compared to what utilsString.englishOrFrench returns for the line
		includeJobsWithNApitch : if False, the lines whose pitch is missing,
			empty or 'na' are left out
		Returns the set of the job titles of the lines that passed the filter.
		'''
		setOfReliableJobs = set()
		for line, nbOfOccurrences in candidatesDict.items():
			fields = line.split(u'\t')
			job = fields[0].rstrip(u'\r\n')
			#a line without a pitch field is handled as a line whose pitch is not available
			pitch = fields[1].strip().lower() if len(fields) > 1 else u'na'
			hasPitch = pitch not in (u'', u'na')
			#we might not want to include empty pitchs (the pitch helps to identify the language)
			processIt = bool(includeJobsWithNApitch) or hasPitch
			#if it's present more than once then try to guess the language (job title and
			#corresponding pitch) and if it doesn't match do not take it into account
			if nbOfOccurrences > 1 and processIt and lang == utilsString.englishOrFrench(line):
				setOfReliableJobs.add(job)
		return setOfReliableJobs


	def getReliableJobTitles(self, jobAndPitchFilePath, lang=u'en', outputPath=None, includeJobsWithNApitch=True):
		'''
		Make a set containing the job titles that might be considered more reliable:
		filter 1:
			- having at most 3 tokens
			- not having ampersand (&), slash (/ \\), bracket or punctuation signs
			- not having acronyms
		filter 2:
			- present more than once (no hapax)
			- being the right language (en/fr in both 'job' and 'pitch')

		jobAndPitchFilePath : path to the utf8 job + pitch file
		lang : language the job titles must be in
		outputPath : if specified, the job titles are also dumped there (one per line)
		includeJobsWithNApitch : if False, the jobs without a pitch are left out
		Returns the set of reliable job titles.
		'''
		with codecs.open(jobAndPitchFilePath, 'r', encoding='utf8') as openedFile:
			candidatesDict = self.reliableFilter1(openedFile)
		setOfReliableJobs = self.reliableFilter2(candidatesDict, lang, includeJobsWithNApitch)

		#dump the output if the output path is specified
		if outputPath is not None:
			utilsOs.dumpRawLines(setOfReliableJobs, outputPath, addNewline=True, rewrite=True)
		return setOfReliableJobs


	##################################################################################
	#EXTRACTS JOB + PITCH FROM THE LINKEDIN CORPUS
	##################################################################################

	@staticmethod
	def getJobPitchDict(jobPitchFilePath=u'/u/kessler/LBJ/data/2015-06-17/fr/anglophone/candidats.json', outputPath=u'/u/alfonsda/Documents/DOCTORAT_TAL/004projetOntologie/002data/candidats/2016-09-15/fr/anglophone/jobAndPitch.json'):
		'''
		Makes a dict containing the job and pitch extracted from the linkedIn profiles
		and dumps it in a json file.

		jobPitchFilePath : path to the file holding one json profile per line
		outputPath : path of the json file where the dict is dumped,
			if None the dict is not dumped
		Returns a dict of the form
			{job function : {u'pitch': [pitch, ...], u'mission': [mission, ...]}}
		where the newlines of the pitchs and missions are replaced by 3 spaces.
		'''
		jobPitchDict = {}
		with open(jobPitchFilePath) as jobPlusPitch:
			for line in jobPlusPitch:
				jsonLine = utilsOs.convertJsonLineToDict(line)
				#get the pitch of the profile (None if the profile has no pitch)
				pitch = None
				if u'personalBranding_pitch' in jsonLine:
					pitch = jsonLine[u'personalBranding_pitch'].replace(u'\n', u'   ')
				if u'experiences' not in jsonLine:
					continue
				#save in dict the pitch and mission to each job function
				for functionDict in jsonLine[u'experiences']:
					if u'function' not in functionDict:
						continue
					#we make sure there is at least an empty entry for that function
					jobEntry = jobPitchDict.setdefault(functionDict[u'function'], {u'pitch':[], u'mission':[]})
					#if there already is a pitch for that job function we add it to the list
					if pitch is not None:
						jobEntry[u'pitch'].append(pitch)
					#if there already is a mission for that job function we add it to the list
					if u'missions' in functionDict:
						jobEntry[u'mission'].append(functionDict[u'missions'].replace(u'\n', u'   '))
		if outputPath is not None:
			utilsOs.dumpDictToJsonFile(jobPitchDict, outputPath)
		return jobPitchDict


class jobTitleExtractorZack():
	'''
	Implements the job title extractor coded by Zack Soliman:
	every job title is tokenized, its 1-, 2- and 3-grams are counted over the
	whole job file and the "best" name of a job title is its most frequent
	trigram, otherwise its most frequent bigram, otherwise its most frequent unigram.
	'''
	def __init__( self ):
		pass

	#stopwords
	to_remove = set(stopwords.words("english") + ['', ' ', '&'])
	#pattern of tokenizer splitters
	pattrn = re.compile(r"[-/,\.\\\/\s\&\"\']")
	#arbitrary co-reference frontier: a bigram/trigram must be seen more than that many times
	#to be returned (when not using a sample of the corpus it can be augmented, ie: 100)
	ngram_threshold = 10
	#a unigram must be seen more than that many times to be returned by get_best_modified
	unigram_threshold = 100
	#french stopwords, loaded only the first time get_best_modified needs them
	_french_stopwords = None

	def getNgram_counts(self, jobFilePath, to_remove, pattrn):
		'''
		Makes a dict of every 1-, 2-, 3-gram in the
		job source file

		jobFilePath : path to the utf8 file having one job title per line
		to_remove : collection of tokens to leave out (stopwords)
		pattrn : regex used to split a job title into tokens
		Returns a dict of ngram (tuple of tokens) -> number of occurrences.
		'''
		ngram_counts = {}
		with codecs.open(jobFilePath, 'r', encoding='utf8') as openedFile:
			for line in openedFile:
				tokens = self._tokenize(line.replace('\n', ''), to_remove, pattrn)
				for n in range(1,4):
					for ngram in self.get_ngrams(n, tokens):
						ngram_counts[ngram] = ngram_counts.get(ngram, 0)+1
		return ngram_counts


	def testZackExtractor(self, sentence, to_remove, pattrn):
		'''
		just a test, if we need to call the extractor multiple times we need to
		keep the ngram_counts dict in memory instead of making it over and over
		'''
		ngram_counts = self.getNgram_counts(u'/u/alfonsda/Documents/DOCTORAT_TAL/004projetOntologie/002data/candidats/2016-09-15/fr/anglophone/sample100milFunctions/jobTitles.txt', to_remove, pattrn)
		return self.get_best(sentence, to_remove, pattrn, ngram_counts)


	def getJobsZackExtracted(self, jobFilePath, outputPath=None, to_remove=to_remove, pattrn=pattrn):
		'''
		extracts the best jobs according to their co-reference
		in decreasing order of ngram:
			chief executive officer
				IS BETTER THAN
			chief officer
				IS BETTER THAN
			officer

		jobFilePath : path to the utf8 file having one job title per line
		outputPath : if specified, the extracted jobs are also dumped there (one per line)
		to_remove : collection of tokens to leave out (stopwords)
		pattrn : regex used to split a job title into tokens
		Returns the set of extracted job names.
		'''
		setOfJobs = set()
		#count jobs co-reference (counting the (1-3)-gram token words in the job title)
		ngram_counts = self.getNgram_counts(jobFilePath, to_remove, pattrn)
		#get best possibility
		with codecs.open(jobFilePath, 'r', encoding='utf8') as openedFile:
			for jobTitle in openedFile:
				#ORIGINAL extractor bestOption = self.get_best(jobTitle, to_remove, pattrn, ngram_counts)
				bestOption = self.get_best_modified(jobTitle, to_remove, pattrn, ngram_counts)
				#add the 'best' job name to the final set
				if bestOption != "<unk>":
					setOfJobs.add(bestOption)
		#dump the output if the output path is specified
		if outputPath is not None:
			utilsOs.dumpRawLines(setOfJobs, outputPath, addNewline=True, rewrite=True)
		return setOfJobs


	def get_best_modified(self, s, to_remove, pattrn, ngram_counts, lang='en'):
		'''
		Modified version of get_best:
		- returns "<unk>" if utilsString.englishOrFrench(s) is not the wanted language
		- the french stopwords are left out too (since they might be noisy)
		- a unigram is only returned if it was seen more than unigram_threshold times

		s : the job title
		to_remove : collection of tokens to leave out (it is not modified)
		pattrn : regex used to split the job title into tokens
		ngram_counts : dict of ngram (tuple of tokens) -> number of occurrences
		lang : wanted language of the job title
		Returns the best ngram as a space-joined string or "<unk>".
		'''
		#only get something if it matches the language
		if lang != utilsString.englishOrFrench(s):
			return "<unk>"
		#set.union returns a new set, so we keep its result
		#(and the shared class-level stopword set is left untouched)
		allToRemove = set(to_remove).union(self._get_french_stopwords())
		tokens = self._tokenize(s, allToRemove, pattrn)
		return self._select_best(tokens, ngram_counts, self.unigram_threshold)


	####Zack Soliman's code##############################

	def get_best(self, s, to_remove, pattrn, ngram_counts):
		'''
		Returns the most frequent trigram of the job title if it was seen more than
		ngram_threshold times, otherwise the most frequent bigram under the same
		condition, otherwise the most frequent unigram ("<unk>" if there is no token).

		s : the job title
		to_remove : collection of tokens to leave out (stopwords)
		pattrn : regex used to split the job title into tokens
		ngram_counts : dict of ngram (tuple of tokens) -> number of occurrences
		'''
		tokens = self._tokenize(s, to_remove, pattrn)
		return self._select_best(tokens, ngram_counts, None)


	def get_ngrams(self, n, tokens):
		'''
		Returns the list of all the n-grams (tuples of n consecutive tokens) of the token list
		'''
		return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]


	def _tokenize(self, s, to_remove, pattrn):
		'''
		Splits the string with the splitter pattern and leaves out the tokens to remove
		'''
		return [tok for tok in re.split(pattrn, s) if tok not in to_remove]


	def _get_french_stopwords(self):
		'''
		Returns the french stopwords, loading them only once
		'''
		if jobTitleExtractorZack._french_stopwords is None:
			jobTitleExtractorZack._french_stopwords = frozenset(stopwords.words("french"))
		return jobTitleExtractorZack._french_stopwords


	def _select_best(self, tokens, ngram_counts, min_unigram_count):
		'''
		Picks the best ngram among the tokens: trigram, then bigram, then unigram.
		If min_unigram_count is None the best unigram is always accepted.
		'''
		if len(tokens) == 0:
			return "<unk>"
		#an ngram missing from the counts is handled as never seen instead of raising a KeyError
		count = lambda ngram: ngram_counts.get(ngram, 0)

		if len(tokens) >= 3:
			trigram = max(self.get_ngrams(3, tokens), key=count)
			if count(trigram) > self.ngram_threshold:
				return " ".join(trigram)
		if len(tokens) >= 2:
			bigram = max(self.get_ngrams(2, tokens), key=count)
			if count(bigram) > self.ngram_threshold:
				return " ".join(bigram)
		unigram = max(self.get_ngrams(1, tokens), key=count)
		if min_unigram_count is None or count(unigram) > min_unigram_count:
			return unigram[0]
		return "<unk>"