-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathambiguity.py
More file actions
60 lines (45 loc) · 1.87 KB
/
ambiguity.py
File metadata and controls
60 lines (45 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import PlaintextCorpusReader
# Average WordNet ambiguity (number of synsets per word) of the verbs found in
# the Poe and Lovecraft corpora. Results are printed when the script is run.

# Placeholders; both are overwritten with the computed averages further down.
poeAmbiguity = 0
loveAmbiguity = 0

# Reading
corpus_root = 'corpora'
corpus = PlaintextCorpusReader(corpus_root, '.*')
poeSentences = corpus.words('poeSentences.txt')
lovecraftSentences = corpus.words('lovecraftSentences.txt')
poeQuotes = corpus.words('poeQuotes.txt')
lovecraftQuotes = corpus.words('lovecraftQuotes.txt')

# Grabbing Tokens - all Sentences - can be modded for Quotes
tagsIwant = ['VBD', 'VBN', 'VBZ', 'VB']  # Change here for different Parts of Speech


def _average_sense_count(words):
    """Return the mean number of WordNet synsets per word in ``words``.

    Each word is replaced by the base form ``wn.morphy`` returns for it; when
    ``wn.morphy`` returns nothing the word itself is looked up. No part of
    speech is passed to WordNet, so synsets of every part of speech are
    counted. Returns 0.0 when ``words`` is empty.
    """
    if not words:
        return 0.0
    total = 0
    for word in words:
        lemma = wn.morphy(word) or word
        total += len(wn.synsets(lemma))
    # Divide once at the end instead of adding many small quotients, which
    # avoids accumulating floating-point rounding error.
    return total / len(words)


poelowercaseTokens = [token.lower() for token in poeSentences]
poeUniqueTokens = set(poelowercaseTokens)
lovelowercaseTokens = [token.lower() for token in lovecraftSentences]
loveUniqueTokens = set(lovelowercaseTokens)

# pos_tag looks at neighbouring tokens, and the iteration order of a set of
# strings differs from one interpreter run to the next. Tagging the set
# directly therefore gave different tags (and different averages) on every
# run. Sorting fixes the order so the results are reproducible.
# NOTE(review): the tokens are still tagged out of their sentence context;
# tagging the running text first and de-duplicating afterwards would likely
# give more accurate tags, but would change the reported figures.
poetagged = pos_tag(sorted(poeUniqueTokens))
poeshort = [word for word, tag in poetagged if tag in tagsIwant]
lovetagged = pos_tag(sorted(loveUniqueTokens))
loveshort = [word for word, tag in lovetagged if tag in tagsIwant]

print(len(loveshort))
print(loveshort)
print(len(poeshort))
print(poeshort)

poeAmbiguity = _average_sense_count(poeshort)
print("The average ambiguity value for each verb in the Poe corpus is:")
print(poeAmbiguity)

loveAmbiguity = _average_sense_count(loveshort)
print('The average ambiguity value for each verb in the Lovecraft corpus is:')
print(loveAmbiguity)

# note - when the unique tokens were tagged in set order, the values changed
# every time the script was run (see the comment above pos_tag). Across those
# runs they averaged out to about 7.5 (Poe) and 8.1 (Lovecraft).
# Lovecraft was always more ambiguous (verbwise) than Poe (but only slightly!)
# Noun results [NN, NNP, NNPS, NNS] values were == Poe: ~5, Lovecraft: ~5.5