-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparseJames.py
More file actions
112 lines (97 loc) · 3.87 KB
/
parseJames.py
File metadata and controls
112 lines (97 loc) · 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import logging
import re
from operator import itemgetter
from gensim import corpora, models, similarities
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Shared module-level state, filled in by the functions below.
corpus_lsi = []  # rebound by calcLSI() to the LSI-projected corpus
books = []  # one string per book: the verse text collected for that book
bookNames = []  # book names, parallel to `books`
testamentText = ["", ""]  # one string per testament: [0] old, [1] new
def calcByBook(csvPath='kingjames.csv'):
    """Load the verse CSV, group the verse text by book, and run the analyses.

    Each row is expected to carry the book name in its first column and the
    verse text starting at its fourth column. The module-level ``books`` and
    ``bookNames`` lists are replaced in place with the freshly parsed data,
    after which calcBookScores() and calcTestamentScoresSort() are run.

    csvPath -- path of the CSV file to read (defaults to the original
               hard-coded 'kingjames.csv').
    """
    # Function-scope import: the file's import block does not pull in csv.
    import csv

    names = []
    versesByBook = []  # one list of verse strings per book, parallel to names
    curBook = None
    with open(csvPath, 'r', newline='') as handle:
        for row in csv.reader(handle):
            if len(row) < 4:
                # Blank or truncated line: nothing to index at column 4.
                continue
            # A plain split(",")[3] cut every verse at its first comma. The
            # csv reader honours quoted fields, and re-joining the remainder
            # keeps unquoted verses whole as well.
            # NOTE(review): assumes the verse text is the last column - confirm.
            verse = ",".join(row[3:])
            if verse.strip().startswith("Verse Text"):  # hard coded, ignoring header line
                continue
            if row[0] != curBook:
                # Compare the book column itself rather than prefix-matching the line.
                curBook = row[0]
                names.append(curBook)
                versesByBook.append([])
            versesByBook[-1].append(verse)

    # Replace the shared lists in place so repeated calls do not pile up
    # stale data; every verse keeps its leading space separator.
    books[:] = ["".join(" " + verse for verse in verses) for verses in versesByBook]
    bookNames[:] = names

    calcBookScores(books)
    calcTestamentScoresSort()
def bookToTestament():
    """Rebuild ``testamentText`` from the per-book texts in ``books``.

    Books are taken in order: everything before the first book whose name
    starts with "The Gospel" goes into testamentText[0], that book and all
    later ones go into testamentText[1]. Each book's text is followed by a
    single space.

    Both entries are rebuilt from scratch on every call, so calling this more
    than once no longer duplicates the text.
    """
    oldParts = []
    newParts = []
    target = oldParts
    for name, text in zip(bookNames, books):
        if name.startswith("The Gospel"):
            # First gospel reached: this and all remaining books are new testament.
            target = newParts
        target.append(text + " ")
    testamentText[0] = "".join(oldParts)
    testamentText[1] = "".join(newParts)
def calcTestamentScoresSort():
    """Write the 15 highest TF-IDF words of each testament to salientByTestament.txt.

    The two testament texts are lower-cased, split on whitespace and stripped
    of surrounding ",:;.?!" before being scored with a gensim TF-IDF model.
    """
    bookToTestament()  # fills testamentText from the per-book texts

    punctuation = ",:;.?!"
    tokenized = []
    for testament in testamentText:
        tokenized.append([token.strip(punctuation) for token in testament.lower().split()])

    dictionary = corpora.Dictionary(tokenized)
    bows = [dictionary.doc2bow(tokens) for tokens in tokenized]
    tfidf = models.TfidfModel(bows)
    weighted = [tfidf[bow] for bow in bows]

    labels = ["Old Testament", "New Testament"]
    with open("salientByTestament.txt", 'w') as out:
        out.write("The top 15 salient words in each testament of the bible\n")
        for label, scores in zip(labels, weighted):
            # Highest weight first; each entry is an (id, weight) pair.
            ranked = sorted(scores, key=itemgetter(1), reverse=True)
            words = "".join(dictionary[entry[0]] + " " for entry in ranked[:15])
            out.write(label + " - " + words + "\n")
def calcBookScores(books):
    """Score every book with TF-IDF and write the per-book salient words.

    books -- list of strings, one per book. Each is lower-cased, split on
             whitespace and stripped of surrounding ",:;.?!" before scoring.

    Prints the gensim dictionary, then passes the weighted corpus on to
    salientByBook().
    """
    strippable = ",:;.?!"
    texts = []
    for document in books:
        words = document.lower().split()
        texts.append([w.strip(strippable) for w in words])

    dictionary = corpora.Dictionary(texts)
    print(dictionary)

    corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    tfidf = models.TfidfModel(corpus)
    tfidfList = [tfidf[bow] for bow in corpus]

    salientByBook(tfidfList, dictionary)
    # The LSI / LDA experiments are currently switched off:
    #calcLSI(tfidfList, dictionary)
    #calcLSA(corpus, dictionary)
def sortScore(tfidfList, dict):
    """Print each book's name followed by its ten highest-weighted words.

    tfidfList -- one list of (word id, weight) pairs per book, in the same
                 order as the module-level bookNames.
    dict      -- mapping from word id to the word itself.
    """
    for bookIndex, scores in enumerate(tfidfList):
        # Highest weight first.
        ranked = sorted(scores, key=itemgetter(1), reverse=True)
        print(bookNames[bookIndex])
        for entry in ranked[:10]:
            print(dict[entry[0]])
def salientByBook(tfidfList, dict):
    """Write each book's ten highest-weighted words to salientByBook.txt.

    File-writing counterpart of sortScore(): after a heading line, one line
    per book of the form "<book name> - <word> <word> ... ".

    tfidfList -- one list of (word id, weight) pairs per book, in the same
                 order as the module-level bookNames.
    dict      -- mapping from word id to the word itself.
    """
    with open("salientByBook.txt", 'w') as out:
        out.write("The top 10 salient words in each book of the bible\n")
        for bookIndex, scores in enumerate(tfidfList):
            # Highest weight first.
            ranked = sorted(scores, key=itemgetter(1), reverse=True)
            out.write(bookNames[bookIndex] + " - ")
            out.writelines(dict[entry[0]] + " " for entry in ranked[:10])
            out.write("\n")
def calcLSI(tfidfList, dictionary, topics=4):
    """Fit an LSI model on the TF-IDF corpus and print its topics.

    tfidfList  -- the TF-IDF weighted corpus the model is trained on.
    dictionary -- passed to the model as id2word.
    topics     -- number of LSI topics. The default of 4 is the original
                  arbitrary choice; pass other values to experiment.

    Side effect: rebinds the module-level ``corpus_lsi`` to the corpus
    projected through the fitted model.
    """
    global corpus_lsi
    lsi = models.LsiModel(tfidfList, id2word=dictionary, num_topics=topics)
    corpus_lsi = lsi[tfidfList]
    lsi.print_topics(topics)
# Everything after this point is newer code for evaluating topics.
def calcLSA(corpus, dictionary, topics=10):
    """Train a topic model on the bag-of-words corpus and return it.

    NOTE: despite the name, the model built here is gensim's LdaModel.

    corpus     -- the corpus the model is trained on.
    dictionary -- passed to the model as id2word.
    topics     -- number of topics (defaults to the original value of 10).

    Returns the trained model. It used to be discarded, which left the call
    with no usable result.
    """
    return models.LdaModel(corpus, id2word=dictionary, num_topics=topics)
if __name__ == "__main__":
    # calcByBook() already builds the per-testament text through
    # calcTestamentScoresSort(), so a further bookToTestament() call here
    # would only repeat that work after all output has been written.
    calcByBook()