-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLanguage Identifier.py
More file actions
107 lines (91 loc) · 3.1 KB
/
Language Identifier.py
File metadata and controls
107 lines (91 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import sys
import math
def _read_letter_probs(path):
    '''Parse a file of "<LETTER> <probability>" lines into a 26-element list (index 0 is 'A').'''
    probs = [0] * 26
    with open(path, encoding='utf-8') as f:
        for line in f:
            # split() with no argument splits on any run of whitespace, so
            # tabs, doubled spaces and the trailing newline are all handled.
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at the end of the file)
                # instead of failing on tuple unpacking.
                continue
            char, prob = fields
            # ord(char) - ord('A') maps 'A' -> 0 ... 'Z' -> 25.
            probs[ord(char) - ord('A')] = float(prob)
    return probs


def get_parameter_vectors():
    '''
    This function parses e.txt and s.txt to get the 26-dimensional multinomial
    parameter vectors (character probabilities of English and Spanish) as
    described in section 1.2 of the writeup.

    Both files are read from the current working directory. Each non-blank
    line must hold an uppercase letter and its probability separated by
    whitespace. Letters that do not appear in a file keep the value 0.

    Returns: tuple of vectors (e, s), each a list of length 26 where
             index 0 is the probability of 'A' and index 25 that of 'Z'
    Raises:  FileNotFoundError if e.txt or s.txt is missing;
             ValueError if a non-blank line does not have exactly two
             fields or its probability is not a valid float
    '''
    e = _read_letter_probs('e.txt')
    s = _read_letter_probs('s.txt')
    return (e, s)
def shred(filename):
    '''
    Count how often each letter 'A'..'Z' occurs in a UTF-8 text file.

    Every line is upper-cased before counting, so lower- and upper-case
    letters share one tally; any character that is not in 'A'..'Z' after
    upper-casing is ignored. The function prints the header "Q1" followed
    by one "<letter> <count>" line per letter in alphabetical order.

    filename: path of the text file to read
    Returns:  dict mapping each of the 26 uppercase letters to its count
    '''
    # Start every letter at zero so letters absent from the file are still reported.
    tally = {chr(code): 0 for code in range(ord('A'), ord('Z') + 1)}

    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            for symbol in raw_line.upper():
                if 'A' <= symbol <= 'Z':
                    tally[symbol] += 1

    print("Q1")
    for letter in sorted(tally):
        print(f"{letter} {tally[letter]}")
    return tally
def logProb(e, s, letter_counts=None):
    '''
    Print the log-probability contribution of the letter 'A' under each language.

    Computes count('A') * log(e[0]) and count('A') * log(s[0]) and prints
    the header "Q2" followed by the two values, each formatted to four
    decimal places.

    e: English probability vector; e[0] is the probability of 'A'
    s: Spanish probability vector; s[0] is the probability of 'A'
    letter_counts: optional dict mapping uppercase letters to counts (the
        shape returned by shred). When omitted, the module-level
        `letterdict` set by the script entry point is used, which keeps
        existing two-argument calls working.
    Returns: None
    Raises:  ValueError if e[0] or s[0] is not positive (math.log domain)
    '''
    if letter_counts is None:
        # Fallback for the original calling convention, where the counts
        # live in a global created under `if __name__ == '__main__'`.
        letter_counts = letterdict

    count_a = letter_counts['A']
    log_english = count_a * math.log(e[0])
    log_spanish = count_a * math.log(s[0])

    print('Q2')
    print("{:.4f}".format(log_english))
    print("{:.4f}".format(log_spanish))
def langProb(e, s, letter_counts=None):
    '''
    Print the language scores F(English), F(Spanish) and P(English | counts).

    F(lang) = log(prior) + sum over the 26 letters of count_i * log(p_i),
    with priors 0.6 for English and 0.4 for Spanish. Prints the header
    "Q3" followed by F(English) and F(Spanish), then the header "Q4"
    followed by the English posterior; every number is formatted to four
    decimal places.

    e: English probability vector of length 26 (index 0 is 'A')
    s: Spanish probability vector of length 26 (index 0 is 'A')
    letter_counts: optional dict mapping each uppercase letter 'A'..'Z'
        to its count (the shape returned by shred). When omitted, the
        module-level `letterdict` set by the script entry point is used,
        which keeps existing two-argument calls working.
    Returns: None
    Raises:  ValueError if any entry of e or s is not positive
             (math.log domain); KeyError if a letter is missing from
             the counts
    '''
    if letter_counts is None:
        # Fallback for the original calling convention, where the counts
        # live in a global created under `if __name__ == '__main__'`.
        letter_counts = letterdict

    # Counts in 'A'..'Z' order so they line up with the indices of e and s.
    counts = [letter_counts[chr(code)] for code in range(ord('A'), ord('Z') + 1)]

    # Log prior probabilities of each language.
    log_prior_english = math.log(0.6)
    log_prior_spanish = math.log(0.4)

    f_english = log_prior_english + sum(
        n * math.log(e[i]) for i, n in enumerate(counts))
    f_spanish = log_prior_spanish + sum(
        n * math.log(s[i]) for i, n in enumerate(counts))

    print('Q3')
    print("{:.4f}".format(f_english))
    print("{:.4f}".format(f_spanish))

    # Posterior P(English | counts) = 1 / (1 + exp(F(Spanish) - F(English))).
    # Differences of 100 or more in either direction are clamped to 0 or 1
    # instead of being evaluated (presumably to sidestep extreme exponents).
    gap = f_spanish - f_english
    if gap >= 100:
        p_english = 0
    elif gap <= -100:
        p_english = 1
    else:
        p_english = 1 / (1 + math.exp(gap))

    print('Q4')
    print("{:.4f}".format(p_english))
# Script entry point
if __name__ == '__main__':
    vectors = get_parameter_vectors()
    # 'letter.txt' is the file name used for cloud testing; substitute the
    # name of the file under test when running elsewhere.
    # letterdict must remain a module-level name: logProb and langProb
    # look it up as a global.
    letterdict = shred('letter.txt')
    logProb(*vectors)
    langProb(*vectors)