-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcompCalc.py
More file actions
149 lines (124 loc) · 4.4 KB
/
compCalc.py
File metadata and controls
149 lines (124 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import csv
import sys
import ast
import re
import math
from collections import defaultdict
import pyphen
csv.field_size_limit(sys.maxsize)
def rubrikParse(rubrik):
rubrik = re.sub("[\[\]\']", "", rubrik)
rubrik = rubrik.split(", ")
if len(rubrik) > 1:
for item in rubrik:
if not "/" in item:
return item
break
else:
return rubrik[0]
def parseFile(dataTuple):
parsedData = []
fileName, idCol, wordCountCol, rubrikCol, bodyCol = dataTuple
with open(fileName,"r") as data:
reader = csv.reader(data)
next(reader)
for row in reader:
if len(row[rubrikCol]) > 0 and not int(row[wordCountCol]) == 0:
parsedData.append((
row[idCol],
int(row[wordCountCol]),
rubrikParse(row[rubrikCol]),
row[bodyCol]
))
return parsedData
def article_score(article):
# Prepare UWC Calculation
with open("common_words.txt", "r") as file:
common_words = file.readline().split(", ")
common_words[-1][:-1]
body = re.sub('[^a-zA-ZöüäÖÄÜß\- ]', '', article[3]).lower()
body_split = body.split(" ")
unique_split = list(set(body_split))
no_common_words = [i for i in unique_split if i not in common_words]
# Calculate score
uwc = len(no_common_words)
score = float(int(uwc/article[1]*10000))/100
return [uwc,score]
# Count Syllables
def count_sylls_german(text):
dic = pyphen.Pyphen(lang='de_DE')
text = re.sub("[^a-zA-ZäöüÄÖÜß ]", " ", text)
text = re.sub("\s+", " ", text)
num_of_sylls = 0
sylls_per_word = defaultdict(int)
chars_per_word = defaultdict(int)
for word in text.split(" "):
syll_counter = 0
sylls = dic.inserted(word)
syll_counter += (1 + sylls.count("-"))
num_of_sylls += syll_counter
sylls_per_word[syll_counter] += 1
chars_per_word[len(word)] += 1
return num_of_sylls, sylls_per_word, chars_per_word
# Count German Sentences in text
def count_sentences_german(text):
""" Count number of sentences by counting terminals but avoid counting abbreviations """
# Removes punctuation except terminals of sentences
text = re.sub("[^a-zA-ZäöüÄÖÜß\.!? ]", " ", text)
# Identifies the last four characters of a sentence
return len(re.findall("[a-zäöüß)]{3}[\.?!][\n\s]", text)) + 1
def wiener_sachtext_formel(text):
# Setup Text params
number_of_words = text.count(" ") + 1
number_of_sentences = count_sentences_german(text)
number_of_sylls, sylls_per_word, chars_per_word = count_sylls_german(text)
avg_sentence_length = number_of_words / number_of_sentences
avg_syllables = number_of_sylls / number_of_words
# Count Words that are particularly long
num_long_words = sum([v for k, v in sylls_per_word.items() if k >= 3])
ratio_lw = num_long_words / number_of_words
num_very_long_words = sum([v for k, v in chars_per_word.items() if k >= 6])
ratio_vlw = num_very_long_words / number_of_words
avg_syll = sylls_per_word[1] / number_of_words
wsf = calc_wsf(
ratio_lw,
avg_sentence_length,
ratio_vlw,
avg_syll
)
wsf = float(int(wsf*100)) / 100
return wsf
def calc_wsf(ms, sl, iw, es):
"""
MS: Prozentanteil der Wörter mit drei oder mehr Silben,
SL: mittlere Satzlänge (Anzahl Wörter),
IW: Prozentanteil der Wörter mit mehr als sechs Buchstaben,
ES: Prozentanteil der einsilbigen Wörter.
"""
wsf = 19.35 * ms + 0.1672 * sl + 12.97 * iw - 3.27 * es - 0.875
return wsf
def write_csv(data):
with open(use[0][:-4] + "_out.csv","w+") as file:
writer = csv.writer(file,delimiter=',')
writer.writerows(data)
def calculate_scores(parsedData):
article_scores = []
for article in parsedData:
uwc, our_score = article_score(article)
wiener_score = wiener_sachtext_formel(article[3])
if wiener_score > 12 and our_score < 1.5:
print(article[3])
article_scores.append([
article[2],
article[1],
uwc,
our_score,
wiener_score
])
return article_scores
use_goethe = ("goethe.csv", 0, 2, 1, 3)
use_cxense = ("cxense.csv", 4, 2, 11, 5)
use = use_cxense
parsed = parseFile(use)
calculated = calculate_scores(parsed)
write_csv(calculated)