-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetadata.py
More file actions
182 lines (152 loc) · 7.71 KB
/
metadata.py
File metadata and controls
182 lines (152 loc) · 7.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""Reading and a little writing the various metadata files."""
import glob
import os.path
import re
import typing as tg
# Pseudo type for strings of the form "mypath/EMSE-2021/AbuDab21.pdf"
# (see split_entry() for how such a string is taken apart).
Entry = str
# One pair of annotated files plus who coded each: (file1, coder1, file2, coder2)
Filepair = tg.Tuple[str, str, str, str]
def citekey(list_line: Entry) -> str:
    """Return the citekey part of an entry, e.g. AbuDab21 for EMSE-2021/AbuDab21.pdf"""
    _volume_part, citekey_part = split_entry(list_line)
    return citekey_part
def split_entry(list_line: Entry) -> tg.Tuple[str, str]:
    """
    From a line like volumes/EMSE-2021/AbuDab21.pdf return its semantic parts EMSE-2021 and AbuDab21.

    The first part is the name of the directory that directly contains the PDF,
    the second part is the PDF's file name without the ".pdf" suffix.
    Any path components before that directory are ignored.

    Raises ValueError if list_line does not end in <directory>/<name>.pdf
    """
    mm = re.search(r"([^/]+)/([^/]+)\.pdf$", list_line)
    if mm is None:  # explicit check: an assert would vanish under "python -O"
        raise ValueError(f"not an entry of the form 'volume/citekey.pdf': {list_line!r}")
    return mm.group(1), mm.group(2)
def volume(list_line: Entry) -> str:
    """Return the volume part of an entry, e.g. EMSE-2021 for EMSE-2021/AbuDab21.pdf"""
    volume_part, _citekey_part = split_entry(list_line)
    return volume_part
def read_list(filename: str) -> tg.List[Entry]:
    """
    Read the list file `filename` and return its lines, without line terminators.

    A file normally ends with a newline, which makes split() produce one empty
    final item; only that item is dropped. A last line that lacks the terminating
    newline is therefore kept (an unconditional pop() would silently lose it).
    Empty lines elsewhere in the file are returned as empty strings.
    """
    with open(filename, 'rt', encoding='utf-8') as lst:
        entries = lst.read().split('\n')
    if entries[-1] == '':  # split() always returns at least one item
        entries.pop()
    return entries
def write_list(to: str, entries: tg.Iterator[Entry]):
    """Write the file `to` anew with one newline-terminated line per element of `entries`."""
    with open(to, 'w', encoding='utf8') as out:
        out.writelines(f"{entry}\n" for entry in entries)
class Venue:
    """
    Understands sample.list and can determine which venue or volume a citekey belongs to.

    Every non-blank line of workdir/sample.list must start like TSE-2021/LiuKimBis21.pdf,
    that is <venue>-<number>/<citekey>.<anything>
    """
    FILENAME = "sample.list"
    ENTRY_REGEXP = r"(\w+)-(\d+)/([\w-]+)\."  # e.g. TSE-2021/LiuKimBis21.pdf

    def __init__(self, workdir: str):
        """
        Read workdir/sample.list and index its entries by citekey.

        Blank lines are skipped. Any other line that does not match ENTRY_REGEXP
        raises ValueError naming the file and line number.
        If a citekey occurs more than once, the last occurrence wins.
        """
        self.workdir = workdir
        self.venue: tg.Dict[str, str] = {}  # citekey -> venue name (e.g. TSE)
        self.volume: tg.Dict[str, str] = {}  # citekey -> volume name (e.g. TSE22 for TSE-2022 or TSE48 for TSE-48)
        listfile = f"{workdir}/{self.FILENAME}"
        with open(listfile, 'r', encoding='utf8') as f:
            lines = f.readlines()
        for lineno, line in enumerate(lines, start=1):
            if not line.strip():
                continue  # tolerate empty lines instead of crashing on a failed match
            mm = re.match(self.ENTRY_REGEXP, line)
            if mm is None:
                raise ValueError(f"{listfile}:{lineno}: malformed entry {line.rstrip()!r}")
            venue, number, citekey_ = mm.groups()
            if re.fullmatch(r"20\d\d", number):
                number = number[2:]  # remove century, leaving only a two-digit year
            self.venue[citekey_] = venue
            self.volume[citekey_] = f"{venue}{number}"

    def venue_of(self, citekey_: str) -> str:
        """Return the venue name (e.g. TSE) of citekey_; raises KeyError if it is not in sample.list."""
        return self.venue[citekey_]

    def volume_of(self, citekey_: str) -> str:
        """Return the volume name (e.g. TSE22) of citekey_; raises KeyError if it is not in sample.list."""
        return self.volume[citekey_]
class WhoWhat:
    """
    Can tell which coder annotated which file and which pairs of files to compare.
    Hides the following secrets:
    1. The filename of the who/what file.
    2. The format of that file (for reading only)
    3. The meaning of the entries in that file (blocks, reservations, implied filenames, coder_letters)
    4. Which pairs of annotated files should be compared
    Secret 3 is non-fixed wrt the A/B subdirectories, the names of which may involve a variable prefix.
    This makes _subdir_prefix() necessary and makes citekey() and coder_letter() a bit ugly.

    Format of the who/what file as read by __init__():
    - Lines starting with '#' are comments; those matching BLOCKHEADER_REGEXP start a new block.
    - Blank lines are ignored.
    - Any other line is "<citekey> <column> <column> ...", separated by whitespace.
      Column 0 stands for subdirectory A, column 1 for B, and so on. Each column is
      either a coder name or, if it starts with '-', a reservation (a non-entry).
    """
    WHOWHAT_FILE = "sample-who-what.txt"  # in workdir
    FILENAMEPATTERN = r"/(.*)?([A-Z])/(\w+)\.txt$"
    BLOCKHEADER_REGEXP = r"^#---+ [Bb]lock (\d+)"

    def __init__(self, workdir: str):
        """Read workdir/sample-who-what.txt and collect coders, per-file data, and file pairs."""
        self.workdir = workdir
        self.subdir_prefix = self._subdir_prefix()
        self.coders: tg.Set[str] = set()
        self._coder_of: tg.Dict[str, str] = {}  # filename -> codername
        self._block_of: tg.Dict[str, str] = {}  # filename -> blockname
        self._pairs: tg.List[Filepair] = []
        with open(f"{workdir}/{self.WHOWHAT_FILE}", 'r', encoding='utf8') as f:
            lines = f.readlines()
        currentblock = ""  # block number as a string
        for line in lines:
            if line.startswith("#"):
                mm = re.match(self.BLOCKHEADER_REGEXP, line)
                if mm:
                    currentblock = mm.group(1)
                continue
            parts = line.split()  # splits on any whitespace run, ignoring leading/trailing whitespace
            if not parts:
                continue  # blank line: there is no citekey to read
            citekey_ = parts[0]
            columns = parts[1:]  # list of coder names or reservations
            # --- collect coders and single-file entries:
            for index, coder in enumerate(columns):
                if self.is_reservation(coder):
                    continue  # reservations are non-entries
                self.coders.add(coder)
                filename = self._implied_filename(citekey_, index)
                self._coder_of[filename] = coder
                self._block_of[filename] = currentblock
            # --- collect filepair entries, using either _build_pairs_with_A or _build_neighboring_pairs:
            self._pairs.extend(self._build_pairs_with_A(citekey_, columns))

    def blockname(self, filename: str) -> str:
        """Which block does this file belong to? Raises KeyError for a file not listed in the who/what file."""
        return self._block_of[filename]

    def citekey(self, filename: str) -> str:
        """Return the citekey part of a filename as built by _implied_filename()."""
        return self._filenamepart(filename, 3)

    def coder_letter(self, filename: str) -> str:
        """Return the subdirectory letter (A, B, ...) of a filename as built by _implied_filename()."""
        return self._filenamepart(filename, 2)

    def files_of(self, coder: str) -> tg.Generator[str, None, None]:
        """Generate the names of all files annotated by `coder`."""
        for myfile, mycoder in self._coder_of.items():
            if mycoder == coder:
                yield myfile

    @staticmethod
    def is_reservation(codername: str) -> bool:
        """A column entry starting with '-' is a reservation, not a coder."""
        return codername.startswith('-')

    @property
    def pairs(self) -> tg.Generator[Filepair, None, None]:
        """Generate those collected file pairs in which neither side is a reservation."""
        for file1, coder1, file2, coder2 in self._pairs:
            if not self.is_reservation(coder1) and not self.is_reservation(coder2):
                yield file1, coder1, file2, coder2

    def _implied_filename(self, citekey_: str, columnindex: int) -> str:
        """Knows the abstracts.A, abstracts.B dirname convention. First such column has index 0."""
        char = chr(ord('A') + columnindex)  # 26 columns maximum (we'll never need more than 10)
        return f"{self.workdir}/{self.subdir_prefix}{char}/{citekey_}.txt"

    def _filenamepart(self, filename: str, which: int) -> str:
        """
        Partial inverse of _implied_filename(): return group `which` of FILENAMEPATTERN
        (2 is the subdirectory letter, 3 is the citekey).
        Raises ValueError if filename does not match FILENAMEPATTERN.
        """
        mm = re.search(self.FILENAMEPATTERN, filename)
        if mm is None:
            raise ValueError(f"filename does not match {self.FILENAMEPATTERN!r}: {filename!r}")
        return mm.group(which)

    def _subdir_prefix(self) -> str:
        """
        Determine which, if any, prefix is used for the names of the extracts-holding subdirectories.
        For workdir/abstracts.A would yield "abstracts.",
        for workdir/A would yield "".
        Asserts that exactly one name in workdir matches the glob pattern *A.
        """
        results = glob.glob(f"{self.workdir}/*A")
        assert len(results) == 1, f"surprising glob result: {results}"
        mm = re.search(r"(.*)A$", os.path.basename(results[0]))  # find prefix in last part of path
        return mm.group(1)

    def _build_neighboring_pairs(self, citekey_: str, columns: tg.Sequence[str]) -> tg.Generator[Filepair, None, None]:
        """
        Generator for pairs of neighboring entries A/B, C/D etc. (whether reservation or not).
        """
        firstentry = None  # we return a pair whenever we have a first and find another entry
        for index, coder in enumerate(columns):
            if firstentry:
                mypair = (firstentry[0], firstentry[1], self._implied_filename(citekey_, index), coder)
                firstentry = None
                yield mypair
            else:
                firstentry = (self._implied_filename(citekey_, index), coder)
        # firstentry may be set when the loop finishes, leaving an unpaired entry. C'est la vie!

    def _build_pairs_with_A(self, citekey_: str, columns: tg.Sequence[str]) -> tg.Generator[Filepair, None, None]:
        """
        Generator for pairs A/B, A/C, A/D etc. (whether reservation or not).
        """
        if not columns:
            return  # there are zero pairs
        coder_A = columns[0]
        file_of_A = self._implied_filename(citekey_, 0)
        for index, coder in enumerate(columns[1:], start=1):
            yield file_of_A, coder_A, self._implied_filename(citekey_, index), coder