"""A Subtitution Cypher Solver.
Playing around with neural network language models. Many ideas taken from
Quipster (2003):
https://people.csail.mit.edu/hasinoff/pubs/hasinoff-quipster-2003.pdf
A collab version of this is at:
https://colab.research.google.com/drive/1uDn1KVQkpxw4LLQ2UfzLDLBmXu2Ui4eh#scrollTo=xmHLz6760sm7
"""
import string
from functools import cache
from itertools import product
from random import choice
from typing import Callable

import numpy as np
import requests
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

# Letters (both cases) plus the apostrophe are the only characters modeled.
WHITELIST = string.ascii_letters + "'"

def get_english_ngrams() -> dict:
    """Get a dictionary of English 1-, 2-, and 3-gram counts."""
    urls = [
        # Nietzsche
        "https://s3.amazonaws.com/text-datasets/nietzsche.txt",
        # The Great Gatsby
        "https://www.gutenberg.org/cache/epub/64317/pg64317.txt",
    ]
    s = []
    for url in urls:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
        s.append(r.text)
    s = " ".join(s)
    cv = CountVectorizer(analyzer="char", ngram_range=(1, 3), strip_accents="unicode")
    X = cv.fit_transform([s])
    return dict(zip(cv.get_feature_names_out(), X.toarray().flatten()))

ENGLISH_NGRAMS = get_english_ngrams()
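
# Illustrative shape of the resulting table (the counts shown are
# hypothetical, not measured from the corpora above):
#   ENGLISH_NGRAMS["the"]  # e.g. 4213, a 3-gram count
#   ENGLISH_NGRAMS["th"]   # e.g. 6890, a 2-gram count
# The Markov scoring below divides such counts to estimate
# P(next char | previous two chars).
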
def markov_score(s: str) -> float:
    """Get the negative log likelihood of a string using a Markov model.

    Split into words and then use a 3-gram model on each word.
    """
    likelihood = 0
    for word in s.split(" "):
        # "*" marks the start of the word; the trailing space lets the
        # model also score the end of the word.
        affixed_word = "* " + word + " "
        for i in range(len(affixed_word) - 2):
            likelihood += get_neg_log_likelihood(affixed_word[i : i + 3])
    return likelihood
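
# Example: word = "cat" yields affixed_word = "* cat " and the scored
# windows "* c", " ca", "cat", "at ". The first window triggers the
# word-start case in get_neg_log_likelihood; the last scores the ending.
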
@cache
def get_neg_log_likelihood(s: str) -> float:
    """Get negative log likelihood of a 3-character string.

    Uses a simple 2,3-gram model based on English text.
    """
    if "*" not in s:
        # P(s[2] | s[:2]) = count(s) / count(s[:2])
        numerator_count = ENGLISH_NGRAMS.get(s, 0)
        denominator_count = ENGLISH_NGRAMS.get(s[:2], 0)
    else:
        # Word-start window "* x": the letter preceding the space is
        # unknown, so marginalize over every whitelisted character.
        numerator_count = sum(ENGLISH_NGRAMS.get(prefix + s[1:], 0) for prefix in WHITELIST)
        denominator_count = sum(
            ENGLISH_NGRAMS.get(prefix + " " + suffix, 0) for prefix, suffix in product(WHITELIST, WHITELIST)
        )
    if numerator_count == 0 or denominator_count == 0:
        # Fixed penalty for n-grams never seen in the training text.
        return 15.0
    return -np.log(numerator_count / denominator_count)
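
# Worked example: for a mid-word window such as "the" the score is
#   -log(count("the") / count("th"))
# i.e. the conditional probability P("e" | "th") under the corpus counts.
# For a word-start window such as "* t" both counts marginalize over
# WHITELIST:
#   -log(sum_c count(c + " t") / sum_{c,d} count(c + " " + d))
# which is the probability that a new word begins with "t".
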
def get_solved_text(
    s: str,
    n_trials: int = 5,
    n_steps: int = 2000,
    score_func: Callable[[str], float] = markov_score,
) -> str:
    """Get the best solved text.

    Makes random swaps and keeps the ones that reduce the negative log
    likelihood.
    """
    chars = list(set(s) - {" "})
    best_scores = []
    for _ in range(n_trials):
        old_s, old_score = s, score_func(s)
        for _ in tqdm(range(n_steps)):
            # Pick two distinct ciphertext characters and swap them.
            x = y = None
            while x == y:
                x, y = choice(chars), choice(chars)
            new_s = old_s.translate({ord(x): y, ord(y): x})
            new_score = score_func(new_s)
            if new_score < old_score:
                # Greedy hill climbing: keep only improving swaps.
                old_s, old_score = new_s, new_score
                if new_s != s:
                    best_scores += [(new_s, new_score)]
    # Fall back to the input if no swap ever improved the score.
    if not best_scores:
        return s
    return min(best_scores, key=lambda t: t[1])[0]
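
# Minimal usage sketch (the ciphertext below is a made-up example):
#   ciphertext = "wkh txlfn eurzq ira"   # "the quick brown fox", shifted
#   plaintext = get_solved_text(ciphertext, n_trials=3, n_steps=500)
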
with open("puzzles.txt") as f:
puzzles = f.readlines()
if __name__ == "__main__":
with open("solutions.txt", "w") as g:
for p in puzzles:
s, _ = get_solved_text(p.strip())
g.write(s + "\n")