-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathmemeocr.py
More file actions
75 lines (64 loc) · 2 KB
/
memeocr.py
File metadata and controls
75 lines (64 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import sys
import os
import subprocess
import re
import cv2
class MemeOCR:
    """Extract caption text from an image by thresholding and running Tesseract.

    The image is binarized so that near-white pixels become black and all
    other pixels become white, written to a temporary JPEG, and handed to the
    ``tesseract`` command-line tool.  The text file Tesseract produces is then
    read back and split into blocks.
    """

    def __init__(self):
        # A pixel counts as "white" only when every channel is >= this value.
        self._white_thresh = 240
        # Scratch files used to exchange data with the tesseract process.
        self._tmp_image_fname = './memeocr.jpg'
        self._tmp_txt_base = './memeocr'
        # Path of the text file read back after tesseract has run.
        self._tmp_txt_fname = self._tmp_txt_base + '.txt'
        self._template_image = None
        # Set to True to leave the scratch files on disk for debugging.
        self._keep_tmp_files = False

    def set_template(self, fname):
        """Load ``fname`` as the template image (``None`` if it can't be read)."""
        self._template_image = self._read_image(fname)

    def recognize(self, fname):
        """Run OCR on the image at ``fname``.

        Returns:
            A list of text blocks (internal whitespace collapsed to single
            spaces), or ``None`` when the image cannot be read, tesseract
            reports failure, or no output text file was produced.
        """
        img = self._read_image(fname)
        if img is None:
            # Nothing was written for this input; running tesseract now would
            # OCR a stale scratch image left over from a previous call.
            return None
        try:
            self._thresh_words(img, self._template_image)
            if not self._exec_tesseract():
                # Don't read a text file that may belong to an earlier run.
                return None
            return self._read_txt()
        finally:
            # Clean up even when thresholding or tesseract raised.
            self._delete_tmp_files()

    def _read_image(self, fname):
        """Read ``fname`` with OpenCV; return the image array or ``None``."""
        # cv2.imread normally signals failure by returning None; the IOError
        # handler is kept as a defensive fallback.
        try:
            img = cv2.imread(fname)
        except IOError:
            img = None
        return img

    def _binarize(self, img):
        """Threshold ``img`` in place and return it.

        Pixels whose channels are all >= ``_white_thresh`` become black (0);
        every other pixel becomes white (255).  Accepts (H, W, C) colour
        arrays as well as (H, W) single-channel arrays.
        """
        white = img >= self._white_thresh
        if img.ndim == 3:
            # A pixel is white only if all of its channels pass the threshold.
            white = white.all(axis=2)
        # The mask is fully computed before any pixel is overwritten.
        img[white] = 0
        img[~white] = 255
        return img

    def _thresh_words(self, img, template):
        """Binarize ``img`` and write it to the temporary image file.

        ``template`` is accepted for interface compatibility but is currently
        unused.  Does nothing when ``img`` is ``None``.
        """
        if img is None:
            return
        cv2.imwrite(self._tmp_image_fname, self._binarize(img))

    def _exec_tesseract(self):
        """Run tesseract (English) on the temporary image.

        Returns:
            ``True`` when the process exited with status 0, else ``False``.
        """
        result = subprocess.run([
            'tesseract',
            '-l',
            'eng',
            self._tmp_image_fname,
            self._tmp_txt_base,
        ])
        return result.returncode == 0

    def _read_txt(self):
        """Read tesseract's output file and split it into text blocks.

        Blocks are separated by a blank line; whitespace-only blocks are
        dropped and each run of whitespace inside a block is collapsed to a
        single space.  Returns ``None`` if the file cannot be opened.
        """
        try:
            # Tesseract emits UTF-8 text; don't depend on the locale default.
            with open(self._tmp_txt_fname, encoding='utf-8') as fr:
                content = fr.read()
        except IOError:
            return None
        blocks = re.split(r'\n\n', content)
        return [re.sub(r'\s+', ' ', block) for block in blocks if block.strip()]

    def _delete_tmp_files(self):
        """Remove the scratch image and text files unless asked to keep them."""
        if self._keep_tmp_files:
            return
        for path in (self._tmp_image_fname, self._tmp_txt_fname):
            if os.path.exists(path):
                os.remove(path)