-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy paththeorem_extractor.py
More file actions
199 lines (157 loc) · 6.5 KB
/
theorem_extractor.py
File metadata and controls
199 lines (157 loc) · 6.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# theorem_extractor.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract theorem declarations from Lean 4 source files."""
import re
from typing import List, Dict
class TheoremExtractor:
"""
Extracts theorem and lemma declarations from Lean 4 source code.
This class can identify theorem/lemma declarations, extract their full text,
and optionally rename them to avoid naming conflicts.
"""
# Keywords that indicate the start of a theorem/lemma
DECLARATION_KEYWORDS = ('theorem ', 'lemma ')
# Keywords that indicate we should stop collecting lines
STOP_KEYWORDS = (
'theorem ', 'lemma ', 'def ', 'instance ', 'axiom ', 'example ',
'end', 'section ', 'namespace ', '@[', '/-', '/--', 'protected '
)
def __init__(self, rename_suffix: str = "_test"):
"""
Initialize the extractor.
Args:
rename_suffix: Suffix to append to theorem names to avoid conflicts
"""
self.rename_suffix = rename_suffix
def extract(self, lean_content: str) -> List[Dict[str, any]]:
"""
Extract all theorems from Lean source code.
Args:
lean_content: Lean 4 source code as a string
Returns:
List of dictionaries containing theorem information:
- name: Original theorem name
- renamed: New theorem name with suffix
- full_text: Original theorem text
- full_text_renamed: Theorem text with renamed declaration
- line_number: Starting line number (1-indexed)
"""
theorems = []
lines = lean_content.split('\n')
i = 0
while i < len(lines):
line = lines[i].strip()
if self._is_theorem_start(line):
theorem = self._extract_theorem(lines, i)
if theorem:
theorems.append(theorem)
i = theorem['_end_line']
else:
i += 1
else:
i += 1
return theorems
def _is_theorem_start(self, line: str) -> bool:
"""Check if a line starts a theorem declaration."""
return any(line.startswith(keyword) for keyword in self.DECLARATION_KEYWORDS)
def _collect_preceding_annotations(self, lines: List[str], theorem_idx: int) -> List[str]:
"""
Collect any @[...] annotations that appear before the theorem.
Args:
lines: All lines of the file
theorem_idx: Index where theorem declaration starts
Returns:
List of lines containing annotations (in order)
"""
annotation_lines = []
j = theorem_idx - 1
# Look backwards for annotations
while j >= 0:
line = lines[j].strip()
# Stop if we hit an empty line or non-annotation content
if line == '':
j -= 1
continue
# If it's an attribute annotation, include it
if line.startswith('@['):
annotation_lines.insert(0, lines[j])
j -= 1
else:
# Stop if we hit something that's not an annotation
break
return annotation_lines
def _extract_theorem(self, lines: List[str], start_idx: int) -> Dict[str, any]:
"""
Extract a single theorem starting at the given line.
Args:
lines: All lines of the file
start_idx: Index where theorem starts
Returns:
Dictionary with theorem information or None if extraction failed
"""
line = lines[start_idx].strip()
# Extract theorem name
name_match = re.search(r'(?:theorem|lemma)\s+(\w+)', line)
if not name_match:
return None
name = name_match.group(1)
# Collect any preceding annotations
annotation_lines = self._collect_preceding_annotations(lines, start_idx)
# Start with annotations, then the theorem declaration
theorem_lines = annotation_lines + [lines[start_idx]]
# Collect lines until we find the end
j = start_idx + 1
while j < len(lines):
next_line = lines[j].strip()
# Stop if we hit a new declaration or attribute
if self._should_stop(next_line, j, start_idx):
break
theorem_lines.append(lines[j])
j += 1
full_text = '\n'.join(theorem_lines).strip()
# Create renamed version
new_name = name + self.rename_suffix
full_text_renamed = self._rename_theorem(full_text, name, new_name)
return {
'name': name,
'renamed': new_name,
'full_text': full_text,
'full_text_renamed': full_text_renamed,
'line_number': start_idx + 1, # Still report the theorem declaration line
'_end_line': j # Internal use for iteration
}
def _should_stop(self, line: str, current_idx: int, start_idx: int) -> bool:
"""Determine if we should stop collecting lines for current theorem."""
if current_idx <= start_idx:
return False
return any(line.startswith(keyword) for keyword in self.STOP_KEYWORDS)
def _rename_theorem(self, text: str, old_name: str, new_name: str) -> str:
"""
Rename a theorem in its text.
Args:
text: Original theorem text
old_name: Original theorem name
new_name: New theorem name
Returns:
Text with renamed theorem
"""
return re.sub(
r'((?:theorem|lemma)\s+)' + re.escape(old_name) + r'\b',
r'\1' + new_name,
text,
count=1
)
def get_summary(self, theorems: List[Dict[str, any]]) -> Dict[str, any]:
"""
Get a summary of extracted theorems.
Args:
theorems: List of extracted theorems
Returns:
Dictionary with summary statistics
"""
return {
'total_count': len(theorems),
'theorem_names': [t['name'] for t in theorems],
'average_length': sum(len(t['full_text']) for t in theorems) / len(theorems) if theorems else 0,
}