-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy path5_page_answer_parse
More file actions
81 lines (68 loc) · 2.75 KB
/
5_page_answer_parse
File metadata and controls
81 lines (68 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import base64
from dotenv import load_dotenv
from groq import Groq

# SECURITY FIX: a previous revision hard-coded a live Groq API key here and
# then called load_dotenv() too late for it to matter. Never commit secrets:
# the key must come from the environment or a local (git-ignored) .env file.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast with a clear message instead of letting the first API call
    # die with an opaque authentication error.
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it or add it to a .env file"
    )

# Groq() picks up GROQ_API_KEY from the environment automatically.
client = Groq()
# ----------------------------------
# Encode image as base64 for API
# ----------------------------------
def encode_image(image_path):
    """Return the contents of *image_path* as a base64 string (UTF-8 decoded).

    Raises:
        FileNotFoundError: if *image_path* is not an existing regular file.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
# ----------------------------------
# Extract and format visible text using Groq vision model
# ----------------------------------
def extract_text_from_images(image_paths):
    """Extract question-grouped text from a batch of images via a Groq vision model.

    Each path in *image_paths* is base64-encoded and attached, together with
    a fixed extraction prompt, to a single user message. The model's reply
    text is returned verbatim.
    """
    # Encode every page up front; a missing file fails before any API call.
    encoded_pages = [encode_image(path) for path in image_paths]

    prompt = (
        "Extract only the visible text from these images, and organize it by question number.\n"
        "- Identify each question based on its number (e.g., Q1, 1., 2., etc.).\n"
        "- Group each answer under its respective question number using clear headings like 'Question 1:', 'Question 2:', etc.\n"
        "- Do NOT generate or assume any new content—only extract what's actually visible in the image.\n"
        "- Correct any spelling mistakes.\n"
        "- Preserve logical structure (e.g., headings, bullet points, tables, equations) within each answer.\n"
        "- Use clean and consistent formatting so the output is both human-readable and machine-readable.\n"
        "- Ignore decorative elements, arrows, or icons unless they contain actual text.\n"
        "- Ensure each answer appears immediately after its corresponding question number."
    )

    # One multimodal message: the prompt first, then every image as a data URL.
    message_content = [{"type": "text", "text": prompt}]
    message_content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{page}"},
        }
        for page in encoded_pages
    )

    response = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=[{"role": "user", "content": message_content}],
        temperature=0.2,
        max_completion_tokens=2048,
        top_p=1,
        stream=False,
    )
    return response.choices[0].message.content
# ----------------------------------
# Example usage
# ----------------------------------
if __name__ == "__main__":
    # Demo run: extract text from five fixed answer-sheet pages.
    pages = [f"image{n}.jpeg" for n in range(1, 6)]
    print("🔍 Extracting text from images...")
    try:
        extracted = extract_text_from_images(pages)
        print("\n📄 Extracted Text:\n")
        print(extracted)
    except Exception as e:
        # Surface any failure (missing file, API/auth error) without a traceback.
        print(f"❌ Error: {e}")