# Stack10/stake.py at main · HappyRIO/Stack10 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import asyncio
from telethon import TelegramClient, events
from telethon.tl.types import MessageMediaPhoto
from PIL import Image
import pytesseract
import cv2
import numpy as np
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading the settings below
load_dotenv()

# Telegram API credentials, the channel to watch, and the trigger text,
# all read from the environment (each is None when the variable is unset)
api_id = os.environ.get('API_ID')
api_hash = os.environ.get('API_HASH')
channel_username = os.environ.get('CHANNEL_USERNAME')
special_text = os.environ.get('SPECIFIC_TEXT')

# Working folders: one for downloaded images, one for their preprocessed copies
download_dir = 'downloads'
output_dir = 'preprocessed'
for _folder in (download_dir, output_dir):
    os.makedirs(_folder, exist_ok=True)

async def main():
    """Watch the configured channel and OCR the image that follows a trigger message.

    Connects a Telethon client, registers a new-message handler for
    ``channel_username`` and runs until the client disconnects. When a
    message contains ``special_text``, the message with the next id is
    fetched and handed to ``download_images``.

    Raises:
        ValueError: If ``SPECIFIC_TEXT`` is not set; without it the
            substring test in the handler would raise ``TypeError`` on
            every incoming message.
    """
    # Fail fast on a missing trigger text instead of erroring per message
    if special_text is None:
        raise ValueError('SPECIFIC_TEXT is not set; cannot match trigger messages.')

    async with TelegramClient('session_name', api_id, api_hash) as client:
        # Retrieve the channel
        channel = await client.get_entity(channel_username)

        # Event handler for new messages in the channel
        @client.on(events.NewMessage(chats=channel))
        async def handler(event):
            message = event.message

            # Print the message text for visibility
            print(f'Message: {message.message}')

            # A message may carry no text; treat that as an empty string
            # so the substring test below cannot raise TypeError
            text = message.message or ''
            if special_text not in text:
                return

            # The image is expected in the message with the following id.
            # NOTE(review): that message may not exist yet when this handler
            # runs; the falsy check below then skips it - confirm intended.
            next_message = await client.get_messages(channel, ids=message.id + 1)
            if next_message:
                print(f'Image: {next_message.message}')
                await download_images(next_message, client)

        # Run the client until interrupted
        print(f'Listening for new messages in {channel_username}...')
        await client.run_until_disconnected()

async def download_images(message, client):
    """Save a photo message to ``download_dir`` and run OCR on the saved file.

    Messages without media, or whose media is not a ``MessageMediaPhoto``,
    are ignored.

    Args:
        message: Message object exposing ``media`` and ``id``.
        client: Client used to download the media.
    """
    # Guard clause: only photo attachments are handled
    if not message.media or not isinstance(message.media, MessageMediaPhoto):
        return

    # The file is named after the message id
    target_path = os.path.join(download_dir, f'{message.id}.jpg')
    await client.download_media(message.media, file=target_path)
    print(f'Downloaded: {target_path}')

    extract_text_from_image(target_path, language='eng+spa')

def preprocess_image(image_path):
    """Load an image, binarize it for OCR, and save the result in ``output_dir``.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian blur
    and thresholded to black and white. The result is written under
    ``output_dir`` using the same base file name as the input.

    Args:
        image_path: Path of the image file to preprocess.

    Returns:
        Path of the saved preprocessed image.

    Raises:
        ValueError: If the image cannot be read.
        OSError: If the preprocessed image cannot be written.
    """
    img_cv = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising;
    # check here so the error names the offending file
    if img_cv is None:
        raise ValueError(f"Could not read image '{image_path}'")

    # Convert to grayscale and apply Gaussian blur
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    # NOTE(review): per OpenCV's documentation, THRESH_OTSU picks the
    # threshold automatically, so the 200 passed here is presumably unused
    _, threshed = cv2.threshold(blur, 200, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Save the preprocessed image
    preprocessed_path = os.path.join(output_dir, os.path.basename(image_path))
    # cv2.imwrite reports failure through its boolean return value
    if not cv2.imwrite(preprocessed_path, threshed):
        raise OSError(f"Could not write preprocessed image '{preprocessed_path}'")

    return preprocessed_path

def extract_text_from_image(file_path, language: str = 'eng+spa'):
    """Run OCR on one image and append the text to ``extracted_text.txt``.

    The image is preprocessed with ``preprocess_image`` and passed to
    pytesseract, and the recognized text is appended to the output file
    between separator lines. Processing errors are printed, not raised.
    The input image and its preprocessed copy are removed afterwards,
    whether or not processing succeeded.

    Args:
        file_path: Path of the image to process; the file is deleted
            when the function finishes.
        language: Tesseract language specification passed as ``lang``.
    """
    output_file_path = 'extracted_text.txt'  # Output file for extracted text
    separator = '>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<\n'

    # Stays None if preprocessing fails, so cleanup knows there is no
    # preprocessed copy to delete
    preprocessed_image_path = None
    try:
        # Preprocess the image and get the path of the preprocessed image
        preprocessed_image_path = preprocess_image(file_path)

        # Extract text using pytesseract
        custom_config = r'--oem 3 --psm 6'
        extracted_text = pytesseract.image_to_string(
            preprocessed_image_path, lang=language, config=custom_config
        )

        # Append the extracted text to the output file
        with open(output_file_path, 'a', encoding='utf-8') as output_file:
            output_file.write(separator)
            output_file.write(f'Extracted from: {file_path}\n')
            output_file.write(extracted_text + '\n')
            output_file.write(separator + '\n')

        print(f"Image '{file_path}' successfully processed.")
    except Exception as e:
        print(f"An error occurred while processing '{file_path}': {e}")
    finally:
        # Remove the downloaded image and its preprocessed copy; a failed
        # removal is reported but must not mask the outcome above
        for temp_path in (file_path, preprocessed_image_path):
            if temp_path is None:
                continue
            try:
                os.remove(temp_path)
            except OSError as cleanup_error:
                print(f"Could not remove '{temp_path}': {cleanup_error}")

if __name__ == '__main__':
    # Choose the Tesseract executable. An explicit TESSERACT_CMD environment
    # variable wins. Otherwise the default Windows install path is used, but
    # only when that file exists, so other systems keep pytesseract's own
    # default command.
    default_windows_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    configured_cmd = os.getenv('TESSERACT_CMD')
    if configured_cmd:
        pytesseract.pytesseract.tesseract_cmd = configured_cmd
    elif os.path.isfile(default_windows_cmd):
        pytesseract.pytesseract.tesseract_cmd = default_windows_cmd

    # Run the main function
    asyncio.run(main())