-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathai_processor.py
More file actions
494 lines (405 loc) · 22.1 KB
/
ai_processor.py
File metadata and controls
494 lines (405 loc) · 22.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
import config
import json
import requests
import os
import re # Import regular expression module
import asyncio
import time
from typing import Optional
# --- Configuration ---
# IMPORTANT: Replace "YOUR_GOOGLE_API_KEY" with the actual Google API key
# you obtained from the Google Cloud Console (APIs & Services -> Credentials).
# For production systems, it is highly recommended to load this from an
# environment variable or a secure secret management service.
# Example: GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY")
GOOGLE_API_KEY = config.GOOGLE_API_KEY
# Base URL for the Gemini API (Generative Language API)
# NOTE(review): call_gemini_api() builds its own URL from the model name and
# never reads this constant -- it appears unused in this file; confirm against
# other modules before removing.
GEMINI_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
# Rate limiting configuration
RATE_LIMIT_DELAY: float = 0.5 # Delay between API calls in seconds
MAX_RETRIES: int = 3 # Maximum number of retries for failed requests
# --- Helper Function for Gemini API Calls ---
async def call_gemini_api(prompt: str, schema: dict = None, model: str = "gemini-2.0-flash") -> Optional[str]:
    """
    Makes an asynchronous call to the Gemini API with a given prompt and optional schema.

    Args:
        prompt (str): The text prompt to send to the Gemini model.
        schema (dict, optional): An optional JSON schema for structured responses. Defaults to None.
        model (str, optional): The Gemini model to use. Defaults to "gemini-2.0-flash".

    Returns:
        Optional[str]: The text content of the API response (a JSON string when a
        schema was supplied, which the caller must json.loads()), or None if an
        error occurs or no content is returned.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={GOOGLE_API_KEY}"
    headers = {'Content-Type': 'application/json'}
    payload = {
        "contents": [{"role": "user", "parts": [{"text": prompt}]}]
    }
    if schema:
        # Ask Gemini to return JSON conforming to the supplied schema.
        payload["generationConfig"] = {
            "responseMimeType": "application/json",
            "responseSchema": schema
        }
    try:
        # BUG FIX (two issues):
        # 1. requests is a blocking library; calling it directly inside an
        #    async def stalls the whole event loop for the duration of the
        #    HTTP round-trip. Run it in a worker thread instead.
        # 2. The original call had no timeout, which made the Timeout handler
        #    below dead code and let a stalled connection hang forever.
        response = await asyncio.to_thread(
            requests.post, url, headers=headers, json=payload, timeout=30
        )
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        result = response.json()
        candidates = result.get('candidates')
        if candidates and candidates[0].get('content') and candidates[0]['content'].get('parts'):
            # The payload lives in the same place whether it is plain text or
            # a schema-constrained JSON string (the original if/else here
            # returned the identical expression on both branches).
            return candidates[0]['content']['parts'][0]['text']
        print(f"Gemini API response did not contain expected content: {result}")
        return None
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - Response: {response.text}")
        return None
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")
        return None
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error occurred: {timeout_err}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"An unexpected error occurred: {req_err}")
        return None
    except json.JSONDecodeError as json_err:
        print(f"Failed to decode JSON response: {json_err} - Raw response: {response.text}")
        return None
# --- AI Processing Functions ---
async def analyze_and_summarize(text: str, language: str = 'en') -> dict:
    """
    Analyzes a football news article with Gemini and extracts:
    - A concise summary
    - Key entities (teams, players, etc.)
    - The main topic (e.g., "Transfer News")

    Args:
        text (str): The article content to analyze
        language (str): Language code for the article (e.g., 'en', 'fa', 'es').
            Currently unused; kept for interface compatibility with callers.

    Returns:
        dict: Analysis results with keys "summary", "entities", "topic"
              (matching the response schema requested below; the original
              docstring incorrectly advertised "event_type"/"quality_score").
              Returns empty dict if analysis fails.
    """
    prompt = f"""Analyze the following sports news and provide:
1. A concise summary (max 3 sentences).
2. A list of key entities (e.g., team names, player names, event names, specific dates mentioned).
3. The main topic (e.g., "Transfer News", "Match Result", "Injury Update").
Return the output strictly in JSON format with keys: "summary", "entities", "topic".
Example JSON: {{"summary": "...", "entities": ["...", "..."], "topic": "..."}}
News:
{text}
"""
    # Define the expected JSON schema for structured output from Gemini
    schema = {
        "type": "OBJECT",
        "properties": {
            "summary": {"type": "STRING"},
            "entities": {"type": "ARRAY", "items": {"type": "STRING"}},
            "topic": {"type": "STRING"}
        },
        "required": ["summary", "entities", "topic"]
    }
    print("Calling Gemini for analysis and summarization...")
    json_response_str = await call_gemini_api(prompt, schema=schema)
    if not json_response_str:
        return {}
    try:
        parsed_response = json.loads(json_response_str)
    except json.JSONDecodeError:
        print(f"Failed to parse JSON for analysis: {json_response_str}")
        return {}
    if not isinstance(parsed_response, dict):
        # Defend against the model returning a bare list/string despite the schema.
        print(f"Failed to parse JSON for analysis: {json_response_str}")
        return {}
    # BUG FIX: the original sliced parsed_response.get('summary')[:50], which
    # raises TypeError when "summary" is missing or null.
    summary_preview = (parsed_response.get('summary') or '')[:50]
    print(f"Analysis successful. Summary: {summary_preview}...")
    return parsed_response
async def translate_to_farsi(text: str) -> str:
    """
    Translates English text to Farsi using Gemini.

    The model sometimes returns multiple options or explanations despite being
    asked not to; this keeps only the text before the first such header.

    Args:
        text (str): The English text to translate.

    Returns:
        str: The translated Farsi text, or an empty string if translation fails.
    """
    prompt = f"Translate the following English sports news summary into fluent and natural Farsi. Ensure the translation maintains a journalistic tone suitable for a news channel. Provide ONLY the Farsi translation text, without any explanations or multiple options:\n\n{text}"
    print("Calling Gemini for Farsi translation...")
    farsi_text_raw = await call_gemini_api(prompt)
    if not farsi_text_raw:
        return ""
    # BUG FIX (dead code): re.split always returns a list with at least one
    # element, so the original `if match:` guard could never be false. Take
    # the text before the first "Option"/"Explanation" header directly.
    parts = re.split(r"^(?:Option|Explanation|توضیح)\s*\d*\s*[:(]", farsi_text_raw, flags=re.MULTILINE | re.IGNORECASE)
    farsi_text = parts[0].strip()
    # Strip any leaked "Original:"/"Polished:" style labels as well.
    farsi_text = re.sub(r"^(?:Original|Polished|اصلی|ویرایش شده)\s*:\s*", "", farsi_text, flags=re.MULTILINE)
    farsi_text = farsi_text.strip()
    if farsi_text:
        print(f"Translation successful. Farsi: {farsi_text[:50]}...")
        return farsi_text
    print(f"Could not extract clean Farsi translation from: {farsi_text_raw[:100]}...")
    return farsi_text_raw  # Fallback to raw if extraction fails
async def polish_farsi_text(text: str) -> str:
    """
    Polishes and refines Farsi text for better readability, grammar, and tone using Gemini.

    The model sometimes returns several options or change notes despite being
    asked not to; this keeps only the text before the first such header.

    Args:
        text (str): The Farsi text to polish.

    Returns:
        str: The polished Farsi text, or an empty string if polishing fails.
    """
    prompt = f"""Review and polish the following Farsi sports news text.
Improve its grammar, word choice, sentence structure, and overall flow to make it sound
more professional, engaging, and suitable for a Telegram news channel.
Do not add or remove factual information. Only refine the language.
Provide ONLY the polished Farsi text, without any explanations, multiple options, or headings like 'Polished Options':
Farsi Text to Polish:
{text}
"""
    print("Calling Gemini for Farsi text polishing...")
    polished_text_raw = await call_gemini_api(prompt)
    if not polished_text_raw:
        return ""
    # BUG FIX (dead code): re.split always returns a list with at least one
    # element, so the original `if match:` guard could never be false. Take
    # the text before the first "Option"/"Explanation"/"Changes" header directly.
    parts = re.split(r"^(?:Option|Explanation|Changes|توضیح|تغییرات)\s*\d*\s*[:(]", polished_text_raw, flags=re.MULTILINE | re.IGNORECASE)
    polished_text = parts[0].strip()
    # Strip any leaked "Original:"/"Polished:" style labels as well.
    polished_text = re.sub(r"^(?:Original|Polished|اصلی|ویرایش شده)\s*:\s*", "", polished_text, flags=re.MULTILINE)
    polished_text = polished_text.strip()
    if polished_text:
        print(f"Polishing successful. Polished Farsi: {polished_text[:50]}...")
        return polished_text
    print(f"Could not extract clean polished Farsi text from: {polished_text_raw[:100]}...")
    return polished_text_raw  # Fallback to raw if extraction fails
async def generate_post_title(summary: str, original_content: str, entities: list, lang: str = 'en') -> str:
    """
    Generates a short, catchy, and attention-grabbing news title for Telegram posts using Gemini API.

    Falls back to the first sentence of the summary when the API call fails or
    yields nothing usable.

    Args:
        summary (str): The news summary
        original_content (str): The original news article content
        entities (list): List of key entities (team names, player names, etc.)
        lang (str): Target language ('en' for English, 'fa' for Farsi)

    Returns:
        str: A catchy, bold-worthy news title (1-2 sentences max), or empty string if generation fails
    """
    # Prepare language-specific instructions
    if lang == 'fa':
        language_instruction = "Generate the title in fluent and natural Farsi (Persian)."
        example_format = "Example: 'بارسا با گل دقیقه آخر رئال مادرید را شکست داد!'"
    else:
        language_instruction = "Generate the title in English."
        example_format = "Example: 'Barcelona defeats Real Madrid with last-minute goal!'"
    # Create entities string for context
    entities_str = ", ".join(entities[:5]) if entities else "football news"
    prompt = f"""Generate a SHORT, CATCHY, and ATTENTION-GRABBING news title for a Telegram post.
Requirements:
- Maximum 1-2 sentences
- Bold-worthy and engaging
- News headline style
- {language_instruction}
- Focus on the most exciting aspect of the news
- Use action words and create urgency/excitement
Context:
- Key entities: {entities_str}
- News summary: {summary}
- Original content: {original_content[:300]}...
{example_format}
Generate ONLY the title text without any explanations, formatting, or additional text:
"""
    print("Calling Gemini for post title generation...")
    try:
        # Add rate limiting
        await asyncio.sleep(RATE_LIMIT_DELAY)
        title_response = await call_gemini_api(prompt)
        if title_response:
            # Clean up the response - remove any formatting, quotes, or explanations
            title = title_response.strip()
            # Decorations the model tends to wrap titles in.
            prefixes_to_remove = [
                "Title:", "Headline:", "عنوان:", "تیتر:",
                "**", "*", '"', "'", "«", "»"
            ]
            # BUG FIX: the original stripped each decoration at most once, so
            # nested wrappers like '**"Title"**' were only partially removed.
            # Keep stripping until the string stops changing.
            stripped = True
            while stripped and title:
                stripped = False
                for prefix in prefixes_to_remove:
                    if title.startswith(prefix):
                        title = title[len(prefix):].strip()
                        stripped = True
                    if title.endswith(prefix):
                        title = title[:-len(prefix)].strip()
                        stripped = True
            # Remove any remaining markdown formatting
            title = re.sub(r'\*\*(.+?)\*\*', r'\1', title)  # Remove bold formatting
            title = re.sub(r'\*(.+?)\*', r'\1', title)  # Remove italic formatting
            # Ensure the title is not too long (max 150 characters for title)
            if len(title) > 150:
                title = title[:147] + "..."
            if title:
                print(f"Post title generation successful: {title[:50]}...")
                return title
        print("Could not generate post title from Gemini API response")
    except Exception as e:
        print(f"Error generating post title: {e}")
    # Fallback: Create a simple title from the summary
    if summary:
        # Take first sentence or first 100 characters of summary
        fallback_title = summary.split('.')[0].strip()
        if len(fallback_title) > 100:
            fallback_title = fallback_title[:97] + "..."
        print(f"Using fallback title: {fallback_title[:50]}...")
        return fallback_title
    return ""
async def score_news_importance(text: str, language: str = 'en') -> float:
    """
    Evaluates the importance and engagement potential of a football news article using Gemini API.

    This function helps prioritize the most catching and efficient articles for
    publication. Retries with exponential backoff on every failure mode: raised
    exceptions, empty API responses, and responses the score cannot be parsed from.

    Args:
        text (str): The news article content to evaluate (original content or summary).
        language (str): Language code for the article (e.g., 'en', 'fa', 'es').
            Currently unused; kept for interface compatibility with callers.

    Returns:
        float: Importance score between 0.0 and 1.0, where 1.0 represents the highest importance.
               Returns 0.0 if scoring fails after all retries.
    """
    prompt = f"""Evaluate the importance and engagement potential of this football news article on a scale of 1 to 10.
Consider these factors:
1. **Breaking/Transfer News**: Major signings, departures, or contract renewals
2. **Match Results**: Especially big games, derbies, or unexpected results
3. **Star Players**: News involving well-known players like Messi, Ronaldo, Mbappe, etc.
4. **Big Clubs**: News about Manchester United, Barcelona, Real Madrid, Liverpool, etc.
5. **Controversies**: Scandals, disciplinary actions, or disputes
6. **Injuries**: Significant injuries to key players
7. **Records/Achievements**: Historic milestones or record-breaking performances
8. **International Football**: World Cup, Euro, Copa America news
9. **Managerial Changes**: Hiring/firing of coaches at major clubs
10. **Financial News**: Major transfers, club finances, or regulatory issues
Provide your response in this exact format:
SCORE: [number from 1-10]
REASON: [brief explanation of why this score was assigned]
Article to evaluate:
{text}
"""
    print("Calling Gemini for news importance scoring...")
    for attempt in range(MAX_RETRIES):
        try:
            # Rate limiting: Add delay between requests
            await asyncio.sleep(RATE_LIMIT_DELAY)
            response = await call_gemini_api(prompt)
            if response:
                # Extract the score from the response
                score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", response, re.IGNORECASE)
                if score_match:
                    score = float(score_match.group(1))
                    # Normalize score to 0.0-1.0 range
                    normalized_score = min(max(score / 10.0, 0.0), 1.0)
                    # Extract reason for logging
                    reason_match = re.search(r"REASON:\s*(.+)", response, re.IGNORECASE | re.DOTALL)
                    reason = reason_match.group(1).strip() if reason_match else "No reason provided"
                    print(f"Importance scoring successful. Score: {normalized_score:.2f}/1.0")
                    print(f"Reason: {reason[:100]}...")
                    return normalized_score
                print(f"Could not extract score from response: {response[:100]}...")
        except Exception as e:
            print(f"Error on attempt {attempt + 1}/{MAX_RETRIES}: {e}")
        # BUG FIX: the original only backed off after an exception; an empty or
        # unparseable response retried immediately with no extra delay. Apply
        # exponential backoff on every failed attempt.
        if attempt < MAX_RETRIES - 1:
            wait_time = (2 ** attempt) * RATE_LIMIT_DELAY
            print(f"Retrying in {wait_time:.1f} seconds...")
            await asyncio.sleep(wait_time)
        else:
            print("Max retries reached. Assigning default score of 0.0")
    print("Failed to get importance score from Gemini API after all retries")
    return 0.0
async def detect_content_similarity(text1: str, text2: str) -> float:
    """
    Detects similarity between two news articles using keyword overlap and
    proper-noun overlap.

    Args:
        text1 (str): First article content
        text2 (str): Second article content

    Returns:
        float: Similarity score between 0.0 and 1.0, where 1.0 means identical content
    """
    # Terms too generic to distinguish one football story from another.
    stop_words = {
        'the', 'and', 'for', 'are', 'with', 'his', 'her', 'has', 'had', 'was', 'were',
        'will', 'would', 'could', 'should', 'this', 'that', 'they', 'them', 'their',
        'from', 'into', 'over', 'under', 'after', 'before', 'during', 'while', 'when',
        'where', 'what', 'who', 'how', 'why', 'can', 'may', 'might', 'must', 'shall',
        'football', 'soccer', 'sport', 'sports', 'game', 'match', 'team', 'club',
        'player', 'players', 'goal', 'goals', 'news', 'article', 'report', 'said'
    }

    def keyword_set(raw: str) -> set:
        """Lowercased words of 4+ letters, minus the generic vocabulary above."""
        tokens = re.findall(r'\b[a-zA-Z]{3,}\b', raw.lower())
        return {tok for tok in tokens if tok not in stop_words and len(tok) > 3}

    first_keywords = keyword_set(text1)
    second_keywords = keyword_set(text2)
    if not first_keywords or not second_keywords:
        return 0.0

    # Jaccard similarity: intersection over union of the keyword sets.
    combined = first_keywords | second_keywords
    score = len(first_keywords & second_keywords) / len(combined) if combined else 0.0

    # Proper nouns (player/club names, places) are stronger evidence that two
    # pieces cover the same story, so blend them in with a heavier weight.
    name_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
    first_names = set(re.findall(name_pattern, text1))
    second_names = set(re.findall(name_pattern, text2))
    if first_names and second_names:
        name_union = first_names | second_names
        name_score = len(first_names & second_names) / len(name_union) if name_union else 0.0
        score = (score * 0.6) + (name_score * 0.4)

    return min(score, 1.0)
# --- Example Usage (for testing this module) ---
if __name__ == "__main__":
    # Manual smoke test: exercised only when this file is run directly.
    import asyncio  # Required to run async functions

    # None of the Gemini-backed helpers can succeed without a real API key.
    if GOOGLE_API_KEY == "YOUR_GOOGLE_API_KEY":
        print("WARNING: Please replace 'YOUR_GOOGLE_API_KEY' in ai_processor.py with your actual API key to run tests.")
    else:
        async def test_ai_processor():
            headline = "Manchester United secures thrilling 3-2 victory over Liverpool in a dramatic late comeback at Old Trafford, with Bruno Fernandes scoring the winning goal in the 90th minute."

            print("\n--- Testing analyze_and_summarize ---")
            analysis = await analyze_and_summarize(headline)
            print("Analysis Result:", analysis)

            print("\n--- Testing score_news_importance ---")
            importance = await score_news_importance(headline, 'en')
            print(f"Importance Score: {importance:.2f}/1.0")

            print("\n--- Testing detect_content_similarity ---")
            # Two reports of the same story should score high...
            story_a = "Diogo Jota and his brother Andre Silva died in a car crash in Spain."
            story_b = "Liverpool player Diogo Jota was killed in a car accident along with his brother Andre Silva."
            print(f"Similarity between similar articles: {await detect_content_similarity(story_a, story_b):.2f}/1.0")
            # ...while unrelated stories should score low.
            story_c = "Manchester United signs new striker for €50 million transfer fee."
            print(f"Similarity between different articles: {await detect_content_similarity(story_a, story_c):.2f}/1.0")

            # Translation/polish/title tests need a summary to work with.
            if not (analysis and analysis.get('summary')):
                print("Analysis failed, skipping translation, polishing, and post title generation tests.")
                return

            print("\n--- Testing translate_to_farsi ---")
            farsi = await translate_to_farsi(analysis['summary'])
            print("Farsi Translation:", farsi)
            if farsi:
                print("\n--- Testing polish_farsi_text ---")
                polished = await polish_farsi_text(farsi)
                print("Polished Farsi Text:", polished)
                if polished:
                    print("\n--- Testing generate_post_title ---")
                    title = await generate_post_title(analysis['summary'], headline, analysis['entities'])
                    print("Generated Post Title:", title)

        # Run the async test function
        asyncio.run(test_ai_processor())
        print("\n--- End of AI Processor Test ---")