Research-Agent/web_api.py at main · FRANK-RAN/Research-Agent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import json
import logging
import os
import re
import sys
from typing import Optional, List, Dict, Any

import markdown
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from src.research_core import ResearchCore, save_results_to_json
from src.models import CustomOpenAI

# Load environment variables from .env file.
# This must run before OPENAI_API_KEY is read from os.environ below.
load_dotenv()

# Set up logging: INFO level on the root logger, plus a module-level logger
# used by the request handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Append the parent of this file's directory to the Python path.
# NOTE(review): the `src.research_core` / `src.models` imports at the top of
# this file execute before this line, so this append cannot influence them.
# Confirm whether it is still needed or should move above those imports.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Get OpenAI API key from environment variable.
# Fail fast at import time when the key is missing or empty, rather than
# failing on the first request.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable is not set")


# FastAPI application instance; the endpoint below is registered on it.
app = FastAPI()

# Pydantic model for the request body
class ResearchRequest(BaseModel):
    """Request body accepted by the ``/run_research`` endpoint.

    Only ``research_question`` is required; every other field has a default.
    """

    # Question forwarded to the literature review as ``research_question``.
    research_question: str
    # Forwarded as ``use_zotero`` / ``use_arxiv`` to the literature review;
    # presumably they toggle the corresponding search engine (confirm in
    # ResearchCore).
    use_zotero: bool = True
    use_arxiv: bool = True
    # Forwarded as ``use_full_text`` / ``max_papers_to_download``.
    use_full_text: bool = False
    max_papers_to_download: int = 10
    # Model name handed to both the OpenAI client wrapper and ResearchCore.
    llm_model: str = "o4-mini"
    # Passed through unchanged to the ResearchCore constructor; None when the
    # caller supplies nothing (no defaults are filled in by this module).
    zotero_config: Optional[Dict[str, Any]] = None
    arxiv_config: Optional[Dict[str, Any]] = None
    # Forwarded as ``zotero_collection_names`` to the literature review.
    zotero_collection_names: Optional[List[str]] = None

# Pydantic model for the response
class ResearchResponse(BaseModel):
    """Response body returned by the ``/run_research`` endpoint."""

    # Taken from the ``literature_review`` entry of the review results.
    literature_review: str
    # Taken from the ``file_path`` entry of the review results.
    file_path: str
    # One dict per paper with the keys ``title``, ``authors``, ``year`` and
    # ``has_full_text``; arXiv entries additionally carry ``link``.
    # Each list is None (not an empty list) when no paper of that kind exists.
    zotero_papers: Optional[List[Dict[str, Any]]] = None
    arxiv_papers: Optional[List[Dict[str, Any]]] = None

# Standalone four-digit year (1000-2099) anywhere in a free-form date string.
_YEAR_RE = re.compile(r'(?<!\d)(?:1\d{3}|20\d{2})(?!\d)')
# New-style arXiv identifier ("2101.00001"): two-digit year, month, sequence.
_ARXIV_NEW_ID_RE = re.compile(r'(?<!\d)(\d{2})(?:0[1-9]|1[0-2])\.\d{4,5}(?!\d)')
# Old-style arXiv identifier ("hep-th/9901001"): two-digit year, month, number.
_ARXIV_OLD_ID_RE = re.compile(r'(?<!\d)(\d{2})(?:0[1-9]|1[0-2])\d{3}(?!\d)')
# Trailing version suffix, e.g. the "v2" in "2101.00001v2".
_ARXIV_VERSION_RE = re.compile(r'v\d+$')
# Sort position for papers whose title does not appear in the review text.
_UNLISTED_ORDER = 9999


def _doc_metadata(doc):
    """Return the metadata mapping of *doc* (dict or object), never None."""
    if isinstance(doc, dict):
        metadata = doc.get('metadata')
    else:
        metadata = getattr(doc, 'metadata', None)
    return metadata or {}


def _review_line_order(lit_review):
    """Map lower-cased titles from ``[n] Title`` lines to their line index."""
    order = {}
    for index, line in enumerate((lit_review or '').split('\n')):
        stripped = line.strip()
        if stripped.startswith('[') and ']' in stripped:
            title = stripped.split(']', 1)[1].strip()
            order[title.lower()] = index
    return order


def _year_from_date(value):
    """Return a four-digit year string taken from *value*, or None.

    Accepts date/datetime-like objects (anything with an integer ``year``)
    and strings. A leading ``YYYY`` is preferred; otherwise the first
    standalone four-digit year in the text is used, so free-form dates
    such as ``"March 2020"`` or ``"3/1/2020"`` work too.
    """
    year_attr = getattr(value, 'year', None)
    if isinstance(year_attr, int):
        return str(year_attr)
    if not isinstance(value, str):
        return None
    text = value.strip()
    prefix = text[:4]
    if len(prefix) == 4 and prefix.isdigit():
        return prefix
    match = _YEAR_RE.search(text)
    return match.group(0) if match else None


def _year_from_arxiv_id(arxiv_id):
    """Return the submission year encoded in an arXiv identifier, or None.

    Both identifier schemes start their numeric part with ``YYMM``. The
    old scheme began in 1991, so two-digit years from 91 upwards belong to
    the 1900s and everything else to the 2000s.
    """
    if not isinstance(arxiv_id, str):
        return None
    match = _ARXIV_NEW_ID_RE.search(arxiv_id) or _ARXIV_OLD_ID_RE.search(arxiv_id)
    if match is None:
        return None
    two_digit_year = match.group(1)
    century = '19' if int(two_digit_year) >= 91 else '20'
    return century + two_digit_year


def _extract_year(metadata, source):
    """Return the publication year for a paper, or ``'Unknown'``."""
    if source == 'arxiv':
        year = (_year_from_arxiv_id(metadata.get('arxiv_id'))
                or _year_from_date(metadata.get('published')))
    else:
        year = _year_from_date(metadata.get('date'))
        if year is None and metadata.get('year') is not None:
            year = str(metadata['year'])
    return year or 'Unknown'


def _arxiv_link(metadata):
    """Build the arXiv abstract URL for a paper, or ``'#'`` when no ID exists."""
    arxiv_id = metadata.get('arxiv_id')
    if not (isinstance(arxiv_id, str) and arxiv_id.strip()):
        arxiv_id = ''
        entry_id = metadata.get('id')
        if isinstance(entry_id, str):
            entry_id = entry_id.strip().rstrip('/')
            # Keep everything after "/abs/" so old-style identifiers retain
            # their archive prefix (".../abs/hep-th/9901001v1").
            _, marker, tail = entry_id.partition('/abs/')
            arxiv_id = tail if marker else entry_id.split('/')[-1]
    # Strip only a trailing version suffix; splitting on every "v" would
    # truncate identifiers such as "solv-int/9901001".
    arxiv_id = _ARXIV_VERSION_RE.sub('', arxiv_id.strip())
    return f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else '#'


def _collect_papers(results):
    """Split the reviewed documents into Zotero and arXiv paper summaries.

    Returns a ``(zotero_papers, arxiv_papers)`` tuple of lists. Each list is
    ordered by where the paper's title appears in the literature review text,
    with titles that are not found placed last in their original order.
    """
    order_by_title = _review_line_order(results.get('literature_review'))
    ranked = {'zotero': [], 'arxiv': []}

    for doc in results.get('documents') or []:
        metadata = _doc_metadata(doc)

        title = str(metadata.get('title') or 'Untitled')
        if title.endswith('.'):
            title = title[:-1]

        source = str(metadata.get('source') or '').lower()
        if source not in ranked:
            # Source not specified: treat anything carrying an arXiv-style
            # identifier key as an arXiv paper, everything else as Zotero.
            has_id_key = 'arxiv_id' in metadata or 'id' in metadata
            source = 'arxiv' if has_id_key else 'zotero'

        paper_info = {
            'title': title,
            'authors': metadata.get('authors', 'Unknown'),
            'year': _extract_year(metadata, source),
            'has_full_text': metadata.get('has_full_text', False),
        }
        if source == 'arxiv':
            paper_info['link'] = _arxiv_link(metadata)

        rank = order_by_title.get(title.lower(), _UNLISTED_ORDER)
        ranked[source].append((rank, paper_info))

    def in_review_order(entries):
        # sorted() is stable, so papers with equal rank keep document order.
        return [info for _, info in sorted(entries, key=lambda entry: entry[0])]

    return in_review_order(ranked['zotero']), in_review_order(ranked['arxiv'])


@app.post("/run_research", response_model=ResearchResponse)
async def run_research(request: ResearchRequest):
    """Run a literature review for the request and summarise the papers used.

    Args:
        request: Research question plus engine, model and download options.

    Returns:
        ResearchResponse with the review text, the results file path, and
        the Zotero / arXiv paper lists (None when a list would be empty).

    Raises:
        HTTPException: status 500 carrying the error text when any step fails.
    """
    try:
        logger.info("Received research request: %s", request.research_question)
        openai_client = CustomOpenAI(api_key=OPENAI_API_KEY, model=request.llm_model)

        logger.info("Initializing ResearchCore...")
        # The request's configs are passed through as given (possibly None).
        research_core = ResearchCore(
            llm_model=request.llm_model,
            output_dir='./research_output',
            zotero_config=request.zotero_config,
            arxiv_config=request.arxiv_config,
        )

        # Set the OpenAI client for the research core
        research_core.llm = openai_client

        logger.info("Running literature review...")
        # NOTE(review): this call is synchronous inside an async handler, so
        # the event loop is blocked until the review finishes. Consider a
        # plain `def` endpoint or a threadpool if concurrent requests matter.
        results = research_core.run_literature_review(
            research_question=request.research_question,
            use_zotero=request.use_zotero,
            use_arxiv=request.use_arxiv,
            zotero_collection_names=request.zotero_collection_names,
            use_full_text=request.use_full_text,
            max_papers_to_download=request.max_papers_to_download,
        )

        logger.info("Saving results to JSON...")
        save_results_to_json(results)

        zotero_papers, arxiv_papers = _collect_papers(results)
        logger.info(
            "Extracted %d Zotero papers and %d ArXiv papers",
            len(zotero_papers),
            len(arxiv_papers),
        )

        logger.info("Research completed successfully")
        # NOTE(review): assumes the results mapping carries 'file_path'; the
        # return value of save_results_to_json is not used. Confirm.
        return ResearchResponse(
            literature_review=results['literature_review'],
            file_path=results['file_path'],
            zotero_papers=zotero_papers or None,
            arxiv_papers=arxiv_papers or None,
        )
    except Exception as e:
        logger.exception("Error processing research request: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

# Script entry point: serve the app with uvicorn when this file is run directly.
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when running as a script.
    import uvicorn
    # Host "0.0.0.0" listens on all network interfaces, on port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)