HARVEST/pdf_download_db.py at main · IntegrativeBioinformaticsLab/HARVEST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Download Tracking Database
Separate SQLite database for tracking PDF download attempts, source performance, and analytics
"""

import sqlite3
import os
from typing import List, Dict, Optional, Tuple
from datetime import datetime, timedelta
import json

PDF_DB_PATH = "pdf_downloads.db"

# Connection pool to reduce database locking
_db_connection_pool = {}
_db_pool_lock = None


def get_pdf_db_connection(db_path: str = PDF_DB_PATH) -> sqlite3.Connection:
    """
    Get a connection to the PDF download tracking database with optimizations.
    Uses Write-Ahead Logging (WAL) mode to reduce locking issues.
    """
    if not os.path.exists(db_path):
        init_pdf_download_db(db_path)

    conn = sqlite3.connect(db_path, timeout=30.0, check_same_thread=False)

    # Enable WAL mode for better concurrent access
    conn.execute("PRAGMA journal_mode=WAL")
    # Increase cache size for better performance
    conn.execute("PRAGMA cache_size=10000")
    # Use normal synchronous mode (faster, still safe with WAL)
    conn.execute("PRAGMA synchronous=NORMAL")

    return conn


def init_pdf_download_db(db_path: str = PDF_DB_PATH) -> bool:
    """
    Initialize the PDF download tracking database with all required tables.
    Returns True on success, False on failure.
    """
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Table 1: Sources - Configuration for each PDF download source
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS sources (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT UNIQUE NOT NULL,
                enabled INTEGER DEFAULT 1,
                base_url TEXT,
                requires_auth INTEGER DEFAULT 0,
                timeout INTEGER DEFAULT 30,
                priority INTEGER DEFAULT 100,
                description TEXT,
                requires_library TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Table 2: Download Attempts - Log every download attempt
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS download_attempts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                project_id INTEGER NOT NULL,
                doi TEXT NOT NULL,
                source_name TEXT NOT NULL,
                success INTEGER NOT NULL,
                failure_reason TEXT,
                failure_category TEXT,
                response_time_ms INTEGER,
                file_size_bytes INTEGER,
                pdf_url TEXT,
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (source_name) REFERENCES sources(name)
            )
        """)

        # Table 3: Source Performance - Aggregated metrics per source
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS source_performance (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_name TEXT UNIQUE NOT NULL,
                total_attempts INTEGER DEFAULT 0,
                success_count INTEGER DEFAULT 0,
                failure_count INTEGER DEFAULT 0,
                avg_response_time_ms REAL DEFAULT 0.0,
                last_success_at TIMESTAMP,
                last_failure_at TIMESTAMP,
                success_rate REAL DEFAULT 0.0,
                last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (source_name) REFERENCES sources(name)
            )
        """)

        # Table 4: Publisher Patterns - Learned URL patterns by publisher
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS publisher_patterns (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                doi_prefix TEXT UNIQUE NOT NULL,
                publisher_name TEXT,
                successful_source TEXT,
                url_pattern TEXT,
                success_count INTEGER DEFAULT 1,
                last_success_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (successful_source) REFERENCES sources(name)
            )
        """)

        # Table 5: Retry Queue - DOIs that need retry with scheduling
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS retry_queue (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                project_id INTEGER NOT NULL,
                doi TEXT NOT NULL,
                failure_category TEXT NOT NULL,
                retry_count INTEGER DEFAULT 0,
                next_retry_at TIMESTAMP,
                last_attempted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(project_id, doi)
            )
        """)

        # Table 6: Configuration - Store runtime configuration and settings
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS configuration (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                description TEXT,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Create indexes for performance
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_attempts_doi ON download_attempts(doi)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_attempts_project ON download_attempts(project_id)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_attempts_source ON download_attempts(source_name)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_attempts_timestamp ON download_attempts(timestamp)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_publisher_prefix ON publisher_patterns(doi_prefix)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_retry_next ON retry_queue(next_retry_at)")

        # Insert default sources
        default_sources = [
            ("unpaywall", 1, "https://api.unpaywall.org/v2/", 0, 10, 10, "Unpaywall REST API - free open access database", None),
            ("unpywall", 1, None, 0, 10, 20, "Unpywall library fallback for Unpaywall API", "unpywall"),
            ("biorxiv_medrxiv", 1, "https://api.biorxiv.org/", 0, 15, 25, "bioRxiv/medRxiv - preprint repositories for life sciences", None),
            ("europe_pmc", 1, "https://www.ebi.ac.uk/europepmc/webservices/rest/", 0, 15, 30, "Europe PMC REST API - biomedical literature", None),
            ("pmc_enhanced", 1, "https://www.ncbi.nlm.nih.gov/pmc/", 0, 15, 35, "Enhanced PubMed Central via E-utilities", None),
            ("arxiv_enhanced", 1, "https://arxiv.org/", 0, 15, 38, "Enhanced arXiv integration with better DOI handling", None),
            ("core", 1, "https://api.core.ac.uk/v3/", 0, 15, 40, "CORE.ac.uk REST API - open access research papers", None),
            ("zenodo", 1, "https://zenodo.org/api/", 0, 15, 45, "Zenodo - CERN's multidisciplinary open repository", None),
            ("semantic_scholar", 1, "https://api.semanticscholar.org/", 0, 15, 50, "Semantic Scholar API - academic paper metadata and PDFs", None),
            ("doaj", 1, "https://doaj.org/api/", 0, 15, 55, "DOAJ - Directory of Open Access Journals", None),
            ("publisher_direct", 1, None, 0, 15, 65, "Direct publisher URLs for known open access patterns", None),
            ("scihub", 0, None, 0, 20, 90, "SciHub mirror (optional, disabled by default)", None),
            ("metapub", 0, None, 0, 15, 60, "Metapub - PubMed Central and arXiv (legacy)", "metapub"),
            ("habanero", 0, None, 0, 15, 70, "Habanero/Crossref - institutional access", "habanero"),
            ("habanero_proxy", 0, None, 1, 20, 80, "Habanero with institutional proxy", "habanero"),
        ]

        cursor.executemany("""
            INSERT OR IGNORE INTO sources (name, enabled, base_url, requires_auth, timeout, priority, description, requires_library)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, default_sources)

        # Initialize performance records for all sources
        cursor.execute("""
            INSERT OR IGNORE INTO source_performance (source_name)
            SELECT name FROM sources
        """)

        # Insert default configuration
        default_config = [
            ("retry_delay_minutes", "60", "Base delay in minutes before retrying failed downloads"),
            ("max_retry_attempts", "3", "Maximum number of retry attempts for temporary failures"),
            ("cleanup_retention_days", "90", "Number of days to keep download attempt history"),
            ("rate_limit_delay_seconds", "1", "Delay between requests to respect API rate limits"),
            ("user_agent_rotation", "1", "Enable rotating User-Agent headers"),
        ]

        cursor.executemany("""
            INSERT OR IGNORE INTO configuration (key, value, description)
            VALUES (?, ?, ?)
        """, default_config)

        conn.commit()
        conn.close()

        print(f"[PDF DB] Initialized PDF download tracking database: {db_path}")
        return True

    except Exception as e:
        print(f"[PDF DB] Error initializing database: {e}")
        return False


def log_download_attempt(
    project_id: int,
    doi: str,
    source_name: str,
    success: bool,
    failure_reason: Optional[str] = None,
    failure_category: Optional[str] = None,
    response_time_ms: Optional[int] = None,
    file_size_bytes: Optional[int] = None,
    pdf_url: Optional[str] = None,
    db_path: str = PDF_DB_PATH
) -> int:
    """
    Log a download attempt to the database.
    Returns the attempt ID, or -1 on error.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            INSERT INTO download_attempts
            (project_id, doi, source_name, success, failure_reason, failure_category,
             response_time_ms, file_size_bytes, pdf_url)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (project_id, doi, source_name, success, failure_reason, failure_category,
              response_time_ms, file_size_bytes, pdf_url))

        attempt_id = cursor.lastrowid
        conn.commit()
        conn.close()

        # Update aggregated performance metrics
        update_source_performance(source_name, success, response_time_ms, db_path)

        return attempt_id

    except Exception as e:
        print(f"[PDF DB] Error logging download attempt: {e}")
        return -1


def update_source_performance(
    source_name: str,
    success: bool,
    response_time_ms: Optional[int] = None,
    db_path: str = PDF_DB_PATH
) -> bool:
    """
    Update aggregated performance metrics for a source.
    Returns True on success, False on failure.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        # Get current metrics
        cursor.execute("""
            SELECT total_attempts, success_count, failure_count, avg_response_time_ms
            FROM source_performance
            WHERE source_name = ?
        """, (source_name,))

        row = cursor.fetchone()
        if not row:
            # Initialize if not exists
            cursor.execute("""
                INSERT INTO source_performance (source_name, total_attempts, success_count, failure_count)
                VALUES (?, 0, 0, 0)
            """, (source_name,))
            row = (0, 0, 0, 0.0)

        total, success_count, failure_count, avg_time = row

        # Update counts
        total += 1
        if success:
            success_count += 1
        else:
            failure_count += 1

        # Update average response time
        if response_time_ms is not None:
            if avg_time == 0:
                new_avg_time = float(response_time_ms)
            else:
                new_avg_time = ((avg_time * (total - 1)) + response_time_ms) / total
        else:
            new_avg_time = avg_time

        # Calculate success rate
        success_rate = (success_count / total * 100.0) if total > 0 else 0.0

        # Update timestamp based on success/failure
        timestamp_field = "last_success_at" if success else "last_failure_at"

        cursor.execute(f"""
            UPDATE source_performance
            SET total_attempts = ?,
                success_count = ?,
                failure_count = ?,
                avg_response_time_ms = ?,
                success_rate = ?,
                {timestamp_field} = CURRENT_TIMESTAMP,
                last_updated = CURRENT_TIMESTAMP
            WHERE source_name = ?
        """, (total, success_count, failure_count, new_avg_time, success_rate, source_name))

        conn.commit()
        conn.close()
        return True

    except Exception as e:
        print(f"[PDF DB] Error updating source performance: {e}")
        return False


def get_source_rankings(db_path: str = PDF_DB_PATH) -> List[Dict]:
    """
    Get sources ranked by performance (success rate and speed).
    Returns list of source dicts with performance metrics.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            SELECT s.name, s.enabled, s.priority, s.requires_library,
                   sp.success_rate, sp.avg_response_time_ms, sp.total_attempts,
                   sp.success_count, sp.failure_count, s.description
            FROM sources s
            LEFT JOIN source_performance sp ON s.name = sp.source_name
            WHERE s.enabled = 1
            ORDER BY sp.success_rate DESC, sp.avg_response_time_ms ASC, s.priority ASC
        """)

        sources = []
        for row in cursor.fetchall():
            sources.append({
                "name": row[0],
                "enabled": row[1],
                "priority": row[2],
                "requires_library": row[3],
                "success_rate": row[4] or 0.0,
                "avg_response_time_ms": row[5] or 0.0,
                "total_attempts": row[6] or 0,
                "success_count": row[7] or 0,
                "failure_count": row[8] or 0,
                "description": row[9] or ""
            })

        conn.close()
        return sources

    except Exception as e:
        print(f"[PDF DB] Error getting source rankings: {e}")
        return []


def get_best_source_for_publisher(doi_prefix: str, db_path: str = PDF_DB_PATH) -> Optional[str]:
    """
    Get the best source for a given DOI prefix (publisher) based on historical success.
    Returns source name or None.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            SELECT successful_source
            FROM publisher_patterns
            WHERE doi_prefix = ?
            ORDER BY success_count DESC, last_success_at DESC
            LIMIT 1
        """, (doi_prefix,))

        row = cursor.fetchone()
        conn.close()

        return row[0] if row else None

    except Exception as e:
        print(f"[PDF DB] Error getting best source for publisher: {e}")
        return None


def record_publisher_success(
    doi_prefix: str,
    publisher_name: str,
    source_name: str,
    url_pattern: Optional[str] = None,
    db_path: str = PDF_DB_PATH
) -> bool:
    """
    Record a successful download for a publisher to learn patterns.
    Returns True on success, False on failure.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        # Check if pattern exists
        cursor.execute("""
            SELECT id, success_count FROM publisher_patterns
            WHERE doi_prefix = ? AND successful_source = ?
        """, (doi_prefix, source_name))

        row = cursor.fetchone()

        if row:
            # Update existing pattern
            cursor.execute("""
                UPDATE publisher_patterns
                SET success_count = success_count + 1,
                    last_success_at = CURRENT_TIMESTAMP,
                    url_pattern = COALESCE(?, url_pattern)
                WHERE id = ?
            """, (url_pattern, row[0]))
        else:
            # Insert new pattern
            cursor.execute("""
                INSERT INTO publisher_patterns (doi_prefix, publisher_name, successful_source, url_pattern)
                VALUES (?, ?, ?, ?)
            """, (doi_prefix, publisher_name, source_name, url_pattern))

        conn.commit()
        conn.close()
        return True

    except Exception as e:
        print(f"[PDF DB] Error recording publisher success: {e}")
        return False


def add_to_retry_queue(
    project_id: int,
    doi: str,
    failure_category: str,
    retry_delay_minutes: int = 60,
    db_path: str = PDF_DB_PATH
) -> bool:
    """
    Add a failed DOI to the retry queue with scheduled retry time.
    Returns True on success, False on failure.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        # Calculate next retry time with exponential backoff
        cursor.execute("""
            SELECT retry_count FROM retry_queue
            WHERE project_id = ? AND doi = ?
        """, (project_id, doi))

        row = cursor.fetchone()
        retry_count = row[0] + 1 if row else 0

        # Exponential backoff: base_delay * 2^retry_count
        delay_minutes = retry_delay_minutes * (2 ** retry_count)
        next_retry = datetime.now() + timedelta(minutes=delay_minutes)

        cursor.execute("""
            INSERT OR REPLACE INTO retry_queue
            (project_id, doi, failure_category, retry_count, next_retry_at, last_attempted_at)
            VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        """, (project_id, doi, failure_category, retry_count, next_retry))

        conn.commit()
        conn.close()
        return True

    except Exception as e:
        print(f"[PDF DB] Error adding to retry queue: {e}")
        return False


def get_retry_queue_ready(db_path: str = PDF_DB_PATH) -> List[Dict]:
    """
    Get DOIs from retry queue that are ready to retry (next_retry_at <= now).
    Returns list of retry dicts.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            SELECT id, project_id, doi, failure_category, retry_count
            FROM retry_queue
            WHERE next_retry_at <= CURRENT_TIMESTAMP
            ORDER BY next_retry_at ASC
        """)

        retries = []
        for row in cursor.fetchall():
            retries.append({
                "id": row[0],
                "project_id": row[1],
                "doi": row[2],
                "failure_category": row[3],
                "retry_count": row[4]
            })

        conn.close()
        return retries

    except Exception as e:
        print(f"[PDF DB] Error getting retry queue: {e}")
        return []


def remove_from_retry_queue(project_id: int, doi: str, db_path: str = PDF_DB_PATH) -> bool:
    """
    Remove a DOI from retry queue (after successful download or max retries reached).
    Returns True on success, False on failure.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            DELETE FROM retry_queue
            WHERE project_id = ? AND doi = ?
        """, (project_id, doi))

        conn.commit()
        conn.close()
        return True

    except Exception as e:
        print(f"[PDF DB] Error removing from retry queue: {e}")
        return False


def get_download_statistics(
    project_id: Optional[int] = None,
    days: int = 30,
    db_path: str = PDF_DB_PATH
) -> Dict:
    """
    Get download statistics for a project or overall.
    Returns dict with various metrics.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        # Base query
        where_clause = "WHERE timestamp >= datetime('now', ?)"
        params = [f"-{days} days"]

        if project_id is not None:
            where_clause += " AND project_id = ?"
            params.append(project_id)

        # Overall stats
        cursor.execute(f"""
            SELECT
                COUNT(*) as total_attempts,
                SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
                SUM(CASE WHEN success = 0 THEN 1 ELSE 0 END) as failed,
                AVG(response_time_ms) as avg_response_time,
                COUNT(DISTINCT doi) as unique_dois
            FROM download_attempts
            {where_clause}
        """, params)

        stats_row = cursor.fetchone()

        # Stats by source
        cursor.execute(f"""
            SELECT
                source_name,
                COUNT(*) as attempts,
                SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
                AVG(response_time_ms) as avg_response_time
            FROM download_attempts
            {where_clause}
            GROUP BY source_name
            ORDER BY successful DESC
        """, params)

        source_stats = []
        for row in cursor.fetchall():
            source_stats.append({
                "source": row[0],
                "attempts": row[1],
                "successful": row[2],
                "success_rate": (row[2] / row[1] * 100) if row[1] > 0 else 0,
                "avg_response_time_ms": row[3] or 0
            })

        # Failure categories
        cursor.execute(f"""
            SELECT
                failure_category,
                COUNT(*) as count
            FROM download_attempts
            {where_clause} AND success = 0 AND failure_category IS NOT NULL
            GROUP BY failure_category
            ORDER BY count DESC
        """, params)

        failure_categories = []
        for row in cursor.fetchall():
            failure_categories.append({
                "category": row[0],
                "count": row[1]
            })

        conn.close()

        total = stats_row[0] or 0
        successful = stats_row[1] or 0

        return {
            "total_attempts": total,
            "successful": successful,
            "failed": stats_row[2] or 0,
            "success_rate": (successful / total * 100) if total > 0 else 0,
            "avg_response_time_ms": stats_row[3] or 0,
            "unique_dois": stats_row[4] or 0,
            "by_source": source_stats,
            "failure_categories": failure_categories,
            "period_days": days
        }

    except Exception as e:
        print(f"[PDF DB] Error getting download statistics: {e}")
        return {}


def cleanup_old_attempts(retention_days: int = 90, db_path: str = PDF_DB_PATH) -> int:
    """
    Clean up old download attempts to prevent database bloat.
    Returns number of records deleted.
    """
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            DELETE FROM download_attempts
            WHERE timestamp < datetime('now', ?)
        """, (f"-{retention_days} days",))

        deleted = cursor.rowcount
        conn.commit()
        conn.close()

        print(f"[PDF DB] Cleaned up {deleted} old download attempts (older than {retention_days} days)")
        return deleted

    except Exception as e:
        print(f"[PDF DB] Error cleaning up old attempts: {e}")
        return 0


def get_config_value(key: str, default: str = None, db_path: str = PDF_DB_PATH) -> Optional[str]:
    """Get a configuration value from the database"""
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("SELECT value FROM configuration WHERE key = ?", (key,))
        row = cursor.fetchone()
        conn.close()

        return row[0] if row else default

    except Exception as e:
        print(f"[PDF DB] Error getting config value: {e}")
        return default


def set_config_value(key: str, value: str, description: str = None, db_path: str = PDF_DB_PATH) -> bool:
    """Set a configuration value in the database"""
    try:
        conn = get_pdf_db_connection(db_path)
        cursor = conn.cursor()

        cursor.execute("""
            INSERT OR REPLACE INTO configuration (key, value, description, updated_at)
            VALUES (?, ?, COALESCE(?, (SELECT description FROM configuration WHERE key = ?)), CURRENT_TIMESTAMP)
        """, (key, value, description, key))

        conn.commit()
        conn.close()
        return True

    except Exception as e:
        print(f"[PDF DB] Error setting config value: {e}")
        return False


if __name__ == "__main__":
    # Test database initialization
    print("Testing PDF download tracking database...")

    if init_pdf_download_db():
        print("✓ Database initialized successfully")

        # Test logging an attempt
        attempt_id = log_download_attempt(
            project_id=1,
            doi="10.1371/journal.pone.0000001",
            source_name="unpaywall",
            success=True,
            response_time_ms=450,
            file_size_bytes=1024000
        )
        print(f"✓ Logged download attempt: {attempt_id}")

        # Test getting rankings
        rankings = get_source_rankings()
        print(f"✓ Retrieved {len(rankings)} source rankings")
        for source in rankings[:3]:
            print(f"  - {source['name']}: {source['success_rate']:.1f}% success")

        # Test statistics
        stats = get_download_statistics()
        print(f"✓ Statistics: {stats.get('total_attempts', 0)} attempts, {stats.get('success_rate', 0):.1f}% success")

        print("\nDatabase test completed successfully!")
    else:
        print("✗ Database initialization failed")