HARVEST/migrate_db_v2.py at main · IntegrativeBioinformaticsLab/HARVEST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Database migration script v2 to optimize schema:
1. Remove article_title, article_authors, article_year from doi_metadata (can be fetched on-demand)
2. Remove contributor_email from sentences table (tracked at triple level)
3. Add projects table for project-based annotation
4. Add admin password hash table
"""

import sqlite3
import os
import sys
from datetime import datetime

# Import generate_doi_hash from harvest_store
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from harvest_store import generate_doi_hash

# Import configuration
try:
    from config import DB_PATH
except ImportError:
    # Fallback to environment variable if config.py doesn't exist
    DB_PATH = os.environ.get("HARVEST_DB", "harvest.db")

def migrate_database_v2():
    print(f"Migrating database v2: {DB_PATH}")

    if not os.path.exists(DB_PATH):
        print("Database does not exist yet. No migration needed.")
        return

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    try:
        # 1. Migrate doi_metadata table - remove article_title, article_authors, article_year
        print("\n1. Checking doi_metadata table...")
        cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='doi_metadata';")
        doi_metadata_exists = cur.fetchone() is not None

        if doi_metadata_exists:
            cur.execute("PRAGMA table_info(doi_metadata);")
            doi_metadata_columns = [row[1] for row in cur.fetchall()]

            if 'article_title' in doi_metadata_columns or 'article_authors' in doi_metadata_columns:
                print("   Removing article metadata columns from doi_metadata...")

                # Create new table without article metadata columns
                cur.execute("""
                    CREATE TABLE doi_metadata_new (
                        doi_hash TEXT PRIMARY KEY,
                        doi TEXT NOT NULL,
                        created_at TEXT
                    );
                """)

                # Copy data (only doi_hash, doi, created_at)
                cur.execute("""
                    INSERT INTO doi_metadata_new (doi_hash, doi, created_at)
                    SELECT doi_hash, doi, created_at FROM doi_metadata;
                """)

                # Drop old table and rename
                cur.execute("DROP TABLE doi_metadata;")
                cur.execute("ALTER TABLE doi_metadata_new RENAME TO doi_metadata;")
                print("   ✓ Removed article metadata columns from doi_metadata")
            else:
                print("   ✓ doi_metadata already optimized")
        else:
            print("   ✓ doi_metadata table doesn't exist yet (will be created)")

        # 2. Migrate sentences table - remove contributor_email and handle old schema
        print("\n2. Checking sentences table...")
        cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sentences';")
        sentences_exists = cur.fetchone() is not None

        if not sentences_exists:
            print("   ✓ sentences table doesn't exist yet (will be created)")
        else:
            cur.execute("PRAGMA table_info(sentences);")
            sentences_columns = [row[1] for row in cur.fetchall()]

            has_doi_hash = 'doi_hash' in sentences_columns
            has_contributor_email = 'contributor_email' in sentences_columns
            has_old_doi = 'doi' in sentences_columns

            if not has_doi_hash or has_contributor_email or has_old_doi:
                print("   Migrating sentences table...")

                # Create new table with optimized schema
                cur.execute("""
                    CREATE TABLE sentences_new (
                        id INTEGER PRIMARY KEY,
                        text TEXT NOT NULL,
                        literature_link TEXT,
                        doi_hash TEXT,
                        created_at TEXT
                    );
                """)

                # Migrate data
                if has_old_doi and not has_doi_hash:
                    # Old schema with doi, article_title, etc.
                    print("   Converting old DOI format to doi_hash...")

                    # Create doi_metadata table if it doesn't exist
                    if not doi_metadata_exists:
                        cur.execute("""
                            CREATE TABLE IF NOT EXISTS doi_metadata (
                                doi_hash TEXT PRIMARY KEY,
                                doi TEXT NOT NULL,
                                created_at TEXT
                            );
                        """)

                    cur.execute("SELECT id, text, literature_link, doi, created_at FROM sentences;")
                    rows = cur.fetchall()

                    for row in rows:
                        sid, text, lit_link, doi, created = row
                        doi_hash = None

                        if doi:
                            doi_hash = generate_doi_hash(doi)
                            cur.execute("""
                                INSERT OR IGNORE INTO doi_metadata(doi_hash, doi, created_at)
                                VALUES (?, ?, ?);
                            """, (doi_hash, doi, created))

                        cur.execute("""
                            INSERT INTO sentences_new(id, text, literature_link, doi_hash, created_at)
                            VALUES (?, ?, ?, ?, ?);
                        """, (sid, text, lit_link, doi_hash, created))

                    print(f"   Migrated {len(rows)} sentences from old schema")
                elif has_doi_hash and has_contributor_email:
                    # New schema but with contributor_email
                    cur.execute("""
                        INSERT INTO sentences_new (id, text, literature_link, doi_hash, created_at)
                        SELECT id, text, literature_link, doi_hash, created_at FROM sentences;
                    """)
                    print("   Removed contributor_email column")
                else:
                    # Already correct schema?
                    cur.execute("""
                        INSERT INTO sentences_new (id, text, literature_link, doi_hash, created_at)
                        SELECT id, text, literature_link, doi_hash, created_at FROM sentences;
                    """)

                # Drop old table and rename
                cur.execute("DROP TABLE sentences;")
                cur.execute("ALTER TABLE sentences_new RENAME TO sentences;")
                print("   ✓ Sentences table migrated")
            else:
                print("   ✓ sentences already optimized")

        # 3. Ensure triples table has contributor_email and project_id
        print("\n3. Checking triples table...")
        cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='triples';")
        triples_exists = cur.fetchone() is not None

        if not triples_exists:
            print("   ✓ triples table doesn't exist yet (will be created)")
        else:
            cur.execute("PRAGMA table_info(triples);")
            triple_columns = [row[1] for row in cur.fetchall()]

            if 'contributor_email' not in triple_columns:
                print("   Adding contributor_email to triples...")
                cur.execute("ALTER TABLE triples ADD COLUMN contributor_email TEXT DEFAULT '';")
                print("   ✓ Added contributor_email column")
            else:
                print("   ✓ triples already has contributor_email")

            if 'project_id' not in triple_columns:
                print("   Adding project_id to triples...")
                cur.execute("ALTER TABLE triples ADD COLUMN project_id INTEGER;")
                print("   ✓ Added project_id column")
            else:
                print("   ✓ triples already has project_id")

        # 4. Create projects table
        print("\n4. Creating projects table...")
        cur.execute("""
            CREATE TABLE IF NOT EXISTS projects (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL UNIQUE,
                description TEXT,
                doi_list TEXT NOT NULL,
                created_by TEXT NOT NULL,
                created_at TEXT NOT NULL
            );
        """)
        print("   ✓ Created projects table")

        # 5. Create admin_users table
        print("\n5. Creating admin_users table...")
        cur.execute("""
            CREATE TABLE IF NOT EXISTS admin_users (
                email TEXT PRIMARY KEY,
                password_hash TEXT NOT NULL,
                created_at TEXT NOT NULL
            );
        """)
        print("   ✓ Created admin_users table")

        conn.commit()
        print("\n✅ Migration v2 completed successfully!")

    except Exception as e:
        conn.rollback()
        print(f"\n❌ Migration v2 failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    finally:
        conn.close()

if __name__ == "__main__":
    migrate_database_v2()