-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupdate-missing-fields.py
More file actions
102 lines (84 loc) · 3.13 KB
/
update-missing-fields.py
File metadata and controls
102 lines (84 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import traceback
import math
from sys import exit
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from pymongo import MongoClient
from pymongo.server_api import ServerApi
from math import isnan
import zipfile
from typing import Any
# Tab-separated summary table; the script below reads it and writes the
# updated rows back to this same path.
filename = 'summary_metadata.tsv'
# Connect to MongoDB
# Module-level client shared by the lookup helpers in this file and closed
# at the end of the script.
client = MongoClient("mongodb://vm013.bil.psc.edu:27017/")
def get_json_filename(bildid: str) -> str:
    """Return the relative path of the JSON file for a dataset.

    The path is the dataset identifier with a ``.json`` extension, placed
    under the ``JSON/`` directory.
    """
    return 'JSON/{}.json'.format(bildid)
def get_document(bildid: str) -> Any:
    """Fetch the dataset document for ``bildid`` from MongoDB.

    Looks in the ``datasets`` collection of the ``brainimagelibrary``
    database using the module-level ``client``.

    Args:
        bildid: Value matched against the ``bildid`` field.

    Returns:
        The first matching document, or None when nothing matches or the
        query fails.
    """
    try:
        collection = client['brainimagelibrary']['datasets']
        return collection.find_one({'bildid': bildid})
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, but
        # let KeyboardInterrupt / SystemExit propagate (a bare ``except``
        # would swallow those too).
        print(f"An error occurred: {str(e)}")
        return None
def get_value_by_key(bildid: str, key: str, client=client) -> Any:
    """Return one top-level field of the dataset document for ``bildid``.

    Args:
        bildid: Value matched against the ``bildid`` field.
        key: Name of the top-level field to read from the document.
        client: MongoDB client to query; defaults to the module-level one.

    Returns:
        The stored value, or None when the document is not found, the key
        is absent, or the query fails (the error is printed).
    """
    try:
        collection = client['brainimagelibrary']['datasets']
        # Query the document by bildid
        document = collection.find_one({'bildid': bildid})
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

    # A missing document and a missing key both yield None.
    if not document:
        return None
    return document.get(key)
def _nested_str(container: Any, key: str) -> Any:
    """Return ``str(container[key])``, or None when the value cannot be read."""
    try:
        return str(container[key])
    except (KeyError, IndexError, TypeError):
        # Container missing (None), not subscriptable, or key absent.
        return None


def _coverage_score(coverage: Any) -> float:
    """Return the sum of the usable coverage values divided by 4.0.

    None and NaN entries are skipped; 0.0 is returned when ``coverage`` is
    missing or its values cannot be summed.
    """
    try:
        usable = (
            value
            for value in coverage.values()
            if value is not None
            and not (isinstance(value, float) and math.isnan(value))
        )
        return sum(usable) / 4.0
    except (AttributeError, TypeError):
        return 0.0


# Fill in size, checksum-coverage and file-type columns of the summary table
# from the MongoDB dataset documents, then write the table back in place.
try:
    if Path(filename).exists():
        df = pd.read_csv(filename, sep='\t', low_memory=False)
        df = df.sort_values(by="bildirectory", key=lambda x: x.str.lower())

        # Insert or update documents in MongoDB
        for index, row in df.iterrows():
            bildid = row['bildid']
            print(bildid)

            # One query per row: every field below comes from the same
            # document, so fetch it once instead of once per field.
            document = get_document(bildid) or {}
            size_info = document.get('size')
            coverage = document.get('coverage')

            try:
                size = int(size_info['bytes'])
                print(f'Size in bytes: {size}')
                df.at[index, 'size'] = size
            except (KeyError, TypeError, ValueError, OverflowError):
                traceback.print_exc()
                df.at[index, 'size'] = None

            df.at[index, 'pretty_size'] = _nested_str(size_info, 'pretty')

            for algorithm in ('md5', 'sha256', 'xxh64', 'b2sum'):
                df.at[index, f'{algorithm}_coverage'] = _nested_str(
                    coverage, algorithm
                )

            df.at[index, 'json_file'] = get_json_filename(bildid)
            df.at[index, 'number_of_files'] = document.get('number_of_files')
            # str() is applied even to missing values, so an absent field is
            # stored as the text 'None' (unchanged from the previous behavior).
            df.at[index, 'mime_types'] = str(document.get('mime-types'))
            df.at[index, 'frequencies'] = str(document.get('frequencies'))
            df.at[index, 'file_types'] = str(document.get('file_types'))

            df.at[index, 'score'] = _coverage_score(coverage)

        # Written once, after every row has been processed.
        df.to_csv(filename, sep='\t', index=False)
finally:
    # Release the connection even if reading, updating or writing failed.
    client.close()