-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmulti_level_dedup.py
More file actions
33 lines (27 loc) · 970 Bytes
/
multi_level_dedup.py
File metadata and controls
33 lines (27 loc) · 970 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# multi_level_dedup.py - Two-level deduplication
import hashlib
import xxhash # pip install xxhash
def compute_signatures(chunk_data):
    """Return fast, medium, and strong hex digests for ``chunk_data``.

    The result is a dict with three string values, keyed in this order:

    * ``"fast"``   - xxHash64 hex digest, intended for quick candidate
      matching.
    * ``"medium"`` - BLAKE2b hex digest with a 16-byte digest size,
      intended for filtering.
    * ``"strong"`` - SHA-256 hex digest, intended for final verification.

    ``chunk_data`` is passed unchanged to each hash constructor, so it must
    be a type they all accept (hashlib requires a bytes-like object).
    """
    # Cheapest hash first, strongest last.
    fast_digest = xxhash.xxh64(chunk_data).hexdigest()
    medium_digest = hashlib.blake2b(chunk_data, digest_size=16).hexdigest()
    strong_digest = hashlib.sha256(chunk_data).hexdigest()

    return {
        "fast": fast_digest,
        "medium": medium_digest,
        "strong": strong_digest,
    }
def find_similar_chunks(db, fast_hash):
    """Return all stored ``Chunk`` rows whose ``fast_hash`` equals ``fast_hash``.

    The lookup is on the fast hash only; the rows returned are candidates,
    and the intent is that the more expensive SHA-256 comparison is then
    needed only for these candidates instead of for every stored chunk.

    ``db`` must expose ``query(...)`` returning an object that supports
    ``.filter(...)`` and ``.all()`` -- presumably a SQLAlchemy-style session;
    confirm against the caller.

    NOTE(review): ``Chunk`` is not imported or defined in the visible part
    of this file; it must be in module scope when this function is called.
    """
    # Look up candidates by the fast-hash column.
    chunk_query = db.query(Chunk)
    same_fast_hash = chunk_query.filter(Chunk.fast_hash == fast_hash)
    return same_fast_hash.all()