-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtest-sparse-sample.py
More file actions
136 lines (117 loc) · 4.24 KB
/
test-sparse-sample.py
File metadata and controls
136 lines (117 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# test-sparse-sample.py
import numpy as np
import random
from cosdata import Client
import logging
# Emit INFO-and-above records, each prefixed with a timestamp and its level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by the functions in this script.
logger = logging.getLogger(__name__)
def generate_random_sparse_vector(dimension: int, non_zero_dims: int = 100) -> dict:
    """Generate a random sparse vector of the specified dimension.

    The number of non-zero entries is drawn uniformly from
    ``[20, non_zero_dims]``. Both bounds are clamped so the count never
    exceeds ``dimension`` (indices must be unique) and never drops below 0,
    which keeps small dimensions and small ``non_zero_dims`` values from
    raising ``ValueError``.

    Args:
        dimension: Size of the index space; indices are drawn from
            ``range(dimension)``.
        non_zero_dims: Upper bound on the number of non-zero entries.

    Returns:
        A dict with two equally long lists: ``"indices"`` (sorted, unique
        ints) and ``"values"`` (floats drawn uniformly from ``[0.0, 2.0)``).
    """
    # Cannot sample more unique indices than the index space contains.
    upper = max(0, min(non_zero_dims, dimension))
    # Keep the usual floor of 20 entries, unless the ceiling is lower.
    lower = min(20, upper)
    count = random.randint(lower, upper)

    # Unique indices, returned in ascending order.
    indices = sorted(random.sample(range(dimension), count))

    # One value per index, uniformly distributed in [0.0, 2.0).
    values = np.random.uniform(0.0, 2.0, count).tolist()

    return {
        "indices": indices,
        "values": values,
    }
def main():
    """Run an end-to-end sparse-vector walkthrough against a local server.

    Connects to ``http://127.0.0.1:8443``, creates a sparse-only collection
    and a sparse index, upserts 1000 random sparse vectors in one
    transaction, checks that the first vector exists, runs a sparse search
    with a random query, logs the current version, and finally deletes the
    index and the collection.

    Cleanup runs in ``finally`` blocks so that a failure part-way through
    does not leave the test collection behind on the server.
    """
    # Initialize the client
    client = Client(
        host="http://127.0.0.1:8443",
        username="admin",
        password="test_key",
    )

    # Configuration
    collection_name = "test_sparse_collection"
    dimension = 768
    description = "Test collection for sparse vector operations"

    logger.info("\n=== Sparse Vector Collection Management ===")

    # Create a new sparse collection (dense vectors and TF-IDF disabled)
    collection = client.create_collection(
        name=collection_name,
        dimension=dimension,
        description=description,
        dense_vector={
            "enabled": False,
            "dimension": dimension,
        },
        sparse_vector={
            "enabled": True,
        },
        tf_idf_options={
            "enabled": False,
        },
    )

    # Stays None until the index is created, so cleanup knows whether
    # there is an index to delete.
    index = None
    try:
        logger.info(f"Created sparse collection: {collection.name}")

        # List all collections
        collections = client.collections()
        logger.info("\nAll collections:")
        for coll in collections:
            logger.info(f" - {coll.name}")

        logger.info("\n=== Sparse Index Management ===")

        # Create a sparse vector index
        index = collection.create_sparse_index(
            name="sparse_index",
            quantization=64,
            sample_threshold=1000,
        )
        logger.info(f"Created sparse index: {index.name}")

        # Get index information
        index_info = collection.get_index(index.name)
        logger.info(f"\nIndex information: {index_info}")

        logger.info("\n=== Sparse Vector Operations ===")

        # Generate some test sparse vectors
        num_vectors = 1000
        sparse_vectors = []
        for i in range(num_vectors):
            sparse_data = generate_random_sparse_vector(dimension)
            sparse_vectors.append({
                "id": f"sparse_vec_{i+1}",
                "sparse_values": sparse_data["values"],
                "sparse_indices": sparse_data["indices"],
                # Group vectors into documents: ten consecutive vectors
                # share one document id.
                "document_id": f"doc_{i//10}",
            })
        logger.info(f"Generated {len(sparse_vectors)} test sparse vectors")

        # Add sparse vectors through a transaction
        logger.info("Starting transaction...")
        with collection.transaction() as txn:
            txn.batch_upsert_vectors(sparse_vectors)
        logger.info("Added sparse vectors through transaction")

        # Verify vector existence
        test_vector_id = sparse_vectors[0]["id"]
        exists = collection.vectors.exists(test_vector_id)
        logger.info(f"\nVector {test_vector_id} exists: {exists}")

        logger.info("\n=== Sparse Search Operations ===")

        # Perform sparse vector search with a fresh random query, passed
        # as a list of [index, value] pairs.
        query_vector = generate_random_sparse_vector(dimension)
        query_terms = [
            [idx, val]
            for idx, val in zip(query_vector["indices"], query_vector["values"])
        ]
        # Log only the first 5 terms for debugging
        logger.info(f"Query vector: {query_terms[:5]}...")
        sparse_results = collection.search.sparse(
            query_terms=query_terms,
            top_k=5,
            early_terminate_threshold=0.0,
            return_raw_text=True,
        )
        logger.info(f"Sparse search results: {sparse_results}")

        logger.info("\n=== Version Management ===")

        # Get current version
        current_version = collection.versions.get_current()
        logger.info(f"Current version: {current_version}")
    finally:
        # Cleanup runs on success and on failure alike.
        logger.info("\n=== Cleanup ===")
        try:
            # Delete the index, if it was created
            if index is not None:
                index.delete()
                logger.info("Deleted sparse index")
        finally:
            # Delete the collection even if index deletion failed
            collection.delete()
            logger.info("Deleted collection")
# Run the walkthrough only when executed as a script, not when imported.
if __name__ == "__main__":
    main()