# cosdata-sdk-python/test-embed-and-upsert.py (repository cosdata/cosdata-sdk-python, branch main)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import logging
from cosdata import Client
from cosdata.embedding import embed_texts

# Root logging setup: INFO and above, each record rendered as
# "<timestamp> - <level name> - <message>".
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)

def main():
    """Run an end-to-end embed / upsert / search demo against a Cosdata server.

    The script connects to the server configured below, creates a collection
    and a dense index on it, embeds a handful of example sentences with
    ``embed_texts``, upserts the resulting vectors inside a single
    transaction, runs one dense search using the first embedding as the
    query, and finally deletes the collection.

    The collection is deleted in a ``finally`` block, so a failure in any
    step after creation (indexing, embedding, upsert, search) does not leave
    the test collection behind on the server. Any exception raised by those
    steps still propagates to the caller after cleanup has been attempted.
    """
    # Initialize the client
    client = Client(
        host="http://127.0.0.1:8443",
        username="admin",
        password="test_key"
    )

    # Configuration
    collection_name = "test_embed_collection"
    dimension = 768  # thenlper/gte-base has 768 dimensions
    description = "Test collection for embedding utility demonstration"

    logger.info("=== Creating Collection ===")
    # Created outside the try block: if creation itself fails there is
    # nothing to clean up.
    collection = client.create_collection(
        name=collection_name,
        dimension=dimension,
        description=description
    )
    logger.info("Created collection: %s", collection.name)

    try:
        logger.info("=== Creating Dense Index ===")
        index = collection.create_index(
            distance_metric="cosine",
            num_layers=7,
            max_cache_size=1000,
            ef_construction=512,
            ef_search=256,
            neighbors_count=32,
            level_0_neighbors_count=64
        )
        logger.info("Created dense index: %s", index.name)

        # Example texts to embed
        texts = [
            "Cosdata makes vector search easy!",
            "This is a test of the embedding utility.",
            "You can use different models for your embeddings.",
            "Let's try a non-default model for demonstration.",
            "Embeddings are essential for semantic search."
        ]

        # Use a non-default model (e.g., 'thenlper/gte-base')
        model_name = "thenlper/gte-base"
        logger.info("Generating embeddings using model: %s", model_name)
        embeddings = embed_texts(texts, model_name=model_name)
        logger.info("Generated %s embeddings.", len(embeddings))

        # Upsert embeddings into the collection
        logger.info("Upserting embeddings into the collection...")
        with collection.transaction() as txn:
            for i, emb in enumerate(embeddings):
                txn.upsert_vector({
                    # Vector ids are 1-based: embed_vec_1, embed_vec_2, ...
                    "id": f"embed_vec_{i+1}",
                    "dense_values": emb,
                    # Integer division groups consecutive pairs of vectors
                    # under the same document id (doc_0, doc_0, doc_1, ...).
                    "document_id": f"doc_{i//2}"
                })
        logger.info("Upserted embeddings.")

        # Perform a search using the first embedding
        logger.info("Performing dense search with the first embedding...")
        results = collection.search.dense(
            query_vector=embeddings[0],
            top_k=3,
            return_raw_text=True
        )
        logger.info("Search results: %s", results)
    finally:
        # Cleanup runs on both success and failure so the test collection
        # is not left on the server after an aborted run.
        logger.info("Cleaning up: deleting collection.")
        collection.delete()
        logger.info("Deleted collection.")

# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()