File size: 2,414 Bytes
e8e78ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf42161
 
 
 
 
 
 
 
 
 
ba6792e
cf42161
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Access token for the Milvus endpoint, taken from the environment
# (None when the MILVUS_TOKEN variable is not set).
milvus_token = os.environ.get("MILVUS_TOKEN")

# Name of the collection that stores the document chunk embeddings.
COLLECTION_NAME = "final_ragbench_document_embeddings"
# Serverless Zilliz Cloud endpoint the client connects to.
MILVUS_CLOUD_URI = (
    "https://in03-7b4da1b7b588a88.serverless.gcp-us-west1.cloud.zilliz.com"
)

# Create the Milvus collection schema (and its vector index) used for data insertion.
def CreateMilvusDbSchema():
    """Connect to Milvus, define the RAG Bench collection and index its vectors.

    Opens the ``"default"`` connection against ``MILVUS_CLOUD_URI`` using
    ``milvus_token``, builds the collection schema (a VARCHAR primary key,
    a 384-dimensional float vector field and several score/metadata fields),
    instantiates the collection named ``COLLECTION_NAME`` and requests an
    HNSW index with the COSINE metric on ``chunk_embedding``.

    Returns:
        The ``Collection`` object. It is returned even when index creation
        fails with an ordinary ``Exception``; in that case the failure is
        printed and the collection is left without the new index.

    Raises:
        Whatever ``connections.connect`` or ``Collection(...)`` raise; those
        calls are not guarded. ``KeyboardInterrupt``/``SystemExit`` raised
        during index creation also propagate to the caller.
    """
    connections.connect("default", uri=MILVUS_CLOUD_URI, token=milvus_token)
    print(connections.get_connection_addr("default"))

    # Define the fields for the collection
    fields = [
        FieldSchema(name="chunk_doc_id", dtype=DataType.VARCHAR, max_length=350, is_primary=True, auto_id=False),  # Primary Key
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=300),  # Document ID
        FieldSchema(name="chunk_embedding", dtype=DataType.FLOAT_VECTOR, dim=384),  # Vector Field (embedding)
        FieldSchema(name="context_relevance", dtype=DataType.FLOAT),  # Context Relevance Score
        FieldSchema(name="context_utilization", dtype=DataType.FLOAT),  # Context Utilization Score
        FieldSchema(name="adherence", dtype=DataType.FLOAT),  # Adherence Score
        FieldSchema(name="dataset_name", dtype=DataType.VARCHAR, max_length=300),  # Dataset Name
        FieldSchema(name="relevance_score", dtype=DataType.FLOAT),  # Relevance Score
        FieldSchema(name="utilization_score", dtype=DataType.FLOAT),  # Utilization Score
        FieldSchema(name="completeness_score", dtype=DataType.FLOAT),  # Completeness Score
    ]

    # Define the collection schema
    schema = CollectionSchema(fields, description="RAG Bench document vector collection")

    # Create the collection in Milvus
    collection = Collection(name=COLLECTION_NAME, schema=schema)

    # Index settings for vector search on the embedding field
    index_params = {
        "index_type": "HNSW",  # Hierarchical Navigable Small World (HNSW) index
        "metric_type": "COSINE",  # Cosine similarity for vector search
        "params": {"M": 16, "efConstruction": 200},  # HNSW parameters
    }

    try:
        collection.create_index("chunk_embedding", index_params)
    except Exception as e:
        # Best effort: report the failure but still hand back the collection.
        print(f"Failed to create index: {e}")
    else:
        print("Index created successfully.")
        print(f"Collection '{COLLECTION_NAME}' created successfully.")

    # Return outside of any ``finally`` clause: a ``return`` inside ``finally``
    # would silently discard in-flight exceptions such as KeyboardInterrupt.
    return collection