Spaces:

ssaiteja16
/

RagBenchCapstone10

Sleeping

App Files Files Community

Saiteja Solleti committed on Feb 15

Commit

9a8353d

1 Parent(s): e8e78ae

milvus insert and search addition

Browse files

Files changed (7) hide show

app.py +25 -2
createmilvusschema.py +0 -1
crudmilvus.py +0 -13
insertmilvushelper.py +143 -0
loaddataset.py +1 -1
requirements.txt +2 -1
searchmilvushelper.py +52 -0

app.py CHANGED Viewed

@@ -3,16 +3,28 @@ import os
 from loaddataset import ExtractRagBenchData
 from createmilvusschema import CreateMilvusDbSchema
 from model import generate_response
 from huggingface_hub import login
 from huggingface_hub import whoami
 from huggingface_hub import dataset_info
 hf_token = os.getenv("HF_TOKEN")
 login(hf_token)
 rag_extracted_data = ExtractRagBenchData()
 #invoke create milvus db function
 try:
@@ -20,12 +32,23 @@ try:
 except Exception as e:
     print(f"Error creating Milvus DB schema: {e}")
-print(rag_extracted_data.head(5))
 def chatbot(prompt):
     # Stub handler passed to gr.Interface as `fn`: `prompt` is not used, and
     # the reply is whatever huggingface_hub's whoami() returns.
     return whoami()
-iface = gr.Interface(fn=chatbot, inputs="text", outputs="text", title="Capstone Project Group 10")
 if __name__ == "__main__":
     iface.launch()

 from loaddataset import ExtractRagBenchData
 from createmilvusschema import CreateMilvusDbSchema
+from crudmilvushelper import EmbedAllDocumentsAndInsert
+from sentence_transformers import SentenceTransformer
+from searchmilvushelper import SearchTopKDocuments
 from model import generate_response
 from huggingface_hub import login
 from huggingface_hub import whoami
 from huggingface_hub import dataset_info
+# Load embedding model
+QUERY_EMBEDDING_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
+WINDOW_SIZE = 5
+OVERLAP = 2
+RETRIVE_TOP_K_SIZE=10
 hf_token = os.getenv("HF_TOKEN")
 login(hf_token)
 rag_extracted_data = ExtractRagBenchData()
+print(rag_extracted_data.head(5))
 #invoke create milvus db function
 try:
 except Exception as e:
     print(f"Error creating Milvus DB schema: {e}")
+#insert embeddings into milvus db
+"""
+EmbedAllDocumentsAndInsert(QUERY_EMBEDDING_MODEL, rag_extracted_data, db_collection, window_size=WINDOW_SIZE, overlap=OVERLAP)
+"""
+query = "what would the net revenue have been in 2015 if there wasn't a stipulated settlement from the business combination in october 2015?"
+results_for_top5_chunks = SearchTopKDocuments(db_collection, query, QUERY_EMBEDDING_MODEL, top_k=RETRIVE_TOP_K_SIZE)
+print(results_for_top5_chunks)
 def chatbot(prompt):
     # Stub handler passed to gr.Interface as `fn`: `prompt` is not used, and
     # the reply is whatever huggingface_hub's whoami() returns.
     return whoami()
+iface = gr.Interface(fn=chatbot,
+                     inputs="text",
+                     outputs="text",
+                     title="Capstone Project Group 10")
 if __name__ == "__main__":
     iface.launch()

createmilvusschema.py CHANGED Viewed

@@ -5,7 +5,6 @@ milvus_token = os.getenv("MILVUS_TOKEN")
 COLLECTION_NAME = "final_ragbench_document_embeddings"
 MILVUS_CLOUD_URI = "https://in03-7b4da1b7b588a88.serverless.gcp-us-west1.cloud.zilliz.com"
-connections.connect("default", uri=MILVUS_CLOUD_URI, token=milvus_token)
 #Function to create milvus db schema to insert the data
 def CreateMilvusDbSchema():

 COLLECTION_NAME = "final_ragbench_document_embeddings"
 MILVUS_CLOUD_URI = "https://in03-7b4da1b7b588a88.serverless.gcp-us-west1.cloud.zilliz.com"
 #Function to create milvus db schema to insert the data
 def CreateMilvusDbSchema():

crudmilvus.py DELETED Viewed

@@ -1,13 +0,0 @@
-import os
-from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
-from sentence_transformers import SentenceTransformer
-milvus_token = os.getenv("MILVUS_TOKEN")
-COLLECTION_NAME = "final_ragbench_document_embeddings"
-MILVUS_CLOUD_URI = "https://in03-7b4da1b7b588a88.serverless.gcp-us-west1.cloud.zilliz.com"
-connections.connect("default", uri=MILVUS_CLOUD_URI, token=milvus_token)
-# Verify connection
-print(connections.get_connection_addr("default"))

insertmilvushelper.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import nltk
+import pandas as pd
+import numpy as np
+nltk.data.path.append("/content/nltk_data")
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('omw-1.4')
+nltk.download('punkt_tab')
+from nltk.tokenize import sent_tokenize
def split_into_sliding_windows(sentences, window_size=6, overlap=3):
    """Split a list of sentences into overlapping text chunks.

    A window of ``window_size`` sentences slides over ``sentences`` in steps
    of ``window_size - overlap``. Each window is joined with single spaces
    into one text block.

    Every sentence ends up in at least one chunk: a document shorter than one
    window yields a single chunk holding all of it, and the sliding stops as
    soon as a window reaches the last sentence, so no chunk is merely a
    suffix of the chunk before it.

    Args:
        sentences (list): Sentences to split into chunks.
        window_size (int): Number of sentences in each chunk. Default is 6.
        overlap (int): Number of sentences shared by consecutive chunks.
            Default is 3.

    Returns:
        list: Text chunks, each a string of space-joined sentences. Empty
        when ``sentences`` is empty.

    Raises:
        ValueError: If ``window_size`` is not greater than ``overlap``.
    """
    # A step of zero or less would never advance the window.
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap.")
    if not sentences:
        return []

    step = window_size - overlap  # How far the window moves each time
    total = len(sentences)
    chunks = []
    for start in range(0, total, step):
        chunks.append(" ".join(sentences[start:start + window_size]))
        # This window already covers the final sentence; any further window
        # would only repeat a tail of it.
        if start + window_size >= total:
            break
    return chunks
def _score_or_zero(value):
    """Return ``value`` as a float, using 0.0 when pandas treats it as missing."""
    return float(value) if pd.notna(value) else 0.0


def EmbedAllDocumentsAndInsert(model, extracted_data, collectionInstance, window_size=5, overlap=2):
    """Chunk every document, embed each chunk and insert it into Milvus.

    For each row of ``extracted_data`` the document text is split into
    sentences, grouped into overlapping windows, and every window is encoded
    with ``model`` and handed to ``insert_embeddings_into_milvus`` together
    with the row's metadata. Progress is printed to stdout.

    Args:
        model: Embedding model; ``model.encode(text)`` is called once per chunk.
        extracted_data: pandas DataFrame with the columns ``documents``,
            ``id``, ``gpt3_context_relevance``, ``gpt35_utilization``,
            ``gpt3_adherence``, ``dataset_name``, ``relevance_score``,
            ``utilization_score`` and ``completeness_score``.
        collectionInstance: Milvus collection instance to insert data into.
        window_size: Number of sentences in each chunk.
        overlap: Number of overlapping sentences between consecutive chunks.

    Returns:
        None.
    """
    count = 0
    total_docs = len(extracted_data)
    print(f"Total documents: {total_docs}")

    for index, row in extracted_data.iterrows():
        document = row["documents"]  # Document text (string or list of strings)
        doc_id = row["id"]
        datasetname = row["dataset_name"]

        # The scores are per-document values, so normalize missing entries to
        # 0.0 once here instead of once per chunk.
        doccontextrel = _score_or_zero(row["gpt3_context_relevance"])
        doccontextutil = _score_or_zero(row["gpt35_utilization"])
        docadherence = _score_or_zero(row["gpt3_adherence"])
        relevance_score = _score_or_zero(row["relevance_score"])
        utilization_score = _score_or_zero(row["utilization_score"])
        completeness_score = _score_or_zero(row["completeness_score"])

        if isinstance(document, list):
            # Flatten the list into a single string; non-string items are skipped.
            document = " ".join(item for item in document if isinstance(item, str))
        elif not isinstance(document, str):
            # Anything that is neither a string nor a list is stringified.
            document = str(document)

        # Step 1: Tokenize document into sentences
        sentences = sent_tokenize(document)

        # Step 2: Generate overlapping chunks
        chunks = split_into_sliding_windows(sentences, window_size, overlap)
        print(f"Total chunks for document {index}: {len(chunks)}")

        for chunk_index, chunk_text in enumerate(chunks):
            # Step 3: Generate embedding for each chunk as a flat list of float32 values
            chunk_vector = np.array(model.encode(chunk_text), dtype=np.float32).flatten().tolist()
            print(f"chunk_index= {chunk_index}")

            # Step 4: Insert chunk into Milvus as separate columns
            insert_embeddings_into_milvus(
                collectionInstance,
                chunk_vector,
                f"{chunk_index}__{doc_id}",  # Unique ID for chunk
                doc_id,  # Unique ID for doc
                index,
                doccontextrel,
                doccontextutil,
                docadherence,
                datasetname,
                relevance_score,
                utilization_score,
                completeness_score,
            )
            count += 1
            if count % 1000 == 0:
                print(f"Uploaded {count} chunks to Milvus.")
def insert_embeddings_into_milvus(collection, embeddings, chunk_doc_id, doc_id, index,
                                  doccontextrel, doccontextutil, docadherence, datasetname,
                                  relevance_score, utilization_score, completeness_score):
    """Insert one chunk embedding and its metadata into a Milvus collection.

    The insert is best-effort: any exception raised while building or sending
    the row is printed and reported through the return value rather than
    propagated, so one bad chunk does not abort a bulk upload.

    Args:
        collection: Milvus collection instance; its ``insert`` method is called.
        embeddings: Embedding vector for the chunk.
        chunk_doc_id: Unique ID for the chunk (primary key).
        doc_id: Unique ID for the document.
        index: Index of the document in the dataset (used in log output only).
        doccontextrel: Context relevance score.
        doccontextutil: Context utilization score.
        docadherence: Adherence score.
        datasetname: Name of the dataset.
        relevance_score: Relevance score.
        utilization_score: Utilization score.
        completeness_score: Completeness score.

    Returns:
        bool: True when ``collection.insert`` completed, False when an error
        was caught and printed.
    """
    try:
        print(f"Inserting chunk {chunk_doc_id} doc {doc_id} (index {index})")
        # Column-oriented payload: one single-element list per field, in the
        # order this function has always sent them.
        insert_data = [
            [str(chunk_doc_id)],  # Primary key field (chunk ID)
            [str(doc_id)],  # Document ID field
            [embeddings],  # Vector field (embedding)
            [float(doccontextrel)],  # Context relevance score field
            [float(doccontextutil)],  # Context utilization score field
            [float(docadherence)],  # Adherence score field
            [str(datasetname)],  # Dataset name field
            [float(relevance_score)],  # Relevance score field
            [float(utilization_score)],  # Utilization score field
            [float(completeness_score)],  # Completeness score field
        ]
        collection.insert(insert_data)
    except Exception as e:
        print(f"Error inserting chunk {chunk_doc_id} doc {doc_id} (index {index}): {e}")
        return False
    return True

loaddataset.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import pandas as pd
 from datasets import load_dataset
 from logger import logger
-from typing import Dict, List, Optional
 DATASET_CONFIGS = [

 import pandas as pd
 from datasets import load_dataset
 from logger import logger
+from typing import Dict, List
 DATASET_CONFIGS = [

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ gradio
 transformers
 torch
 huggingface_hub
-pymilvus

 transformers
 torch
 huggingface_hub
+pymilvus
+nltk

searchmilvushelper.py ADDED Viewed

	@@ -0,0 +1,52 @@

def SearchTopKDocuments(collection, query_text, model, top_k=10):
    """Search Milvus for the chunks most similar to a query text.

    The query is embedded with ``model`` and searched against the
    ``chunk_embedding`` field using the COSINE metric. All scalar columns
    listed in ``returned_fields`` are fetched for every hit.

    Args:
        collection: Milvus collection instance; its ``search`` method is called.
        query_text: Text to embed and search for.
        model: Embedding model; called as
            ``model.encode(query_text, convert_to_numpy=True)``.
        top_k: Maximum number of hits to request. Default is 10.

    Returns:
        list: One dict per hit, holding every name in ``returned_fields``
        plus ``"distance"`` (the score reported by Milvus for the hit).
    """
    # Single source of truth for the columns requested from Milvus and copied
    # into each result dict, so the two can never drift apart.
    returned_fields = [
        "chunk_doc_id",  # Primary key
        "doc_id",  # Document ID
        "context_relevance",  # Context Relevance Score
        "context_utilization",  # Context Utilization Score
        "adherence",  # Adherence Score
        "dataset_name",  # Dataset Name
        "relevance_score",  # Relevance Score
        "utilization_score",  # Utilization Score
        "completeness_score",  # Completeness Score
    ]

    # Generate embedding for the query text
    query_embedding = model.encode(query_text, convert_to_numpy=True)

    # Define search parameters
    search_params = {
        "metric_type": "COSINE",  # Similarity metric
        "params": {"ef": 64},  # Controls recall, higher values = better accuracy but slower
    }

    # Perform the search
    results = collection.search(
        data=[query_embedding],
        anns_field="chunk_embedding",  # Field containing the embeddings
        param=search_params,
        limit=top_k,
        output_fields=returned_fields,
    )

    # Process and return the results
    top_documents = []
    for hits in results:
        for hit in hits:
            doc = {field: hit.entity.get(field) for field in returned_fields}
            # With the COSINE metric Milvus reports a similarity here: larger
            # values mean a closer match, despite the attribute's name.
            doc["distance"] = hit.distance
            top_documents.append(doc)
    return top_documents