s4um1l committed
Commit ba10a58 · 1 Parent(s): 4b9a663

introducing parallel processing to make chunking and embedding quicker

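Taken together, the change threads two new knobs through the pipeline: max_workers for parallel chunking in CharacterTextSplitter and batch_size for batched embedding in VectorDatabase. A minimal sketch of exercising both after this commit (assuming the aimakerspace package is importable and an OpenAI API key is configured in the environment; the build_index driver is illustrative, not part of the commit):

    import asyncio

    from aimakerspace.openai_utils.embedding import EmbeddingModel
    from aimakerspace.text_utils import CharacterTextSplitter
    from aimakerspace.vectordatabase import VectorDatabase

    async def build_index(documents):
        # Chunk the documents across several threads (new max_workers parameter).
        splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=150, max_workers=8)
        chunks = splitter.split_texts(documents)

        # Embed the chunks in groups (new batch_size parameter).
        vector_db = VectorDatabase(embedding_model=EmbeddingModel(), batch_size=20)
        await vector_db.abuild_from_list(chunks)
        return vector_db

    # Example driver; replace the placeholder string with real document text.
    db = asyncio.run(build_index(["long document text ..."]))
    print(f"Indexed {len(db.vectors)} chunks")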
aimakerspace/text_utils.py CHANGED
@@ -1,6 +1,13 @@
 import os
 from typing import List
 import PyPDF2
+import concurrent.futures
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 
 class TextFileLoader:
@@ -42,6 +49,7 @@ class CharacterTextSplitter:
         self,
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
+        max_workers: int = 4
     ):
         assert (
             chunk_size > chunk_overlap
@@ -49,6 +57,7 @@ class CharacterTextSplitter:
 
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
+        self.max_workers = max_workers
 
     def split(self, text: str) -> List[str]:
         chunks = []
@@ -57,9 +66,29 @@ class CharacterTextSplitter:
         return chunks
 
     def split_texts(self, texts: List[str]) -> List[str]:
+        logger.info(f"Splitting {len(texts)} texts in parallel with {self.max_workers} workers")
         chunks = []
-        for text in texts:
-            chunks.extend(self.split(text))
+
+        # Use parallel processing if there are multiple texts or large single text
+        if len(texts) > 1 or (len(texts) == 1 and len(texts[0]) > 50000):
+            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                # Map the split function to the list of texts
+                future_to_text = {executor.submit(self.split, text): text for text in texts}
+
+                # Collect results as they complete
+                for future in concurrent.futures.as_completed(future_to_text):
+                    try:
+                        text_chunks = future.result()
+                        chunks.extend(text_chunks)
+                        logger.info(f"Processed text chunk batch: {len(text_chunks)} chunks")
+                    except Exception as e:
+                        logger.error(f"Error processing text chunk: {str(e)}")
+        else:
+            # For small amounts of text, process sequentially
+            for text in texts:
+                chunks.extend(self.split(text))
+
+        logger.info(f"Completed splitting texts into {len(chunks)} chunks")
         return chunks
 
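One behavioural detail of the new split_texts: concurrent.futures.as_completed yields futures in completion order, so chunks from different source texts can interleave in the returned list rather than following the input order. If downstream code cares about ordering, a sketch of an order-preserving variant using executor.map on the same thread pool (a standalone helper, not part of the commit) would be:

    import concurrent.futures
    from typing import List

    def split_texts_ordered(splitter, texts: List[str]) -> List[str]:
        # executor.map returns results in input order while still
        # running splitter.split on multiple threads.
        chunks: List[str] = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=splitter.max_workers) as executor:
            for text_chunks in executor.map(splitter.split, texts):
                chunks.extend(text_chunks)
        return chunks

Since split is pure-Python string work, threads mainly pay off when many texts are split at once; measuring on real documents is the safest check of the speedup the commit message claims.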
aimakerspace/vectordatabase.py CHANGED
@@ -1,8 +1,16 @@
 import numpy as np
 from collections import defaultdict
-from typing import List, Tuple, Callable
+from typing import List, Tuple, Callable, Dict
 from aimakerspace.openai_utils.embedding import EmbeddingModel
 import asyncio
+import logging
+import concurrent.futures
+import time
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 
 def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
@@ -14,9 +22,10 @@ def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
 
 
 class VectorDatabase:
-    def __init__(self, embedding_model: EmbeddingModel = None):
+    def __init__(self, embedding_model: EmbeddingModel = None, batch_size: int = 25):
         self.vectors = defaultdict(np.array)
         self.embedding_model = embedding_model or EmbeddingModel()
+        self.batch_size = batch_size  # Process embeddings in batches for better performance
 
     def insert(self, key: str, vector: np.array) -> None:
         self.vectors[key] = vector
@@ -48,9 +57,43 @@ class VectorDatabase:
         return self.vectors.get(key, None)
 
     async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
-        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
-        for text, embedding in zip(list_of_text, embeddings):
-            self.insert(text, np.array(embedding))
+        start_time = time.time()
+
+        if not list_of_text:
+            logger.warning("Empty list provided to build vector database")
+            return self
+
+        logger.info(f"Building embeddings for {len(list_of_text)} text chunks in batches of {self.batch_size}")
+
+        # Process in batches to avoid overwhelming the API
+        batches = [list_of_text[i:i + self.batch_size] for i in range(0, len(list_of_text), self.batch_size)]
+        logger.info(f"Split into {len(batches)} batches")
+
+        for i, batch in enumerate(batches):
+            batch_start = time.time()
+            logger.info(f"Processing batch {i+1}/{len(batches)} with {len(batch)} text chunks")
+
+            try:
+                # Get embeddings for this batch
+                embeddings = await self.embedding_model.async_get_embeddings(batch)
+
+                # Insert into vector database
+                for text, embedding in zip(batch, embeddings):
+                    self.insert(text, np.array(embedding))
+
+                batch_duration = time.time() - batch_start
+                logger.info(f"Batch {i+1} completed in {batch_duration:.2f}s")
+
+                # Small delay between batches to avoid rate limiting
+                if i < len(batches) - 1:
+                    await asyncio.sleep(0.5)
+
+            except Exception as e:
+                logger.error(f"Error processing batch {i+1}: {str(e)}")
+                # Continue with next batch even if this one failed
+
+        total_duration = time.time() - start_time
+        logger.info(f"Vector database built with {len(self.vectors)} vectors in {total_duration:.2f}s")
         return self
 
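As written, abuild_from_list awaits each batch in turn with a 0.5 s pause, so the gain comes from sending up to batch_size texts per embedding request rather than from concurrent requests. If the embedding endpoint tolerates a few requests in flight (an assumption, not something this commit establishes), a sketch of overlapping batches behind a semaphore, using only the async_get_embeddings and insert methods shown above, might look like:

    import asyncio
    import numpy as np

    async def embed_batches_concurrently(vector_db, batches, max_in_flight: int = 3):
        # Hypothetical variant: keep several embedding batches in flight at once,
        # capped by a semaphore, instead of a fixed sleep between batches.
        semaphore = asyncio.Semaphore(max_in_flight)

        async def embed_one(batch):
            async with semaphore:
                return batch, await vector_db.embedding_model.async_get_embeddings(batch)

        for batch, embeddings in await asyncio.gather(*(embed_one(b) for b in batches)):
            for text, embedding in zip(batch, embeddings):
                vector_db.insert(text, np.array(embedding))
        return vector_db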
backend/rag.py CHANGED
@@ -92,7 +92,7 @@ class RetrievalAugmentedQAPipeline:
         }
 
 def process_file(file_path: str, file_name: str) -> List[str]:
-    """Process an uploaded file and convert it to text chunks"""
+    """Process an uploaded file and convert it to text chunks - optimized for speed"""
     logger.info(f"Processing file: {file_name} at path: {file_path}")
 
     try:
@@ -117,10 +117,20 @@ def process_file(file_path: str, file_name: str) -> List[str]:
             logger.warning("No document content loaded")
             return ["No content found in the document"]
 
-        # Split text into chunks
-        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+        # Split text into chunks - use parallel processing
+        logger.info("Splitting document with parallel processing")
+        chunk_size = 1500  # Increased from 1000 for fewer chunks
+        chunk_overlap = 150  # Increased from 100 for better context
+        # Use 8 workers for parallel processing
+        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, max_workers=8)
         text_chunks = text_splitter.split_texts(documents)
 
+        # Limit chunks to avoid processing too many for speed
+        max_chunks = 40  # Reduced from default
+        if len(text_chunks) > max_chunks:
+            logger.warning(f"Too many chunks ({len(text_chunks)}), limiting to {max_chunks} for faster processing")
+            text_chunks = text_chunks[:max_chunks]
+
         logger.info(f"Split document into {len(text_chunks)} chunks")
         return text_chunks
 
@@ -130,23 +140,50 @@ def process_file(file_path: str, file_name: str) -> List[str]:
         return [f"Error processing file: {str(e)}"]
 
 async def setup_vector_db(texts: List[str]) -> VectorDatabase:
-    """Create vector database from text chunks"""
+    """Create vector database from text chunks - optimized with parallel processing"""
     logger.info(f"Setting up vector database with {len(texts)} text chunks")
 
+    # Create embedding model to use with VectorDatabase
     embedding_model = EmbeddingModel()
-    vector_db = VectorDatabase(embedding_model=embedding_model)
+    # Use batch size of 20 for better parallelization
+    vector_db = VectorDatabase(embedding_model=embedding_model, batch_size=20)
 
     try:
+        # Limit number of chunks for faster processing
+        max_chunks = 40
+        if len(texts) > max_chunks:
+            logger.warning(f"Limiting {len(texts)} chunks to {max_chunks} for vector embedding")
+            texts = texts[:max_chunks]
+
+        # Build vector database with batch processing
+        logger.info("Building vector database with batch processing")
         await vector_db.abuild_from_list(texts)
 
+        # Add documents property for compatibility
         vector_db.documents = texts
 
         logger.info(f"Vector database built with {len(texts)} documents")
         return vector_db
+    except asyncio.TimeoutError:
+        logger.error(f"Vector database creation timed out after 300 seconds")
+        # Create minimal fallback DB with just a few documents
+        fallback_db = VectorDatabase(embedding_model=embedding_model)
+        if texts:
+            # Use just first few texts for minimal functionality
+            minimal_texts = texts[:3]
+            for text in minimal_texts:
+                fallback_db.insert(text, [0.0] * 1536)  # Use zero vectors for speed
+            fallback_db.documents = minimal_texts
+        else:
+            error_text = "I'm sorry, but there was a timeout during document processing."
+            fallback_db.insert(error_text, [0.0] * 1536)
+            fallback_db.documents = [error_text]
+        return fallback_db
     except Exception as e:
         logger.error(f"Error setting up vector database: {str(e)}")
         logger.error(traceback.format_exc())
 
+        # Create fallback DB for this error case
         fallback_db = VectorDatabase(embedding_model=embedding_model)
         error_text = "I'm sorry, but there was an error processing the document."
         fallback_db.insert(error_text, [0.0] * 1536)
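Note that the new except asyncio.TimeoutError branch in setup_vector_db mentions a 300-second limit, but as far as this diff shows abuild_from_list is awaited directly, so that branch only fires if the embedding client itself raises the error. If a hard budget is the intent, a small sketch using asyncio.wait_for (the 300-second figure is taken from the log message; the helper name is illustrative) would enforce it:

    import asyncio

    async def build_with_timeout(vector_db, texts, timeout_seconds: float = 300.0):
        # Wrap the build so a slow embedding backend raises asyncio.TimeoutError,
        # which the existing fallback branch in setup_vector_db then handles.
        return await asyncio.wait_for(vector_db.abuild_from_list(texts), timeout=timeout_seconds)

Calling await build_with_timeout(vector_db, texts) in place of the direct await would make the timeout fallback reachable.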
frontend/src/App.js CHANGED
@@ -148,6 +148,8 @@ function FileUploader({ onFileUpload }) {
   const [isUploading, setIsUploading] = useState(false);
   const [uploadProgress, setUploadProgress] = useState(0);
   const [processingStatus, setProcessingStatus] = useState(null);
+  const [processingProgress, setProcessingProgress] = useState(0);
+  const [processingSteps, setProcessingSteps] = useState(0);
 
   const { getRootProps, getInputProps } = useDropzone({
     maxFiles: 1,
@@ -294,13 +296,71 @@ function FileUploader({ onFileUpload }) {
     }
   });
 
+  // Move pollSessionStatus inside the component where it has access to the necessary variables
+  const pollSessionStatus = async (sessionId, file, retries = 40, interval = 5000) => {
+    // Increased retries from 30 to 40 for longer processing documents
+    let currentRetry = 0;
+
+    while (currentRetry < retries) {
+      try {
+        const statusUrl = `${API_URL}/session/${sessionId}/status`;
+        console.log(`Checking status (attempt ${currentRetry + 1}/${retries}):`, statusUrl);
+
+        const statusResponse = await axios.get(statusUrl, {
+          timeout: 30000 // 30 second timeout for status checks
+        });
+
+        console.log('Status response:', statusResponse.data);
+
+        if (statusResponse.data.status === 'ready') {
+          setProcessingStatus('complete');
+          setProcessingProgress(100);
+          onFileUpload(sessionId, file.name);
+          return;
+        } else if (statusResponse.data.status === 'failed') {
+          setProcessingStatus('failed');
+          throw new Error('Processing failed on server');
+        }
+
+        // Still processing, update progress based on attempt number
+        setProcessingStatus('processing');
+        // Calculate progress - more rapid at start, slower towards end
+        const progressIncrement = 75 / retries; // Max out at 75% during polling
+        setProcessingProgress(Math.min(5 + (currentRetry * progressIncrement), 75));
+
+        // Increment processing steps to show activity
+        setProcessingSteps(prev => prev + 1);
+
+        await new Promise(resolve => setTimeout(resolve, interval));
+        currentRetry++;
+
+        // Increase interval slightly for each retry to prevent overwhelming the server
+        interval = Math.min(interval * 1.1, 15000); // Cap at 15 seconds
+      } catch (error) {
+        console.error('Error checking status:', error);
+
+        // If we hit a timeout or network issue, wait a bit longer before retrying
+        await new Promise(resolve => setTimeout(resolve, interval * 2));
+        currentRetry++;
+      }
+    }
+
+    // If we've exhausted all retries and still don't have a ready status
+    throw new Error('Status polling timed out');
+  };
+
   // Status message based on current processing state
   const getStatusMessage = () => {
+    const steps = ['Analyzing text', 'Splitting document', 'Creating embeddings', 'Building vector database', 'Finalizing'];
+    const currentStep = steps[processingSteps % steps.length];
+
     switch(processingStatus) {
       case 'starting':
         return 'Initiating hyperspace jump...';
+      case 'uploading':
+        return 'Sending document to the Jedi Archives...';
       case 'processing':
-        return 'The Force is analyzing your document... This may take several minutes.';
+        return `${currentStep}... This may take several minutes.`;
       case 'timeout':
         return 'Document processing is taking longer than expected. Patience, young Padawan...';
       case 'failed':
@@ -335,7 +395,7 @@ function FileUploader({ onFileUpload }) {
         <>
           <Text color="brand.500">Uploading to the Jedi Archives...</Text>
           <Progress
-            value={uploadProgress}
+            value={processingStatus === 'uploading' ? uploadProgress : processingProgress}
            size="sm"
            colorScheme="yellow"
            width="100%"
@@ -370,37 +430,12 @@ function App() {
   const handleFileUpload = (newSessionId, name) => {
     setSessionId(newSessionId);
     setFileName(name);
-    setIsDocProcessing(true);
+    setIsDocProcessing(false);
     setMessages([
-      { text: `Processing ${name}. May the Force be with you...`, isUser: false }
+      { text: `"${name}" has been added to the Jedi Archives. What knowledge do you seek?`, isUser: false }
     ]);
 
-    // Poll for document processing status
-    const checkStatus = async () => {
-      try {
-        const response = await axios.get(`${API_URL}/session/${newSessionId}/status`);
-        console.log('Status response:', response.data);
-
-        if (response.data.status === 'ready') {
-          setIsDocProcessing(false);
-          setMessages([
-            { text: `"${name}" has been added to the Jedi Archives. What knowledge do you seek?`, isUser: false }
-          ]);
-          return;
-        }
-
-        // Continue polling if still processing
-        if (response.data.status === 'processing') {
-          setTimeout(checkStatus, 2000);
-        }
-      } catch (error) {
-        console.error('Error checking status:', error);
-        // Continue polling even if there's an error
-        setTimeout(checkStatus, 3000);
-      }
-    };
-
-    checkStatus();
+    // Don't poll again - already handled in FileUploader
   };
 
   const handleSendMessage = async () => {