Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +138 -62
- data/Fab2Esp_transparent.png +0 -0
- parameters.py +3 -2
- src/fabrics_processor/config.py +10 -9
- src/fabrics_processor/database.py +11 -9
- src/fabrics_processor/database_updater.py +32 -27
- src/search_qdrant/streamlit_app.py +9 -9
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/Fab2Esp_transparent.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -6,98 +6,174 @@ sdk_version: 5.12.0
|
|
6 |
---
|
7 |
# Fabric to Espanso Converter
|
8 |
|
9 |
-
A Python application that bridges Fabric prompts with Espanso by managing and converting prompts through a vector database.
|
|
|
10 |
|
11 |
## Features
|
12 |
|
13 |
-
- Store and manage Fabric prompts in a Qdrant vector database
|
14 |
-
- Convert stored prompts into Espanso YAML format for system-wide usage
|
15 |
-
-
|
16 |
-
- Web interface for easy
|
|
|
|
|
|
|
17 |
|
18 |
## Prerequisites
|
19 |
|
20 |
-
- Python 3.11
|
21 |
-
-
|
22 |
-
-
|
23 |
-
-
|
|
|
24 |
|
25 |
## Installation
|
26 |
|
27 |
-
1.
|
28 |
-
2. In Obsidian, create the following folder structure:
|
29 |
-
```
|
30 |
-
Extra/
|
31 |
-
└── FabricPatterns/
|
32 |
-
    ├── Official/ # For downloaded Fabric patterns
|
33 |
-
    └── Own/ # For your custom additions
|
34 |
-
```
|
35 |
-
3. Clone this repository
|
36 |
-
4. Install dependencies using PDM:
|
37 |
```bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
pdm install
|
39 |
```
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
## Usage
|
43 |
|
44 |
-
###
|
45 |
|
46 |
-
|
47 |
```bash
|
48 |
-
|
|
|
49 |
```
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
Create a PowerShell script with the following content to start the application:
|
54 |
-
|
55 |
```powershell
|
56 |
-
#
|
57 |
-
|
58 |
-
$startInfo.Filename = "wsl.exe"
|
59 |
-
# Use -c flag to let the command use the WSL2 Ubuntu folder system and not the Windows
|
60 |
-
$startInfo.Arguments = "bash -c ~/Tools/pythagora-core/workspace/fabrics_processor/src/search_qdrant/run_streamlit.sh"
|
61 |
-
$startInfo.UseShellExecute = $false
|
62 |
-
$startInfo.RedirectStandardOutput = $true
|
63 |
-
$startInfo.RedirectStandardError = $true
|
64 |
-
$startInfo.WindowStyle = [System.Diagnostics.ProcessWindowStyle]::Hidden
|
65 |
-
$startInfo.CreateNoWindow = $true
|
66 |
-
|
67 |
-
# Start the process
|
68 |
-
try {
|
69 |
-
$process = [System.Diagnostics.Process]::Start($startInfo)
|
70 |
-
Start-Sleep -Seconds 5
|
71 |
-
|
72 |
-
# Check if Streamlit is actually running
|
73 |
-
$streamlitRunning = Test-NetConnection -ComputerName localhost -Port 8501 -WarningAction SilentlyContinue
|
74 |
-
|
75 |
-
if ($streamlitRunning.TcpTestSucceeded) {
|
76 |
-
Start-Process "msedge.exe" "--app=http://localhost:8501"
|
77 |
-
} else {
|
78 |
-
Write-Error "Failed to start Streamlit application"
|
79 |
-
}
|
80 |
-
} catch {
|
81 |
-
Write-Error "Error starting Streamlit: $_"
|
82 |
-
}
|
83 |
```
|
84 |
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
## Dependencies
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
-
|
95 |
- qdrant-client >= 1.12.1
|
96 |
- fastembed >= 0.4.2
|
97 |
-
-
|
98 |
- pyperclip >= 1.9.0
|
|
|
99 |
- regex >= 2024.11.6
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
## License
|
102 |
|
103 |
This project is licensed under the MIT License.
|
|
|
6 |
---
|
7 |
# Fabric to Espanso Converter
|
8 |
|
9 |
+
A Python application that bridges Fabric prompts with Espanso and the Obsidian TextGenerator plugin by managing and converting prompts through a vector database. It enables semantic search and efficient management of prompts while providing a modern web interface for easy interaction.
|
10 |
+
There's also a separate Gradio app that can be hosted on Hugging Face Spaces to provide a query-only interface.
|
11 |
|
12 |
## Features
|
13 |
|
14 |
+
- **Vector Database Integration**: Store and manage Fabric prompts in a Qdrant vector database with semantic search capabilities
|
15 |
+
- **Automated Conversion**: Convert stored prompts into Espanso YAML format for system-wide usage
|
16 |
+
- **Change Detection**: Automatically detect and process changes in the Fabric patterns folder
|
17 |
+
- **Web Interface**: Modern Gradio-based interface for easy prompt searching and management
|
18 |
+
- **Semantic Search**: Find relevant prompts based on their meaning, not just exact matches
|
19 |
+
- **Clipboard Integration**: Quick copying of prompts directly to clipboard
|
20 |
+
- **Logging System**: Comprehensive logging for tracking operations and debugging
|
21 |
|
22 |
## Prerequisites
|
23 |
|
24 |
+
- Python 3.11 or higher
|
25 |
+
- Fabric (https://github.com/danielmiessler/fabric)
|
26 |
+
- Qdrant vector database (local or cloud instance)
|
27 |
+
- Obsidian with TextGenerator plugin (https://github.com/obsidianmd/obsidian-textgenerator)
|
28 |
+
- Linux/WSL2 or Windows with WSL2
|
29 |
|
30 |
## Installation
|
31 |
|
32 |
+
1. **Environment Setup**:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
```bash
|
34 |
+
# Clone the repository
|
35 |
+
git clone [repository-url]
|
36 |
+
cd fabric_to_espanso
|
37 |
+
|
38 |
+
# Install PDM if not already installed
|
39 |
+
pip install pdm
|
40 |
+
|
41 |
+
# Install dependencies
|
42 |
pdm install
|
43 |
```
|
44 |
+
|
45 |
+
2. **Configuration**:
|
46 |
+
- Copy `.env.example` to `.env`
|
47 |
+
- Set your Qdrant API key in `.env`:
|
48 |
+
```
|
49 |
+
QDRANT_API_KEY=your_api_key_here
|
50 |
+
```
|
51 |
+
|
52 |
+
3. **Obsidian Setup**:
|
53 |
+
- Install Obsidian and the TextGenerator plugin
|
54 |
+
- Create the folder structure:
|
55 |
+
```
|
56 |
+
Extra/
|
57 |
+
└── FabricPatterns/
|
58 |
+
    ├── Official/ # Official Fabric patterns
|
59 |
+
    └── Own/ # Custom patterns
|
60 |
+
```
|
61 |
+
|
62 |
+
4. **Fabric Setup**:
|
63 |
+
- Install Fabric, see https://github.com/danielmiessler/fabric
|
64 |
+
|
65 |
+
5. **Qdrant Setup**:
|
66 |
+
- Install Qdrant, see https://qdrant.io/en/
|
67 |
+
- Start Qdrant server
|
68 |
+
|
69 |
+
6. **Parameters**:
|
70 |
+
- Set all the parameters in the file `parameters.py`.
|
71 |
+
|
72 |
+
7. **Optional**:
|
73 |
+
- Create a PowerShell script to run the Streamlit app
|
74 |
+
|
75 |
|
76 |
## Usage
|
77 |
|
78 |
+
### Starting the Application
|
79 |
|
80 |
+
#### Linux/WSL2
|
81 |
```bash
|
82 |
+
# Start the Gradio interface
|
83 |
+
python gradio_app_query_only.py
|
84 |
```
|
85 |
|
86 |
+
#### Windows (with WSL2)
|
|
|
|
|
|
|
87 |
```powershell
|
88 |
+
# Use the provided PowerShell script
|
89 |
+
./start_app.ps1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
```
|
91 |
|
92 |
+
### Core Operations
|
93 |
+
|
94 |
+
1. **Search Prompts**:
|
95 |
+
- Enter your search query in the search box
|
96 |
+
- Results are ranked by semantic similarity
|
97 |
+
- Click on a result to view its contents
|
98 |
+
|
99 |
+
2. **Copy Prompts**:
|
100 |
+
- Select a prompt from the results
|
101 |
+
- Click "Copy to Clipboard" to copy the prompt text
|
102 |
+
|
103 |
+
3. **Update Database**:
|
104 |
+
- Run `python main.py` to process changes in the Fabric patterns folder
|
105 |
+
- New and modified prompts are automatically added to the database
|
106 |
+
- Deleted prompts are removed from the database
|
107 |
+
|
108 |
+
## Project Structure
|
109 |
+
|
110 |
+
```
|
111 |
+
fabric_to_espanso/
|
112 |
+
├── src/
|
113 |
+
│   ├── fabrics_processor/ # Core processing logic
|
114 |
+
│   └── search_qdrant/ # Search functionality
|
115 |
+
├── gradio_app_query_only.py # Web interface
|
116 |
+
├── main.py # CLI entry point
|
117 |
+
└── parameters.py # Configuration parameters
|
118 |
+
```
|
119 |
|
120 |
## Dependencies
|
121 |
|
122 |
+
Core dependencies are managed through PDM:
|
123 |
+
|
124 |
+
- gradio >= 5.12.0
|
125 |
- qdrant-client >= 1.12.1
|
126 |
- fastembed >= 0.4.2
|
127 |
+
- python-dotenv
|
128 |
- pyperclip >= 1.9.0
|
129 |
+
- pyyaml >= 6.0.2
|
130 |
- regex >= 2024.11.6
|
131 |
|
132 |
+
## TODO
|
133 |
+
|
134 |
+
The following items need to be addressed to improve code quality, maintainability, and functionality:
|
135 |
+
|
136 |
+
### Database Optimization
|
137 |
+
- Check the database for any points with exactly the same vector or nearly the same. Remove those to reduce redundancy and improve search efficiency.
|
138 |
+
|
139 |
+
### Metadata Enhancement
|
140 |
+
- If available, use the readme.md file from the fabrics folder to fill the "purpose" field in the database entries.
|
141 |
+
- If readme.md is not available in the fabrics folder, create the "purpose" field from an LLM response that summarizes the goal of the fabric file.
|
142 |
+
|
143 |
+
### UI/UX Improvements
|
144 |
+
- Add a compare interface to the gradio app to allow side-by-side comparison of prompts.
|
145 |
+
- Remove the streamlit_only_query app as it's being replaced by the gradio interface.
|
146 |
+
|
147 |
+
### Code Refactoring
|
148 |
+
- Implement proper error handling for database operations.
|
149 |
+
- Add comprehensive logging throughout the application.
|
150 |
+
- Create unit tests for core functionality.
|
151 |
+
- Implement type hints consistently across all Python files.
|
152 |
+
- Add input validation for all user-provided data.
|
153 |
+
- Refactor the database operations into a dedicated class.
|
154 |
+
- Implement connection pooling for better database performance.
|
155 |
+
- Add docstrings to all functions and classes.
|
156 |
+
- Create a configuration class to handle all settings.
|
157 |
+
- Add proper cleanup of resources in error cases.
|
158 |
+
|
159 |
+
### Documentation
|
160 |
+
- Add API documentation for all public interfaces.
|
161 |
+
- Include examples for common use cases.
|
162 |
+
- Document the database schema and vector space organization.
|
163 |
+
- Add contribution guidelines.
|
164 |
+
- Include troubleshooting section.
|
165 |
+
|
166 |
+
### Security
|
167 |
+
- Implement proper environment variable handling.
|
168 |
+
- Add input sanitization for all user inputs.
|
169 |
+
- Implement rate limiting for the web interface.
|
170 |
+
- Add proper authentication for the web interface.
|
171 |
+
|
172 |
+
### Performance
|
173 |
+
- Implement caching for frequently accessed prompts.
|
174 |
+
- Optimize vector similarity search parameters.
|
175 |
+
- Add batch processing for large-scale operations.
|
176 |
+
|
177 |
## License
|
178 |
|
179 |
This project is licensed under the MIT License.
|
data/Fab2Esp_transparent.png
CHANGED
![]() |
![]() |
Git LFS Details
|
parameters.py
CHANGED
@@ -47,7 +47,7 @@ BASE_WORDS = ['Identity', 'Purpose', 'Task', 'Goal']
|
|
47 |
# COLLECTION_NAME = "fabric_patterns"
|
48 |
# Cloud:
|
49 |
QDRANT_URL = "https://91ed3a93-6135-4951-a624-1c8c2878240d.europe-west3-0.gcp.cloud.qdrant.io:6333"
|
50 |
-
COLLECTION_NAME = "
|
51 |
|
52 |
# Required fields for database points
|
53 |
# TODO: default trigger wordt nu twee keer gedefinieerd, oplossen
|
@@ -61,4 +61,5 @@ REQUIRED_FIELDS_DEFAULTS = {
|
|
61 |
|
62 |
# Embedding Model parameters voor Qdrant
|
63 |
USE_FASTEMBED = True
|
64 |
-
|
|
|
|
47 |
# COLLECTION_NAME = "fabric_patterns"
|
48 |
# Cloud:
|
49 |
QDRANT_URL = "https://91ed3a93-6135-4951-a624-1c8c2878240d.europe-west3-0.gcp.cloud.qdrant.io:6333"
|
50 |
+
COLLECTION_NAME = "fabric_patterns_hybrid"
|
51 |
|
52 |
# Required fields for database points
|
53 |
# TODO: default trigger wordt nu twee keer gedefinieerd, oplossen
|
|
|
61 |
|
62 |
# Embedding Model parameters voor Qdrant
|
63 |
USE_FASTEMBED = True
|
64 |
+
EMBED_MODEL_DENSE = 'BAAI/bge-base-en' # "fast-bge-small-en"
|
65 |
+
EMBED_MODEL_SPARSE = "prithivida/Splade_PP_en_v1"
|
src/fabrics_processor/config.py
CHANGED
@@ -16,7 +16,8 @@ from parameters import (
|
|
16 |
BASE_WORDS,
|
17 |
QDRANT_URL,
|
18 |
USE_FASTEMBED,
|
19 |
-
|
|
|
20 |
COLLECTION_NAME,
|
21 |
REQUIRED_FIELDS,
|
22 |
REQUIRED_FIELDS_DEFAULTS
|
@@ -61,22 +62,22 @@ class DatabaseConfig:
|
|
61 |
raise ConfigurationError(str(e))
|
62 |
|
63 |
@dataclass
|
64 |
-
class
|
65 |
"""Embedding model configuration."""
|
66 |
use_fastembed: bool = USE_FASTEMBED
|
67 |
-
model_name: str = EMBED_MODEL
|
68 |
collection_name: str = COLLECTION_NAME
|
69 |
-
|
|
|
70 |
|
71 |
def validate(self) -> None:
|
72 |
"""Validate the embedding configuration."""
|
73 |
-
if not self.
|
74 |
from .exceptions import ConfigurationError
|
75 |
-
raise ConfigurationError("Embedding model name cannot be empty")
|
76 |
|
77 |
-
if self.
|
78 |
from .exceptions import ConfigurationError
|
79 |
-
raise ConfigurationError(
|
80 |
|
81 |
class Config:
|
82 |
"""Global configuration singleton."""
|
@@ -86,7 +87,7 @@ class Config:
|
|
86 |
if cls._instance is None:
|
87 |
cls._instance = super().__new__(cls)
|
88 |
cls._instance.database = DatabaseConfig()
|
89 |
-
cls._instance.embedding =
|
90 |
cls._instance.espanso_trigger = DEFAULT_TRIGGER
|
91 |
cls._instance.fabric_patterns_folder = FABRIC_PATTERNS_FOLDER
|
92 |
cls._instance.yaml_output_folder = YAML_OUTPUT_FOLDER
|
|
|
16 |
BASE_WORDS,
|
17 |
QDRANT_URL,
|
18 |
USE_FASTEMBED,
|
19 |
+
EMBED_MODEL_DENSE,
|
20 |
+
EMBED_MODEL_SPARSE,
|
21 |
COLLECTION_NAME,
|
22 |
REQUIRED_FIELDS,
|
23 |
REQUIRED_FIELDS_DEFAULTS
|
|
|
62 |
raise ConfigurationError(str(e))
|
63 |
|
64 |
@dataclass
|
65 |
+
class EmbeddingModelConfig:
|
66 |
"""Embedding model configuration."""
|
67 |
use_fastembed: bool = USE_FASTEMBED
|
|
|
68 |
collection_name: str = COLLECTION_NAME
|
69 |
+
dense_model_name: str = EMBED_MODEL_DENSE
|
70 |
+
sparse_model_name: str = EMBED_MODEL_SPARSE
|
71 |
|
72 |
def validate(self) -> None:
|
73 |
"""Validate the embedding configuration."""
|
74 |
+
if not self.dense_model_name:
|
75 |
from .exceptions import ConfigurationError
|
76 |
+
raise ConfigurationError("Dense Embedding model name cannot be empty")
|
77 |
|
78 |
+
if not self.sparse_model_name:
|
79 |
from .exceptions import ConfigurationError
|
80 |
+
raise ConfigurationError("Sparse Embedding model name cannot be empty")
|
81 |
|
82 |
class Config:
|
83 |
"""Global configuration singleton."""
|
|
|
87 |
if cls._instance is None:
|
88 |
cls._instance = super().__new__(cls)
|
89 |
cls._instance.database = DatabaseConfig()
|
90 |
+
cls._instance.embedding = EmbeddingModelConfig()
|
91 |
cls._instance.espanso_trigger = DEFAULT_TRIGGER
|
92 |
cls._instance.fabric_patterns_folder = FABRIC_PATTERNS_FOLDER
|
93 |
cls._instance.yaml_output_folder = YAML_OUTPUT_FOLDER
|
src/fabrics_processor/database.py
CHANGED
@@ -52,7 +52,8 @@ def initialize_qdrant_database(
|
|
52 |
api_key: Optional[str] = "",
|
53 |
collection_name: str = config.embedding.collection_name,
|
54 |
use_fastembed: bool = config.embedding.use_fastembed,
|
55 |
-
|
|
|
56 |
) -> QdrantClient:
|
57 |
"""Initialize the Qdrant database for storing markdown file information.
|
58 |
|
@@ -75,6 +76,9 @@ def initialize_qdrant_database(
|
|
75 |
|
76 |
# Create database connection
|
77 |
client = create_database_connection(url=url, api_key=api_key)
|
|
|
|
|
|
|
78 |
|
79 |
# Check if collection exists
|
80 |
collections = client.get_collections()
|
@@ -85,19 +89,17 @@ def initialize_qdrant_database(
|
|
85 |
|
86 |
# Create collection with appropriate vector configuration
|
87 |
if use_fastembed:
|
88 |
-
|
|
|
89 |
else:
|
90 |
-
|
91 |
-
|
92 |
-
size=config.embedding.vector_size,
|
93 |
-
distance=Distance.COSINE
|
94 |
-
)
|
95 |
-
}
|
96 |
|
97 |
try:
|
98 |
client.create_collection(
|
99 |
collection_name=collection_name,
|
100 |
-
vectors_config=
|
|
|
101 |
on_disk_payload=True
|
102 |
)
|
103 |
except exceptions.UnexpectedResponse as e:
|
|
|
52 |
api_key: Optional[str] = "",
|
53 |
collection_name: str = config.embedding.collection_name,
|
54 |
use_fastembed: bool = config.embedding.use_fastembed,
|
55 |
+
dense_model: str = config.embedding.dense_model_name,
|
56 |
+
sparse_model: str = config.embedding.sparse_model_name
|
57 |
) -> QdrantClient:
|
58 |
"""Initialize the Qdrant database for storing markdown file information.
|
59 |
|
|
|
76 |
|
77 |
# Create database connection
|
78 |
client = create_database_connection(url=url, api_key=api_key)
|
79 |
+
|
80 |
+
client.set_model(dense_model)
|
81 |
+
client.set_sparse_model(sparse_model)
|
82 |
|
83 |
# Check if collection exists
|
84 |
collections = client.get_collections()
|
|
|
89 |
|
90 |
# Create collection with appropriate vector configuration
|
91 |
if use_fastembed:
|
92 |
+
vectors_config = client.get_fastembed_vector_params()
|
93 |
+
sparse_vectors_config = client.get_fastembed_sparse_vector_params()
|
94 |
else:
|
95 |
+
print("Creating database without Fastembed not implemented yet.")
|
96 |
+
raise NotImplementedError()
|
|
|
|
|
|
|
|
|
97 |
|
98 |
try:
|
99 |
client.create_collection(
|
100 |
collection_name=collection_name,
|
101 |
+
vectors_config=vectors_config,
|
102 |
+
sparse_vectors_config=sparse_vectors_config,
|
103 |
on_disk_payload=True
|
104 |
)
|
105 |
except exceptions.UnexpectedResponse as e:
|
src/fabrics_processor/database_updater.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from typing import Optional
|
2 |
from qdrant_client import QdrantClient
|
3 |
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, MatchValue, PointIdsList
|
4 |
-
from fastembed import TextEmbedding
|
5 |
import logging
|
6 |
import uuid
|
7 |
from .output_files_generator import generate_yaml_file, generate_markdown_files
|
@@ -11,7 +11,7 @@ from .database import validate_point_payload
|
|
11 |
|
12 |
logger = logging.getLogger('fabric_to_espanso')
|
13 |
|
14 |
-
def get_embedding(text: str
|
15 |
"""
|
16 |
Generate embedding vector for the given text using FastEmbed.
|
17 |
|
@@ -19,10 +19,25 @@ def get_embedding(text: str, embedding_model: TextEmbedding) -> list:
|
|
19 |
text (str): Text to generate embedding for
|
20 |
|
21 |
Returns:
|
22 |
-
list:
|
23 |
"""
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
def update_qdrant_database(client: QdrantClient, collection_name: str, new_files: list, modified_files: list, deleted_files: list):
|
28 |
"""
|
@@ -34,16 +49,10 @@ def update_qdrant_database(client: QdrantClient, collection_name: str, new_files
|
|
34 |
modified_files (list): List of modified files to be updated in the database.
|
35 |
deleted_files (list): List of deleted files to be removed from the database.
|
36 |
"""
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
logger.info(f"Initializing FastEmbed model.")
|
42 |
-
embedding_model = TextEmbedding()
|
43 |
-
else:
|
44 |
-
logger.info(f"Initializing embbeding model: {config.model_name}")
|
45 |
-
# TODO: testen. Weet niet of dit werkt.
|
46 |
-
embedding_model = TextEmbedding(model_name=config.model_name)
|
47 |
|
48 |
try:
|
49 |
# Add new files
|
@@ -52,9 +61,10 @@ def update_qdrant_database(client: QdrantClient, collection_name: str, new_files
|
|
52 |
payload_new = validate_point_payload(file)
|
53 |
point = PointStruct(
|
54 |
id=str(uuid.uuid4()), # Generate a new UUID for each point
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
58 |
payload={
|
59 |
"filename": payload_new['filename'],
|
60 |
"content": payload_new['content'],
|
@@ -87,15 +97,10 @@ def update_qdrant_database(client: QdrantClient, collection_name: str, new_files
|
|
87 |
# Update the existing point with the new file data
|
88 |
point = PointStruct(
|
89 |
id=point_id,
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
# Zie https://github.com/qdrant/qdrant-client/discussions/598
|
95 |
-
# De naam die fastembed gebruikt is afhankelijk van het model dat je gebruikt.
|
96 |
-
# Je kunt de naam vinden door: client.get_vector_field_name()
|
97 |
-
vector={'fast-bge-small-en':
|
98 |
-
get_embedding(file['purpose'], embedding_model)}, # Generate vector from purpose field
|
99 |
payload={
|
100 |
"filename": payload_current['filename'],
|
101 |
"content": file['content'],
|
|
|
1 |
from typing import Optional
|
2 |
from qdrant_client import QdrantClient
|
3 |
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, MatchValue, PointIdsList
|
4 |
+
from fastembed import TextEmbedding, SparseTextEmbedding
|
5 |
import logging
|
6 |
import uuid
|
7 |
from .output_files_generator import generate_yaml_file, generate_markdown_files
|
|
|
11 |
|
12 |
logger = logging.getLogger('fabric_to_espanso')
|
13 |
|
14 |
+
def get_embedding(text: str) -> list:
|
15 |
"""
|
16 |
Generate embedding vector for the given text using FastEmbed.
|
17 |
|
|
|
19 |
text (str): Text to generate embedding for
|
20 |
|
21 |
Returns:
|
22 |
+
list: Tuple of (dense_embeddings, sparse_embeddings)
|
23 |
"""
|
24 |
+
if not config.embedding.use_fastembed:
|
25 |
+
msg = "Embedding model not initialized. Set use_fastembed to True in the configuration."
|
26 |
+
logger.error(msg)
|
27 |
+
raise ConfigurationError(msg)
|
28 |
+
|
29 |
+
# Models are lazily initialized only when needed
|
30 |
+
if not hasattr(get_embedding, '_dense_model'):
|
31 |
+
get_embedding._dense_model = TextEmbedding(model_name=config.embedding.dense_model_name)
|
32 |
+
if not hasattr(get_embedding, '_sparse_model'):
|
33 |
+
get_embedding._sparse_model = SparseTextEmbedding(model_name=config.embedding.sparse_model_name)
|
34 |
+
|
35 |
+
dense_embeddings = list(get_embedding._dense_model.embed(text))[0]
|
36 |
+
sparse_embedding = list(get_embedding._sparse_model.embed(text, return_dense=False))[0]
|
37 |
+
return dense_embeddings, {
|
38 |
+
'indices': sparse_embedding.indices.tolist(),
|
39 |
+
'values': sparse_embedding.values.tolist()
|
40 |
+
}
|
41 |
|
42 |
def update_qdrant_database(client: QdrantClient, collection_name: str, new_files: list, modified_files: list, deleted_files: list):
|
43 |
"""
|
|
|
49 |
modified_files (list): List of modified files to be updated in the database.
|
50 |
deleted_files (list): List of deleted files to be removed from the database.
|
51 |
"""
|
52 |
+
if not config.embedding.use_fastembed:
|
53 |
+
msg = "Embedding model not initialized. Set use_fastembed to True in the configuration."
|
54 |
+
logger.info(msg)
|
55 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
try:
|
58 |
# Add new files
|
|
|
61 |
payload_new = validate_point_payload(file)
|
62 |
point = PointStruct(
|
63 |
id=str(uuid.uuid4()), # Generate a new UUID for each point
|
64 |
+
vector={
|
65 |
+
'fast-bge-base-en': get_embedding(payload_new['purpose'])[0],
|
66 |
+
'fast-sparse-splade_pp_en_v1': get_embedding(payload_new['purpose'])[1]
|
67 |
+
},
|
68 |
payload={
|
69 |
"filename": payload_new['filename'],
|
70 |
"content": payload_new['content'],
|
|
|
97 |
# Update the existing point with the new file data
|
98 |
point = PointStruct(
|
99 |
id=point_id,
|
100 |
+
vector={
|
101 |
+
'fast-bge-base-en': get_embedding(payload_current['purpose'])[0],
|
102 |
+
'fast-sparse-splade_pp_en_v1': get_embedding(payload_current['purpose'])[1]
|
103 |
+
},
|
|
|
|
|
|
|
|
|
|
|
104 |
payload={
|
105 |
"filename": payload_current['filename'],
|
106 |
"content": file['content'],
|
src/search_qdrant/streamlit_app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
import pyperclip
|
3 |
from pathlib import Path
|
4 |
from src.fabrics_processor.database import initialize_qdrant_database
|
@@ -155,15 +156,14 @@ def update_database():
|
|
155 |
fabric_patterns_folder=config.fabric_patterns_folder
|
156 |
)
|
157 |
|
158 |
-
# Update the database
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
)
|
167 |
|
168 |
# Get updated collection info
|
169 |
collection_info = st.session_state.client.get_collection(config.embedding.collection_name)
|
|
|
1 |
import streamlit as st
|
2 |
+
import os
|
3 |
import pyperclip
|
4 |
from pathlib import Path
|
5 |
from src.fabrics_processor.database import initialize_qdrant_database
|
|
|
156 |
fabric_patterns_folder=config.fabric_patterns_folder
|
157 |
)
|
158 |
|
159 |
+
# Update the database
|
160 |
+
update_qdrant_database(
|
161 |
+
client=st.session_state.client,
|
162 |
+
collection_name=config.embedding.collection_name,
|
163 |
+
new_files=new_files,
|
164 |
+
modified_files=modified_files,
|
165 |
+
deleted_files=deleted_files
|
166 |
+
)
|
|
|
167 |
|
168 |
# Get updated collection info
|
169 |
collection_info = st.session_state.client.get_collection(config.embedding.collection_name)
|