Spaces:
Sleeping
Sleeping
Commit
·
8d8abb6
1
Parent(s):
28a498b
feat: add keyword extraction
Browse files
sorbobotapp/keyword_extraction.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
from utils import str_to_list
|
3 |
+
from langchain.chat_models import ChatOpenAI
|
4 |
+
from langchain.output_parsers import NumberedListOutputParser
|
5 |
+
from langchain.prompts import ChatPromptTemplate
|
6 |
+
|
7 |
+
|
8 |
+
# Prompt sent to the chat model for keyword extraction. ``{query}`` is the
# only placeholder; KeywordExtractor fills it with the user's query. The
# requested reply format ("N. Keyword/Phrase - Category") is what
# KeywordExtractor.__call__ later splits on " - ".
# NOTE(review): the n-gram sentence says "and so on, up to two words in
# length", which reads as contradictory -- confirm the intended maximum n
# before rewording, since any change to this text changes model behaviour.
query_template = """
You are a bi-lingual (french and english) linguistic teacher working at a top-tier university.
We are conducting a research project that requires the extraction of keywords from chatbot queries.
Below, you will find a query. Please identify and rank the three most important keywords or phrases (n-grams) based on their relevance to the main topic of the query.
For each keyword or phrase, assign it to one of the following categories: ["University / Company", "Research domain", "Country", "Name", "Other"].
An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 for a single word, 2 for a pair of words, and so on, up to two words in length.
Please ensure not to list more than three n-grams in total.
Your expertise in linguistic analysis is crucial for the success of this project. Thank you for your contribution.

Please attach your ranked list in the following format:
1. Keyword/Phrase - Category
2. Keyword/Phrase - Category
3. Keyword/Phrase - Category

You must be concise and don't need to justify your choices.
```
{query}
```
"""

# Shared parser used by KeywordExtractor to turn the model's numbered-list
# reply into a list of entry strings.
output_parser = NumberedListOutputParser()
# Format-instruction text provided by the parser. It is not inserted into
# query_template in the code visible here.
format_instructions = output_parser.get_format_instructions()
|
30 |
+
|
31 |
+
|
32 |
+
class KeywordExtractor:
    """Extract category-tagged keywords from a chatbot query with a chat model.

    Instances are callable. A call formats ``query_template`` with the query,
    runs it through the prompt/model chain, parses the numbered-list reply and
    returns the keywords whose category is one of the requested categories.
    """

    # Categories kept when the caller does not pass ``filter_categories``.
    # A tuple, so the shared default cannot be mutated between calls.
    DEFAULT_FILTER_CATEGORIES = ("Research domain",)

    def __init__(self):
        """Build the chat model, the prompt and the prompt-to-model chain."""
        super().__init__()
        self.model = ChatOpenAI()
        self.prompt = ChatPromptTemplate.from_template(
            template=query_template,
        )
        # The parser is not piped into the chain; __call__ applies it to the
        # ``content`` of the chain's reply instead.
        self.output_parser = output_parser

        self.chain = self.prompt | self.model

    def __call__(
        self, inputs: str, filter_categories: "list[str] | None" = None
    ) -> list[str]:
        """Return the keywords of ``inputs`` that fall in the given categories.

        Args:
            inputs: The query text substituted for ``{query}`` in the prompt.
            filter_categories: Category labels to keep. ``None`` (the default)
                selects ``DEFAULT_FILTER_CATEGORIES``, i.e. "Research domain".

        Returns:
            The matching keywords, in the order the model listed them.
            Entries without a " - " separator are skipped.
        """
        if filter_categories is None:
            filter_categories = self.DEFAULT_FILTER_CATEGORIES

        output = self.chain.invoke({"query": inputs})
        entries = self.output_parser.parse(output.content)

        filtered_keywords = []
        for entry in entries:
            # The category is the last " - " segment, so split from the right.
            # A keyword that itself contains " - " then stays intact instead
            # of breaking the two-name unpacking.
            keyword, separator, category = entry.rpartition(" - ")
            if not separator:
                continue

            # Trim stray whitespace so "Research domain " still matches.
            if category.strip() in filter_categories:
                filtered_keywords.append(keyword.strip())

        return filtered_keywords
|