leo-bourrel committed on
Commit
8d8abb6
·
1 Parent(s): 28a498b

feat: add keyword extraction

Browse files
Files changed (1) hide show
  1. sorbobotapp/keyword_extraction.py +58 -0
sorbobotapp/keyword_extraction.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from utils import str_to_list
3
+ from langchain.chat_models import ChatOpenAI
4
+ from langchain.output_parsers import NumberedListOutputParser
5
+ from langchain.prompts import ChatPromptTemplate
6
+
7
+
8
# Prompt sent to the chat model. ``{query}`` is the only template variable and
# is filled in with the text whose keywords should be extracted. The reply
# layout requested here ("N. Keyword/Phrase - Category") is what the keyword
# extractor later separates on " - ".
query_template = """
You are a bi-lingual (french and english) linguistic teacher working at a top-tier university.
We are conducting a research project that requires the extraction of keywords from chatbot queries.
Below, you will find a query. Please identify and rank the three most important keywords or phrases (n-grams) based on their relevance to the main topic of the query.
For each keyword or phrase, assign it to one of the following categories: ["University / Company", "Research domain", "Country", "Name", "Other"].
An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 for a single word, 2 for a pair of words, and so on, up to two words in length.
Please ensure not to list more than three n-grams in total.
Your expertise in linguistic analysis is crucial for the success of this project. Thank you for your contribution.

Please attach your ranked list in the following format:
1. Keyword/Phrase - Category
2. Keyword/Phrase - Category
3. Keyword/Phrase - Category

You must be concise and don't need to justify your choices.
```
{query}
```
"""

# Module-wide parser used to turn the model's numbered-list reply into a list
# of strings.
output_parser = NumberedListOutputParser()
# NOTE(review): not referenced anywhere else in the visible code; kept because
# other modules may import it.
format_instructions = output_parser.get_format_instructions()
30
+
31
+
32
class KeywordExtractor:
    """Extract keywords from a query with a chat model and filter them by category.

    The prompt asks the model for a numbered list whose items look like
    ``Keyword/Phrase - Category``. Calling an instance sends the query to the
    model, parses the numbered list and returns only the keywords whose
    category is one of the requested ones.
    """

    # Categories kept when the caller does not pass ``filter_categories``.
    # A tuple is used so the default can never be mutated between calls.
    DEFAULT_FILTER_CATEGORIES = ("Research domain",)

    def __init__(self):
        super().__init__()
        self.model = ChatOpenAI()
        self.prompt = ChatPromptTemplate.from_template(
            template=query_template,
        )

        # The numbered-list parser is intentionally not piped into the chain;
        # ``__call__`` applies it to the raw reply content instead.
        self.chain = self.prompt | self.model

    def __call__(
        self, inputs: str, filter_categories: "list[str] | None" = None
    ) -> list[str]:
        """Return the keywords of ``inputs`` that belong to the wanted categories.

        Args:
            inputs: The query text to extract keywords from.
            filter_categories: Category names to keep. When ``None`` (the
                default), only ``"Research domain"`` keywords are returned.

        Returns:
            The matching keywords, in the order the model listed them.
        """
        if filter_categories is None:
            filter_categories = self.DEFAULT_FILTER_CATEGORIES

        output = self.chain.invoke({"query": inputs})
        keywords = output_parser.parse(output.content)

        return self._filter_keywords(keywords, filter_categories)

    @staticmethod
    def _filter_keywords(keywords, filter_categories) -> list[str]:
        """Keep the keyword part of each ``keyword - category`` item whose category matches."""
        kept = []
        for item in keywords:
            # Split on the LAST " - " so a keyword that itself contains a
            # spaced dash does not break the two-way unpacking.
            keyword, separator, category = item.rpartition(" - ")
            if not separator:
                # Items without the expected separator carry no category.
                continue

            if category.strip() in filter_categories:
                kept.append(keyword.strip())

        return kept