Spaces:

SorbonneUniversity
/

SorboBot

Sleeping

App Files Files Community

leo-bourrel committed on Nov 13, 2023

Commit

8d8abb6

1 Parent(s): 28a498b

feat: add keyword extraction

Browse files

Files changed (1) hide show

sorbobotapp/keyword_extraction.py +58 -0

sorbobotapp/keyword_extraction.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from typing import Any
+from utils import str_to_list
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import NumberedListOutputParser
+from langchain.prompts import ChatPromptTemplate
+query_template = """
+You are a bi-lingual (french and english) linguistic teacher working at a top-tier university.
+We are conducting a research project that requires the extraction of keywords from chatbot queries.
+Below, you will find a query. Please identify and rank the three most important keywords or phrases (n-grams) based on their relevance to the main topic of the query.
+For each keyword or phrase, assign it to one of the following categories: ["University / Company", "Research domain", "Country", "Name", "Other"].
+An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 for a single word, 2 for a pair of words, and so on, up to two words in length.
+Please ensure not to list more than three n-grams in total.
+Your expertise in linguistic analysis is crucial for the success of this project. Thank you for your contribution.
+Please attach your ranked list in the following format:
+1. Keyword/Phrase - Category
+2. Keyword/Phrase - Category
+3. Keyword/Phrase - Category
+You must be concise and don't need to justify your choices.
+```
+{query}
+```
+"""  # Prompt used by KeywordExtractor; {query} is filled in when the chain is invoked.
+output_parser = NumberedListOutputParser()  # Shared parser; KeywordExtractor.__call__ applies it to the model reply.
+format_instructions = output_parser.get_format_instructions()  # NOTE(review): not referenced in the code visible here - confirm it is used elsewhere.
+class KeywordExtractor:
+    def __init__(self):
+        super().__init__()
+        self.model = ChatOpenAI()
+        self.prompt = ChatPromptTemplate.from_template(
+            template=query_template,
+        )
+        self.chain = self.prompt | self.model  # | output_parser
+    def __call__(
+        self, inputs: str, filter_categories: list[str] = ["Research domain"]
+    ) -> Any:
+        output = self.chain.invoke({"query": inputs})
+        keywords = output_parser.parse(output.content)
+        filtered_keywords = []
+        for keyword in keywords:
+            if " - " not in keyword:
+                continue
+            keyword, category = keyword.split(" - ", maxsplit=2)
+            if category in filter_categories:
+                filtered_keywords.append(keyword)
+        return filtered_keywords