SPO

Sleeping

App Files Files Community

SPO / metagpt /actions /research.py

XiangJinYu

add metagpt

fe5c39d verified 3 months ago

raw

history blame contribute delete

11.2 kB

	#!/usr/bin/env python

	from __future__ import annotations

	import asyncio
	from typing import Any, Callable, Optional, Union

	from pydantic import TypeAdapter, model_validator

	from metagpt.actions import Action
	from metagpt.config2 import config
	from metagpt.logs import logger
	from metagpt.tools.search_engine import SearchEngine
	from metagpt.tools.web_browser_engine import WebBrowserEngine
	from metagpt.utils.common import OutputParser
	from metagpt.utils.text import generate_prompt_chunk, reduce_message_length

	LANG_PROMPT = "Please respond in {language}."

	RESEARCH_BASE_SYSTEM = """You are an AI critical thinker research assistant. Your sole purpose is to write well \
	written, critically acclaimed, objective and structured reports on the given text."""

	RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is:\n#TOPIC#\n{topic}"

	SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic for Google search. \
	Your response must be in JSON format, for example: ["keyword1", "keyword2"]."""

	SUMMARIZE_SEARCH_PROMPT = """### Requirements
	1. The keywords related to your research topic and the search results are shown in the "Search Result Information" section.
	2. Provide up to {decomposition_nums} queries related to your research topic base on the search results.
	3. Please respond in the following JSON format: ["query1", "query2", "query3", ...].

	### Search Result Information
	{search_results}
	"""

	COLLECT_AND_RANKURLS_PROMPT = """### Topic
	{topic}
	### Query
	{query}

	### The online search results
	{results}

	### Requirements
	Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \
	based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the
	ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
	"""

	WEB_BROWSE_AND_SUMMARIZE_PROMPT = """### Requirements
	1. Utilize the text in the "Reference Information" section to respond to the question "{query}".
	2. If the question cannot be directly answered using the text, but the text is related to the research topic, please provide \
	a comprehensive summary of the text.
	3. If the text is entirely unrelated to the research topic, please reply with a simple text "Not relevant."
	4. Include all relevant factual information, numbers, statistics, etc., if available.

	### Reference Information
	{content}
	"""


	CONDUCT_RESEARCH_PROMPT = """### Reference Information
	{content}

	### Requirements
	Please provide a detailed research report in response to the following topic: "{topic}", using the information provided \
	above. The report must meet the following requirements:

	- Focus on directly addressing the chosen topic.
	- Ensure a well-structured and in-depth presentation, incorporating relevant facts and figures where available.
	- Present data and findings in an intuitive manner, utilizing feature comparative tables, if applicable.
	- The report should have a minimum word count of 2,000 and be formatted with Markdown syntax following APA style guidelines.
	- Include all source URLs in APA format at the end of the report.
	"""


	class CollectLinks(Action):
	"""Action class to collect links from a search engine."""

	name: str = "CollectLinks"
	i_context: Optional[str] = None
	desc: str = "Collect links from a search engine."
	search_func: Optional[Any] = None
	search_engine: Optional[SearchEngine] = None
	rank_func: Optional[Callable[[list[str]], None]] = None

	@model_validator(mode="after")
	def validate_engine_and_run_func(self):
	if self.search_engine is None:
	self.search_engine = SearchEngine.from_search_config(self.config.search, proxy=self.config.proxy)
	return self

	async def run(
	self,
	topic: str,
	decomposition_nums: int = 4,
	url_per_query: int = 4,
	system_text: str \| None = None,
	) -> dict[str, list[str]]:
	"""Run the action to collect links.

	Args:
	topic: The research topic.
	decomposition_nums: The number of search questions to generate.
	url_per_query: The number of URLs to collect per search question.
	system_text: The system text.

	Returns:
	A dictionary containing the search questions as keys and the collected URLs as values.
	"""
	system_text = system_text if system_text else RESEARCH_TOPIC_SYSTEM.format(topic=topic)
	keywords = await self._aask(SEARCH_TOPIC_PROMPT, [system_text])
	try:
	keywords = OutputParser.extract_struct(keywords, list)
	keywords = TypeAdapter(list[str]).validate_python(keywords)
	except Exception as e:
	logger.exception(f"fail to get keywords related to the research topic '{topic}' for {e}")
	keywords = [topic]
	results = await asyncio.gather(*(self.search_engine.run(i, as_string=False) for i in keywords))

	def gen_msg():
	while True:
	search_results = "\n".join(
	f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results)
	)
	prompt = SUMMARIZE_SEARCH_PROMPT.format(
	decomposition_nums=decomposition_nums, search_results=search_results
	)
	yield prompt
	remove = max(results, key=len)
	remove.pop()
	if len(remove) == 0:
	break

	model_name = config.llm.model
	prompt = reduce_message_length(gen_msg(), model_name, system_text, config.llm.max_token)
	logger.debug(prompt)
	queries = await self._aask(prompt, [system_text])
	try:
	queries = OutputParser.extract_struct(queries, list)
	queries = TypeAdapter(list[str]).validate_python(queries)
	except Exception as e:
	logger.exception(f"fail to break down the research question due to {e}")
	queries = keywords
	ret = {}
	for query in queries:
	ret[query] = await self._search_and_rank_urls(topic, query, url_per_query)
	return ret

	async def _search_and_rank_urls(self, topic: str, query: str, num_results: int = 4) -> list[str]:
	"""Search and rank URLs based on a query.

	Args:
	topic: The research topic.
	query: The search query.
	num_results: The number of URLs to collect.

	Returns:
	A list of ranked URLs.
	"""
	max_results = max(num_results * 2, 6)
	results = await self.search_engine.run(query, max_results=max_results, as_string=False)
	if len(results) == 0:
	return []
	_results = "\n".join(f"{i}: {j}" for i, j in zip(range(max_results), results))
	prompt = COLLECT_AND_RANKURLS_PROMPT.format(topic=topic, query=query, results=_results)
	logger.debug(prompt)
	indices = await self._aask(prompt)
	try:
	indices = OutputParser.extract_struct(indices, list)
	assert all(isinstance(i, int) for i in indices)
	except Exception as e:
	logger.exception(f"fail to rank results for {e}")
	indices = list(range(max_results))
	results = [results[i] for i in indices]
	if self.rank_func:
	results = self.rank_func(results)
	return [i["link"] for i in results[:num_results]]


	class WebBrowseAndSummarize(Action):
	"""Action class to explore the web and provide summaries of articles and webpages."""

	name: str = "WebBrowseAndSummarize"
	i_context: Optional[str] = None
	desc: str = "Explore the web and provide summaries of articles and webpages."
	browse_func: Union[Callable[[list[str]], None], None] = None
	web_browser_engine: Optional[WebBrowserEngine] = None

	@model_validator(mode="after")
	def validate_engine_and_run_func(self):
	if self.web_browser_engine is None:
	self.web_browser_engine = WebBrowserEngine.from_browser_config(
	self.config.browser,
	browse_func=self.browse_func,
	proxy=self.config.proxy,
	)
	return self

	async def run(
	self,
	url: str,
	*urls: str,
	query: str,
	system_text: str = RESEARCH_BASE_SYSTEM,
	) -> dict[str, str]:
	"""Run the action to browse the web and provide summaries.

	Args:
	url: The main URL to browse.
	urls: Additional URLs to browse.
	query: The research question.
	system_text: The system text.

	Returns:
	A dictionary containing the URLs as keys and their summaries as values.
	"""
	contents = await self.web_browser_engine.run(url, *urls)
	if not urls:
	contents = [contents]

	summaries = {}
	prompt_template = WEB_BROWSE_AND_SUMMARIZE_PROMPT.format(query=query, content="{}")
	for u, content in zip([url, *urls], contents):
	content = content.inner_text
	chunk_summaries = []
	for prompt in generate_prompt_chunk(content, prompt_template, self.llm.model, system_text, 4096):
	logger.debug(prompt)
	summary = await self._aask(prompt, [system_text])
	if summary == "Not relevant.":
	continue
	chunk_summaries.append(summary)

	if not chunk_summaries:
	summaries[u] = None
	continue

	if len(chunk_summaries) == 1:
	summaries[u] = chunk_summaries[0]
	continue

	content = "\n".join(chunk_summaries)
	prompt = WEB_BROWSE_AND_SUMMARIZE_PROMPT.format(query=query, content=content)
	summary = await self._aask(prompt, [system_text])
	summaries[u] = summary
	return summaries


	class ConductResearch(Action):
	"""Action class to conduct research and generate a research report."""

	def __init__(self, **kwargs):
	super().__init__(**kwargs)

	async def run(
	self,
	topic: str,
	content: str,
	system_text: str = RESEARCH_BASE_SYSTEM,
	) -> str:
	"""Run the action to conduct research and generate a research report.

	Args:
	topic: The research topic.
	content: The content for research.
	system_text: The system text.

	Returns:
	The generated research report.
	"""
	prompt = CONDUCT_RESEARCH_PROMPT.format(topic=topic, content=content)
	logger.debug(prompt)
	self.llm.auto_max_tokens = True
	return await self._aask(prompt, [system_text])


	def get_research_system_text(topic: str, language: str):
	"""Get the system text for conducting research.

	Args:
	topic: The research topic.
	language: The language for the system text.

	Returns:
	The system text for conducting research.
	"""
	return " ".join((RESEARCH_TOPIC_SYSTEM.format(topic=topic), LANG_PROMPT.format(language=language)))