Spaces:
Running
Running
from bs4 import BeautifulSoup | |
import os | |
from ..utils import get_relevant_images, extract_title | |
class TavilyExtract:
    """Scraper that retrieves a page's content through the Tavily SDK.

    The main text comes from ``TavilyClient.extract``. The page title and the
    image URLs are derived from the page's HTML, which is fetched separately
    through ``session`` and passed to the ``extract_title`` and
    ``get_relevant_images`` utility functions.
    """

    def __init__(self, link, session=None):
        """Store the target link and session, and build the Tavily client.

        Args:
            link: URL of the page to extract.
            session: Object used to download the page HTML via
                ``session.get(link, timeout=4)``; the response must expose
                ``content`` and ``encoding`` (presumably a
                ``requests.Session`` -- verify against the caller). When it
                is ``None``, ``scrape`` still returns the Tavily content but
                no title or images.

        Raises:
            Exception: If the ``TAVILY_API_KEY`` environment variable is not
                set (raised by ``get_api_key``).
        """
        self.link = link
        self.session = session
        # Imported here rather than at module level, so the `tavily` package
        # is only required once this scraper is actually instantiated.
        from tavily import TavilyClient
        self.tavily_client = TavilyClient(api_key=self.get_api_key())

    def get_api_key(self) -> str:
        """Return the Tavily API key read from ``TAVILY_API_KEY``.

        Returns:
            The value of the ``TAVILY_API_KEY`` environment variable.

        Raises:
            Exception: If ``TAVILY_API_KEY`` is not set.
        """
        api_key = os.environ.get("TAVILY_API_KEY")
        if api_key is None:
            raise Exception(
                "Tavily API key not found. Please set the TAVILY_API_KEY environment variable.")
        return api_key

    def scrape(self) -> tuple:
        """Extract the content, image URLs and title of ``self.link``.

        The content is the ``raw_content`` of the single result returned by
        ``self.tavily_client.extract``. Title and images are a best-effort
        addition: if the page HTML cannot be fetched or parsed (including when
        no session was supplied), the content is still returned with an empty
        image list and an empty title.

        Returns:
            A ``(content, image_urls, title)`` tuple. ``("", [], "")`` is
            returned when Tavily reports a failed result or when the Tavily
            call / response handling raises; in the latter case the error is
            printed.
        """
        try:
            response = self.tavily_client.extract(urls=self.link)
            if response.get('failed_results'):
                return "", [], ""
            # Only a single link is submitted to Tavily, so the results list
            # holds exactly one entry.
            content = response['results'][0]['raw_content']
        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""

        image_urls, title = self._extract_page_metadata()
        return content, image_urls, title

    def _extract_page_metadata(self) -> tuple:
        """Return ``(image_urls, title)`` parsed from the page's HTML.

        Falls back to ``([], "")`` when there is no session or when fetching
        or parsing the page fails, so a problem here never discards content
        that Tavily already extracted.
        """
        if self.session is None:
            return [], ""
        try:
            page = self.session.get(self.link, timeout=4)
            soup = BeautifulSoup(
                page.content, "lxml", from_encoding=page.encoding
            )
            return get_relevant_images(soup, self.link), extract_title(soup)
        except Exception as e:
            print("Error! : " + str(e))
            return [], ""