from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup
from colorama import Fore, Style

from ..scraper import Scraper
from ..config.config import Config
from ..utils.logger import get_formatted_logger

logger = get_formatted_logger()


def scrape_urls(
    urls: List[str], cfg: Optional[Config] = None
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""
Scrapes the urls
Args:
urls: List of urls
cfg: Config (optional)
Returns:
Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: Tuple containing scraped content and images
"""
    scraped_data = []
    images = []
    user_agent = (
        cfg.user_agent
        if cfg
        else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    )
    try:
        # cfg.scraper needs the same None guard as user_agent above so a
        # missing cfg cannot raise an AttributeError; 'bs' is assumed
        # here to be the default scraper backend.
        scraper = Scraper(urls, user_agent, cfg.scraper if cfg else 'bs')
        scraped_data = scraper.run()
        for item in scraped_data:
            if 'image_urls' in item:
                images.extend(item['image_urls'])
    except Exception as e:
        print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
    return scraped_data, images
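
# Usage sketch (hypothetical URL; assumes Config() supplies user_agent
# and scraper defaults):
#
#     pages, image_urls = scrape_urls(['https://example.com/article'])
#     for page in pages:
#         print(page['url'], page['status'])
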
async def filter_urls(urls: List[str], config: Config) -> List[str]:
"""
Filter URLs based on configuration settings.
Args:
urls (List[str]): List of URLs to filter.
config (Config): Configuration object.
Returns:
List[str]: Filtered list of URLs.
"""
    filtered_urls = []
    for url in urls:
        # Keep only URLs that do not match any excluded domain; extend
        # this check with extra patterns (paths, schemes) as needed.
        if not any(excluded in url for excluded in config.excluded_domains):
            filtered_urls.append(url)
    return filtered_urls
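
# Usage sketch (assumes config.excluded_domains is an iterable of
# substrings such as 'ads.example.com'):
#
#     allowed = await filter_urls(
#         ['https://example.com', 'https://ads.example.com/x'], config
#     )
#     # -> ['https://example.com']
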
async def extract_main_content(html_content: str) -> str:
"""
Extract the main content from HTML.
Args:
html_content (str): Raw HTML content.
Returns:
str: Extracted main content.
"""
    # Minimal extraction pass: drop non-content tags and return the
    # visible text. A production version might instead use a dedicated
    # main-content library such as readability-lxml or trafilatura.
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup(['script', 'style', 'noscript']):
        tag.decompose()
    return soup.get_text(separator='\n', strip=True)
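
# Example:
#
#     text = await extract_main_content('<html><body><p>Hello</p></body></html>')
#     # -> 'Hello'
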
async def process_scraped_data(
    scraped_data: List[Dict[str, Any]], config: Config
) -> List[Dict[str, Any]]:
"""
Process the scraped data to extract and clean the main content.
Args:
scraped_data (List[Dict[str, Any]]): List of dictionaries containing scraped data.
config (Config): Configuration object.
Returns:
List[Dict[str, Any]]: Processed scraped data.
"""
    processed_data = []
    for item in scraped_data:
        if item['status'] == 'success':
            main_content = await extract_main_content(item['content'])
            processed_data.append({
                'url': item['url'],
                'content': main_content,
                'status': 'success'
            })
        else:
            processed_data.append(item)
    return processed_data
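
# End-to-end sketch (commented out because this module uses relative
# imports and must be run from inside its package). The URL and the
# bare Config() defaults are assumptions:
#
#     import asyncio
#
#     async def demo() -> None:
#         cfg = Config()
#         urls = await filter_urls(['https://example.com/article'], cfg)
#         pages, image_urls = scrape_urls(urls, cfg)
#         processed = await process_scraped_data(pages, cfg)
#         print(len(processed), 'pages,', len(image_urls), 'images')
#
#     asyncio.run(demo())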