from typing import Any, Dict, List, Optional, Tuple
from colorama import Fore, Style
from ..scraper import Scraper
from ..config.config import Config
from ..utils.logger import get_formatted_logger

logger = get_formatted_logger()

def scrape_urls(
    urls: List[str], cfg: Optional[Config] = None
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Scrape the given URLs and collect any image URLs found in the results.

    Args:
        urls (List[str]): List of URLs to scrape.
        cfg (Optional[Config]): Configuration object; defaults are used when omitted.

    Returns:
        Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: Tuple containing the
        scraped content and the image URLs extracted from it.
    """
    scraped_data = []
    images = []
    user_agent = (
        cfg.user_agent
        if cfg
        else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    )
    # Fall back to the "bs" scraper when no config is provided (assumed default),
    # so a missing cfg no longer raises an AttributeError inside the try block.
    scraper_type = cfg.scraper if cfg else "bs"

    try:
        scraper = Scraper(urls, user_agent, scraper_type)
        scraped_data = scraper.run()
        for item in scraped_data:
            if 'image_urls' in item:
                images.extend(item['image_urls'])
    except Exception as e:
        print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")

    return scraped_data, images

async def filter_urls(urls: List[str], config: Config) -> List[str]:
    """

    Filter URLs based on configuration settings.



    Args:

        urls (List[str]): List of URLs to filter.

        config (Config): Configuration object.



    Returns:

        List[str]: Filtered list of URLs.

    """
    filtered_urls = []
    for url in urls:
        # Exclude any URL that contains one of the configured excluded domains
        # (simple substring match); see the illustrative sketch after this function.
        if not any(excluded in url for excluded in config.excluded_domains):
            filtered_urls.append(url)
    return filtered_urls
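
# A minimal illustration of the substring-based exclusion above (a sketch; it is
# not called anywhere in this module). SimpleNamespace stands in for Config purely
# for the example, on the assumption that Config exposes an `excluded_domains`
# iterable as used in filter_urls.
async def _example_filter_urls() -> List[str]:
    from types import SimpleNamespace

    fake_cfg = SimpleNamespace(excluded_domains=["tracker.example"])
    urls = ["https://news.example/article", "https://tracker.example/pixel"]
    # Expected result: ["https://news.example/article"]
    return await filter_urls(urls, fake_cfg)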

async def extract_main_content(html_content: str) -> str:
    """

    Extract the main content from HTML.



    Args:

        html_content (str): Raw HTML content.



    Returns:

        str: Extracted main content.

    """
    # Content extraction is not implemented yet; it could use a library such as
    # BeautifulSoup or custom parsing logic (see the hedged sketch after this
    # function). For now, return the raw HTML as a placeholder.
    return html_content
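
# A minimal sketch of what the extraction step could look like, assuming the
# optional beautifulsoup4 dependency is available. It is not wired into
# extract_main_content; it only illustrates one possible implementation.
def _extract_main_content_bs4_sketch(html_content: str) -> str:
    from bs4 import BeautifulSoup  # assumed optional dependency

    soup = BeautifulSoup(html_content, "html.parser")
    # Drop elements that rarely carry main content.
    for tag in soup(["script", "style", "nav", "header", "footer"]):
        tag.decompose()
    # Collapse the remaining document into newline-separated text.
    return soup.get_text(separator="\n", strip=True)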

async def process_scraped_data(scraped_data: List[Dict[str, Any]], config: Config) -> List[Dict[str, Any]]:
    """

    Process the scraped data to extract and clean the main content.



    Args:

        scraped_data (List[Dict[str, Any]]): List of dictionaries containing scraped data.

        config (Config): Configuration object.



    Returns:

        List[Dict[str, Any]]: Processed scraped data.

    """
    processed_data = []
    for item in scraped_data:
        if item['status'] == 'success':
            main_content = await extract_main_content(item['content'])
            processed_data.append({
                'url': item['url'],
                'content': main_content,
                'status': 'success'
            })
        else:
            processed_data.append(item)
    return processed_data
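
# Illustrative end-to-end usage (a minimal sketch, not invoked by this module).
# Assumptions: Config() can be constructed with its defaults, and the caller
# drives the async helpers from an asyncio event loop (e.g. via asyncio.run).
async def _example_pipeline(urls: List[str]) -> List[Dict[str, Any]]:
    config = Config()  # assumed default construction
    allowed = await filter_urls(urls, config)
    scraped_data, _images = scrape_urls(allowed, config)
    return await process_scraped_data(scraped_data, config)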