senatornorth committed on
Commit 4040a7b · 1 Parent(s): 33549e1

Add application file

Files changed (3)
  1. Dockerfile +136 -0
  2. app.py +54 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,136 @@
+ # syntax=docker/dockerfile:1.4
+
+ ARG TARGETPLATFORM
+ ARG BUILDPLATFORM
+
+ # Other build arguments
+ ARG PYTHON_VERSION=3.10
+
+ # Base stage with system dependencies
+ FROM python:${PYTHON_VERSION}-slim as base
+
+ # Declare ARG variables again within the build stage
+ ARG INSTALL_TYPE=all
+ ARG ENABLE_GPU=false
+
+ # Platform-specific labels
+ LABEL maintainer="unclecode"
+ LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
+ LABEL version="1.0"
+
+ # Environment setup
+ ENV PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PIP_DISABLE_PIP_VERSION_CHECK=1 \
+     PIP_DEFAULT_TIMEOUT=100 \
+     DEBIAN_FRONTEND=noninteractive
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     curl \
+     wget \
+     gnupg \
+     git \
+     cmake \
+     pkg-config \
+     python3-dev \
+     libjpeg-dev \
+     libpng-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Playwright system dependencies for Linux
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libglib2.0-0 \
+     libnss3 \
+     libnspr4 \
+     libatk1.0-0 \
+     libatk-bridge2.0-0 \
+     libcups2 \
+     libdrm2 \
+     libdbus-1-3 \
+     libxcb1 \
+     libxkbcommon0 \
+     libx11-6 \
+     libxcomposite1 \
+     libxdamage1 \
+     libxext6 \
+     libxfixes3 \
+     libxrandr2 \
+     libgbm1 \
+     libpango-1.0-0 \
+     libcairo2 \
+     libasound2 \
+     libatspi2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # GPU support if enabled and architecture is supported
+ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
+         apt-get update && apt-get install -y --no-install-recommends \
+         nvidia-cuda-toolkit \
+         && rm -rf /var/lib/apt/lists/* ; \
+     else \
+         echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
+     fi
+
+ # Create and set working directory
+ WORKDIR /app
+
+ # Copy the entire project
+ COPY . .
+
+ # Install base requirements
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install required libraries for FastAPI
+ RUN pip install fastapi uvicorn psutil
+
+ # Install ML dependencies first for better layer caching
+ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
+         pip install --no-cache-dir \
+         torch \
+         torchvision \
+         torchaudio \
+         scikit-learn \
+         nltk \
+         transformers \
+         tokenizers && \
+         python -m nltk.downloader punkt stopwords ; \
+     fi
+
+ # Install the package
+ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
+         pip install ".[all]" && \
+         python -m crawl4ai.model_loader ; \
+     elif [ "$INSTALL_TYPE" = "torch" ] ; then \
+         pip install ".[torch]" ; \
+     elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
+         pip install ".[transformer]" && \
+         python -m crawl4ai.model_loader ; \
+     else \
+         pip install "." ; \
+     fi
+
+ # Install MkDocs and required plugins
+ RUN pip install --no-cache-dir \
+     mkdocs \
+     mkdocs-material \
+     mkdocs-terminal \
+     pymdown-extensions
+
+ # Build MkDocs documentation
+ RUN mkdocs build
+
+ # Install Playwright and browsers
+ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+         playwright install chromium; \
+     elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+         playwright install chromium; \
+     fi
+
+ # Expose ports
+ EXPOSE 8000 11235 9222 8080
+
+ # Start the FastAPI server
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
app.py ADDED
@@ -0,0 +1,54 @@
+ import modal
+
+ # Build a custom container image with the necessary dependencies, which we will pass to our functions
+ crawler = modal.Image.debian_slim(python_version="3.10").pip_install_from_requirements("requirements.txt").run_commands(
+     "apt-get update",
+     "apt-get install -y software-properties-common",
+     "apt-add-repository non-free",
+     "apt-add-repository contrib",
+     "playwright install-deps chromium",
+     "playwright install chromium",
+     "playwright install",
+ )
+
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+ import playwright
+ from typing import Optional, Union, List
+ from pydantic import BaseModel, Field
+ from fastapi import Header, HTTPException
+ from jwt import decode, PyJWTError
+ import os
+
+ app = modal.App("crawler")
+
+ class CrawlRequest(BaseModel):
+     url: str
+     bypass_cache: bool = Field(default=False)
+     # other kwargs
+
+ # Define the function that will be executed in the container
+ @app.function(image=crawler)
+ @modal.web_endpoint(method="POST", docs=True)
+ async def crawl(request: CrawlRequest, authorization: str = Header(...)):
+     # You will want to have your own authorization strategy here to protect your endpoint
+     print(f"Crawling URL: {request}")
+     # Create an instance of AsyncWebCrawler
+     async with AsyncWebCrawler(verbose=True) as crawler:
+         # Run the crawler on the given URL
+         crawl_kwargs = request.dict(exclude_unset=True)
+         try:
+             result = await crawler.arun(**crawl_kwargs)
+             print(result)
+             return result
+         except Exception as e:
+             error_message = f"Error during crawling: {str(e)}"
+             print(error_message)
+             return {"error": error_message}
+
+ # Entrypoint that will be used to trigger the crawler when testing locally
+ @app.local_entrypoint()
+ async def main(url: str):
+     result = crawl.remote(CrawlRequest(url=url))
+     print(result)
+     return result
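
For reference, a minimal sketch of driving this Modal app from a local machine (assumes the modal CLI is installed and authenticated; the example URL is illustrative):

# Illustrative only: invoke the local_entrypoint, which calls crawl.remote()
modal run app.py --url https://example.com
# Or deploy the app so the POST web endpoint gets a persistent URL
modal deploy app.py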
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ crawl4ai
+ asyncio
+ playwright
+ fastapi[standard]
+ pydantic
+ PyJWT
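
For reference, a minimal local-setup sketch (an assumption about the workflow: the modal client used by app.py is not listed above, so it is installed separately here):

# Illustrative only
pip install -r requirements.txt modal
playwright install chromium   # only needed when crawling locally, outside the Modal image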