senatornorth committed on
Commit 4040a7b · 1 Parent(s): 33549e1

Add application file

Files changed (3)
  1. Dockerfile +136 -0
  2. app.py +54 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,136 @@
+ # syntax=docker/dockerfile:1.4
+
+ ARG TARGETPLATFORM
+ ARG BUILDPLATFORM
+
+ # Other build arguments
+ ARG PYTHON_VERSION=3.10
+
+ # Base stage with system dependencies
+ FROM python:${PYTHON_VERSION}-slim as base
+
+ # Declare ARG variables again within the build stage
+ ARG INSTALL_TYPE=all
+ ARG ENABLE_GPU=false
+
+ # Platform-specific labels
+ LABEL maintainer="unclecode"
+ LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
+ LABEL version="1.0"
+
+ # Environment setup
+ ENV PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PIP_DISABLE_PIP_VERSION_CHECK=1 \
+     PIP_DEFAULT_TIMEOUT=100 \
+     DEBIAN_FRONTEND=noninteractive
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     curl \
+     wget \
+     gnupg \
+     git \
+     cmake \
+     pkg-config \
+     python3-dev \
+     libjpeg-dev \
+     libpng-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Playwright system dependencies for Linux
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libglib2.0-0 \
+     libnss3 \
+     libnspr4 \
+     libatk1.0-0 \
+     libatk-bridge2.0-0 \
+     libcups2 \
+     libdrm2 \
+     libdbus-1-3 \
+     libxcb1 \
+     libxkbcommon0 \
+     libx11-6 \
+     libxcomposite1 \
+     libxdamage1 \
+     libxext6 \
+     libxfixes3 \
+     libxrandr2 \
+     libgbm1 \
+     libpango-1.0-0 \
+     libcairo2 \
+     libasound2 \
+     libatspi2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # GPU support if enabled and architecture is supported
+ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
+         apt-get update && apt-get install -y --no-install-recommends \
+         nvidia-cuda-toolkit \
+         && rm -rf /var/lib/apt/lists/* ; \
+     else \
+         echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
+     fi
+
+ # Create and set working directory
+ WORKDIR /app
+
+ # Copy the entire project
+ COPY . .
+
+ # Install base requirements
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install required libraries for FastAPI
+ RUN pip install fastapi uvicorn psutil
+
+ # Install ML dependencies first for better layer caching
+ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
+         pip install --no-cache-dir \
+         torch \
+         torchvision \
+         torchaudio \
+         scikit-learn \
+         nltk \
+         transformers \
+         tokenizers && \
+         python -m nltk.downloader punkt stopwords ; \
+     fi
+
+ # Install the package
+ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
+         pip install ".[all]" && \
+         python -m crawl4ai.model_loader ; \
+     elif [ "$INSTALL_TYPE" = "torch" ] ; then \
+         pip install ".[torch]" ; \
+     elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
+         pip install ".[transformer]" && \
+         python -m crawl4ai.model_loader ; \
+     else \
+         pip install "." ; \
+     fi
+
+ # Install MkDocs and required plugins
+ RUN pip install --no-cache-dir \
+     mkdocs \
+     mkdocs-material \
+     mkdocs-terminal \
+     pymdown-extensions
+
+ # Build MkDocs documentation
+ RUN mkdocs build
+
+ # Install Playwright and browsers
+ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+         playwright install chromium; \
+     elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+         playwright install chromium; \
+     fi
+
+ # Expose ports
+ EXPOSE 8000 11235 9222 8080
+
+ # Start the FastAPI server
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
app.py ADDED
@@ -0,0 +1,54 @@
+ import modal
+
+ # Build a custom container image with the necessary dependencies, which we will pass to our functions
+ crawler = modal.Image.debian_slim(python_version="3.10").pip_install_from_requirements("requirements.txt").run_commands(
+     "apt-get update",
+     "apt-get install -y software-properties-common",
+     "apt-add-repository non-free",
+     "apt-add-repository contrib",
+     "playwright install-deps chromium",
+     "playwright install chromium",
+     "playwright install",
+ )
+
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+ import playwright
+ from typing import Optional, Union, List
+ from pydantic import BaseModel, Field
+ from fastapi import Header, HTTPException
+ from jwt import decode, PyJWTError
+ import os
+
+ app = modal.App("crawler")
+
+ class CrawlRequest(BaseModel):
+     url: str
+     bypass_cache: bool = Field(default=False)
+     # other kwargs
+
+ # Define the function that will be executed in the container
+ @app.function(image=crawler)
+ @modal.web_endpoint(method="POST", docs=True)
+ async def crawl(request: CrawlRequest, authorization: str = Header(...)):
+     # You will want to have your own authorization strategy here to protect your endpoint
+     print(f"Crawling URL: {request}")
+     # Create an instance of AsyncWebCrawler
+     async with AsyncWebCrawler(verbose=True) as crawler:
+         # Run the crawler on the given URL
+         crawl_kwargs = request.dict(exclude_unset=True)
+         try:
+             result = await crawler.arun(**crawl_kwargs)
+             print(result)
+             return result
+         except Exception as e:
+             error_message = f"Error during crawling: {str(e)}"
+             print(error_message)
+             return {"error": error_message}
+
+ # Entrypoint that will be used to trigger the crawler when testing locally
+ @app.local_entrypoint()
+ async def main(url: str):
+     result = crawl.remote(CrawlRequest(url=url))
+     print(result)
+     return result
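
For reference, a minimal sketch of driving this Modal app from a local machine (assumes the modal CLI is installed and authenticated; the example URL is illustrative):

# Illustrative only: invoke the local_entrypoint, which calls crawl.remote()
modal run app.py --url https://example.com
# Or deploy the app so the POST web endpoint gets a persistent URL
modal deploy app.py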
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ crawl4ai
+ asyncio
+ playwright
+ fastapi[standard]
+ pydantic
+ PyJWT
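
For reference, a minimal local-setup sketch (an assumption about the workflow: the modal client used by app.py is not listed above, so it is installed separately here):

# Illustrative only
pip install -r requirements.txt modal
playwright install chromium   # only needed when crawling locally, outside the Modal image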