diff --git a/README.md b/README.md index 3144d8bb..f14650a7 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ intuned dev run api .parameters/api//default.json |---------|-------------| | [starter](./python-examples/starter/) | Starter template for new projects | | [starter-auth](./python-examples/starter-auth/) | Starter template with Auth Sessions enabled | +| [starter-crawl4ai](./python-examples/starter-crawl4ai/) | Minimal Crawl4AI single-URL crawling starter | | [starter-network-interception](./python-examples/starter-network-interception/) | Minimal network interception starter | | [starter-rpa](./python-examples/starter-rpa/) | Minimal RPA starter for browser automation | | [starter-scrapy](./python-examples/starter-scrapy/) | Minimal Scrapy starter | diff --git a/python-examples/README.md b/python-examples/README.md index a67528b5..87d4ef69 100644 --- a/python-examples/README.md +++ b/python-examples/README.md @@ -26,6 +26,7 @@ intuned dev run api .parameters/api//default.json | -------- | ------------- | | [starter](./starter/) | Starter template for new projects | | [starter-auth](./starter-auth/) | Starter template with Auth Sessions enabled | +| [starter-crawl4ai](./starter-crawl4ai/) | Minimal Crawl4AI single-URL crawling starter | | [starter-network-interception](./starter-network-interception/) | Minimal network interception starter | | [starter-rpa](./starter-rpa/) | Minimal RPA starter for browser automation | | [starter-scrapy](./starter-scrapy/) | Minimal Scrapy starter | diff --git a/python-examples/starter-crawl4ai/.env.example b/python-examples/starter-crawl4ai/.env.example new file mode 100644 index 00000000..29b1e203 --- /dev/null +++ b/python-examples/starter-crawl4ai/.env.example @@ -0,0 +1 @@ +INTUNED_API_KEY=your_api_key_here \ No newline at end of file diff --git a/python-examples/starter-crawl4ai/.gitignore b/python-examples/starter-crawl4ai/.gitignore new file mode 100644 index 00000000..6fc22999 --- /dev/null +++ 
b/python-examples/starter-crawl4ai/.gitignore @@ -0,0 +1,53 @@ +# Dependencies +node_modules/ +.pnp +.pnp.js + +# Production builds +/build +/dist +/.next/ +/out/ + +# Environment variables +.env +.env.local +.env.development.local +.env.test.local +.env.production.local + +# Logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Runtime/temporary files +.cache +.parcel-cache +*.tsbuildinfo + +# Coverage +/coverage +.nyc_output + +# Python +__pycache__/ +*.py[cod] +*.pyc +.Python +build/ +*.egg-info/ +.venv/ +venv/ +.env + +# OS files +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +.intuned +.intuned-agent diff --git a/python-examples/starter-crawl4ai/.parameters/api/simple-crawl/default.json b/python-examples/starter-crawl4ai/.parameters/api/simple-crawl/default.json new file mode 100644 index 00000000..549c6e13 --- /dev/null +++ b/python-examples/starter-crawl4ai/.parameters/api/simple-crawl/default.json @@ -0,0 +1,3 @@ +{ + "url": "https://playwright.dev/docs/intro" +} diff --git a/python-examples/starter-crawl4ai/Intuned.jsonc b/python-examples/starter-crawl4ai/Intuned.jsonc new file mode 100644 index 00000000..a68c8719 --- /dev/null +++ b/python-examples/starter-crawl4ai/Intuned.jsonc @@ -0,0 +1,31 @@ +// For more information, see our Intuned settings reference +// https://intunedhq.com/docs/main/05-references/intuned-json +{ + "apiAccess": { + "enabled": true + }, + "authSessions": { + "enabled": false + }, + "replication": { + "maxConcurrentRequests": 1, + "size": "large" + }, + "metadata": { + "template": { + "name": "starter-crawl4ai", + "description": "Minimal Crawl4AI starter that crawls a single URL to clean markdown", + "tags": [ + "starter", + "crawling", + "crawl4ai" + ] + }, + "defaultRunPlaygroundInput": { + "apiName": "simple-crawl", + "parameters": { + "url": "https://playwright.dev/docs/intro" + } + } + } +} diff --git a/python-examples/starter-crawl4ai/README.md b/python-examples/starter-crawl4ai/README.md new file mode 100644 
index 00000000..7b82fafe --- /dev/null +++ b/python-examples/starter-crawl4ai/README.md @@ -0,0 +1,75 @@ +# starter-crawl4ai (Python) + +Minimal [Crawl4AI](https://crawl4ai.com) starter — crawls a single URL and returns the page content as clean markdown. + +For deep crawling, multi-URL crawling, content selection, and adaptive crawling, see the [Crawl4AI documentation](https://docs.crawl4ai.com/). + + +Run on Intuned + + +## APIs + +| API | Description | +| --- | ----------- | +| `simple-crawl` | Crawls a single URL and returns the page content as clean markdown | + + +## Getting Started + +### Install dependencies + +```bash +uv sync +``` + +If the `intuned` CLI is not installed, install it globally: + +```bash +npm install -g @intuned/cli +``` + +After installing dependencies, the `intuned` command should be available in your environment. + +### Run an API + +```bash +intuned dev run api simple-crawl .parameters/api/simple-crawl/default.json +``` + +### Save project + +```bash +intuned dev provision +``` + +### Deploy + +```bash +intuned dev deploy +``` + + +## Project Structure + +``` +starter-crawl4ai/ +├── api/ +│ └── simple-crawl.py # Crawl a single URL to markdown +├── intuned-resources/ +│ └── jobs/ +│ └── simple-crawl.job.jsonc # Job definition for simple-crawl API +├── .parameters/ +│ └── api/ +│ └── simple-crawl/ +├── Intuned.jsonc +├── pyproject.toml +└── README.md +``` + +## Related + +- [Crawl4AI Documentation](https://docs.crawl4ai.com/) +- [Intuned CLI](https://intunedhq.com/docs/main/05-references/cli/overview) +- [Intuned Browser SDK](https://intunedhq.com/docs/automation-sdks/overview) +- [Intuned llms.txt](https://intunedhq.com/docs/llms.txt) diff --git a/python-examples/starter-crawl4ai/api/simple-crawl.py b/python-examples/starter-crawl4ai/api/simple-crawl.py new file mode 100644 index 00000000..96757b20 --- /dev/null +++ b/python-examples/starter-crawl4ai/api/simple-crawl.py @@ -0,0 +1,62 @@ +""" +Crawls a single URL and returns the page content 
class Params(TypedDict):
    """Input parameters accepted by the ``simple-crawl`` API."""

    # URL of the page to crawl.
    url: str


async def automation(
    page: Page,
    params: Params,
    context: BrowserContext | None = None,
    **_kwargs,
):
    """Crawl a single URL with Crawl4AI and return its content as markdown.

    Args:
        page: Playwright page passed in by the caller. Not used by this
            function; the crawl goes through ``AsyncWebCrawler``.
        params: Mapping that must contain a non-blank string ``url``.
        context: Optional Playwright browser context. Not used here.
        **_kwargs: Extra keyword arguments, accepted and ignored.

    Returns:
        On success, a dict with ``success=True`` plus ``markdown``,
        ``images`` and ``links`` (internal links only). On failure, a dict
        with ``success=False`` and an ``error`` message.
    """
    # Tolerate a missing params mapping so the caller gets the same
    # structured error as for a missing URL instead of an AttributeError.
    url = (params or {}).get("url")
    if not isinstance(url, str) or not url.strip():
        return {
            "success": False,
            "error": "URL parameter is required",
        }
    url = url.strip()

    browser_config = BrowserConfig(verbose=True)
    run_config = CrawlerRunConfig(
        # Content filtering
        word_count_threshold=10,
        excluded_tags=["form", "header"],
        exclude_external_links=True,
        # Content processing
        process_iframes=True,
        remove_overlay_elements=True,
        # Cache control
        cache_mode=CacheMode.ENABLED,  # Use cache if available
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config,
        )

    if not result.success:
        return {
            "success": False,
            "error": result.error_message,
        }

    # media/links are plain dicts; a page without images or internal links
    # may not carry the key, so fall back to an empty list rather than
    # raising KeyError on an otherwise successful crawl.
    return {
        "success": True,
        "markdown": result.markdown,
        "images": (result.media or {}).get("images", []),
        "links": (result.links or {}).get("internal", []),
    }
00000000..3fe017a6 --- /dev/null +++ b/python-examples/starter-crawl4ai/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "default" +version = "0.0.1" +description = "Empty Intuned project" +authors = [{ name = "Intuned", email = "service@intunedhq.com" }] +requires-python = ">=3.12,<3.13" +readme = "README.md" +keywords = [ + "Python", + "intuned-browser-sdk", +] +dependencies = [ + "playwright==1.56", + "intuned-runtime==1.3.33", + "intuned-browser==0.1.17", + "crawl4ai==0.8.6", +] + +[tool.uv] +package = false