diff --git a/README.md b/README.md index 5871dea..05b9eb0 100644 --- a/README.md +++ b/README.md @@ -5,40 +5,27 @@ Made with love by the [ScrapeGraphAI team](https://scrapegraphai.com) πŸ’œ ![Demo Video](/assets/demo.gif) -Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) β€” AI-powered web scraping, data extraction, search, and crawling. +Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) β€” AI-powered web scraping, data extraction, search, and crawling. Uses the **v2 API**. ## Project Structure ``` just-scrape/ -β”œβ”€β”€ docs/ # API response docs per endpoint -β”‚ β”œβ”€β”€ smartscraper.md -β”‚ β”œβ”€β”€ searchscraper.md -β”‚ β”œβ”€β”€ markdownify.md -β”‚ β”œβ”€β”€ crawl.md -β”‚ β”œβ”€β”€ scrape.md -β”‚ β”œβ”€β”€ agenticscraper.md -β”‚ β”œβ”€β”€ generate-schema.md -β”‚ β”œβ”€β”€ sitemap.md -β”‚ └── credits.md β”œβ”€β”€ src/ β”‚ β”œβ”€β”€ cli.ts # Entry point, citty main command + subcommands β”‚ β”œβ”€β”€ lib/ +β”‚ β”‚ β”œβ”€β”€ client.ts # ScrapeGraphAI v2 client factory β”‚ β”‚ β”œβ”€β”€ env.ts # Env config (API key, JUST_SCRAPE_* β†’ SGAI_* bridge) β”‚ β”‚ β”œβ”€β”€ folders.ts # API key resolution + interactive prompt β”‚ β”‚ └── log.ts # Logger factory + syntax-highlighted JSON output β”‚ β”œβ”€β”€ commands/ -β”‚ β”‚ β”œβ”€β”€ smart-scraper.ts -β”‚ β”‚ β”œβ”€β”€ search-scraper.ts +β”‚ β”‚ β”œβ”€β”€ extract.ts +β”‚ β”‚ β”œβ”€β”€ search.ts +β”‚ β”‚ β”œβ”€β”€ scrape.ts β”‚ β”‚ β”œβ”€β”€ markdownify.ts β”‚ β”‚ β”œβ”€β”€ crawl.ts -β”‚ β”‚ β”œβ”€β”€ sitemap.ts -β”‚ β”‚ β”œβ”€β”€ scrape.ts -β”‚ β”‚ β”œβ”€β”€ agentic-scraper.ts -β”‚ β”‚ β”œβ”€β”€ generate-schema.ts β”‚ β”‚ β”œβ”€β”€ history.ts -β”‚ β”‚ β”œβ”€β”€ credits.ts -β”‚ β”‚ └── validate.ts +β”‚ β”‚ └── credits.ts β”‚ └── utils/ β”‚ └── banner.ts # ASCII banner + version from package.json β”œβ”€β”€ dist/ # Build output (git-ignored) @@ -90,264 +77,190 @@ Four ways to provide it (checked in order): | Variable | Description | Default | |---|---|---| | `SGAI_API_KEY` | 
ScrapeGraph API key | β€” | -| `JUST_SCRAPE_API_URL` | Override API base URL | `https://api.scrapegraphai.com/v1` | -| `JUST_SCRAPE_TIMEOUT_S` | Request/polling timeout in seconds | `120` | -| `JUST_SCRAPE_DEBUG` | Set to `1` to enable debug logging to stderr | `0` | +| `SGAI_API_URL` | Override API base URL | `https://api.scrapegraphai.com` | +| `SGAI_TIMEOUT_S` | Request timeout in seconds | `30` | + +Legacy variables (`JUST_SCRAPE_API_URL`, `JUST_SCRAPE_TIMEOUT_S`, `JUST_SCRAPE_DEBUG`) are still bridged. ## JSON Mode (`--json`) All commands support `--json` for machine-readable output. When set, banner, spinners, and interactive prompts are suppressed β€” only minified JSON on stdout (saves tokens when piped to AI agents). ```bash -just-scrape credits --json | jq '.remaining_credits' -just-scrape smart-scraper https://example.com -p "Extract data" --json > result.json -just-scrape history smartscraper --json | jq '.requests[].status' +just-scrape credits --json | jq '.remainingCredits' +just-scrape extract https://example.com -p "Extract data" --json > result.json +just-scrape history extract --json | jq '.[].status' ``` --- -## Smart Scraper +## Extract -Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/services/smartscraper) +Extract structured data from any URL using AI (replaces `smart-scraper`). 
[docs](https://docs.scrapegraphai.com/api-reference/extract) ### Usage ```bash -just-scrape smart-scraper -p # Extract data with AI -just-scrape smart-scraper -p --schema # Enforce output schema -just-scrape smart-scraper -p --scrolls # Infinite scroll (0-100) -just-scrape smart-scraper -p --pages # Multi-page (1-100) -just-scrape smart-scraper -p --stealth # Anti-bot bypass (+4 credits) -just-scrape smart-scraper -p --cookies --headers -just-scrape smart-scraper -p --plain-text # Plain text instead of JSON +just-scrape extract -p # Extract data with AI +just-scrape extract -p --schema # Enforce output schema +just-scrape extract -p --scrolls # Infinite scroll (0-100) +just-scrape extract -p --stealth # Anti-bot bypass (+4 credits) +just-scrape extract -p --cookies --headers +just-scrape extract -p --country # Geo-targeting ``` ### Examples ```bash # Extract product listings from an e-commerce page -just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings" +just-scrape extract https://store.example.com/shoes -p "Extract all product names, prices, and ratings" # Extract with a strict schema, scrolling to load more content -just-scrape smart-scraper https://news.example.com -p "Get all article headlines and dates" \ +just-scrape extract https://news.example.com -p "Get all article headlines and dates" \ --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \ --scrolls 5 # Scrape a JS-heavy SPA behind anti-bot protection -just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \ +just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \ --stealth ``` -## Search Scraper +## Search -Search the web and extract structured data from results. 
[docs](https://docs.scrapegraphai.com/services/searchscraper) +Search the web and extract structured data from results (replaces `search-scraper`). [docs](https://docs.scrapegraphai.com/api-reference/search) ### Usage ```bash -just-scrape search-scraper # AI-powered web search -just-scrape search-scraper --num-results # Sources to scrape (3-20, default 3) -just-scrape search-scraper --no-extraction # Markdown only (2 credits vs 10) -just-scrape search-scraper --schema # Enforce output schema -just-scrape search-scraper --stealth --headers +just-scrape search # AI-powered web search +just-scrape search --num-results # Sources to scrape (1-20, default 3) +just-scrape search -p # Extraction prompt for results +just-scrape search --schema # Enforce output schema +just-scrape search --headers ``` ### Examples ```bash # Research a topic across multiple sources -just-scrape search-scraper "What are the best Python web frameworks in 2025?" --num-results 10 - -# Get raw markdown from search results (cheaper) -just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5 +just-scrape search "What are the best Python web frameworks in 2025?" --num-results 10 # Structured output with schema -just-scrape search-scraper "Top 5 cloud providers pricing" \ +just-scrape search "Top 5 cloud providers pricing" \ --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}' ``` -## Markdownify - -Convert any webpage to clean markdown. 
[docs](https://docs.scrapegraphai.com/services/markdownify) - -### Usage - -```bash -just-scrape markdownify # Convert to markdown -just-scrape markdownify --stealth # Anti-bot bypass (+4 credits) -just-scrape markdownify --headers # Custom headers -``` - -### Examples - -```bash -# Convert a blog post to markdown -just-scrape markdownify https://blog.example.com/my-article - -# Convert a JS-rendered page behind Cloudflare -just-scrape markdownify https://protected.example.com --stealth - -# Pipe markdown to a file -just-scrape markdownify https://docs.example.com/api --json | jq -r '.result' > api-docs.md -``` - -## Crawl +## Scrape -Crawl multiple pages and extract data from each. [docs](https://docs.scrapegraphai.com/services/smartcrawler) +Scrape content from a URL in various formats: markdown (default), html, screenshot, or branding. [docs](https://docs.scrapegraphai.com/api-reference/scrape) ### Usage ```bash -just-scrape crawl -p # Crawl + extract -just-scrape crawl -p --max-pages # Max pages (default 10) -just-scrape crawl -p --depth # Crawl depth (default 1) -just-scrape crawl --no-extraction --max-pages # Markdown only (2 credits/page) -just-scrape crawl -p --schema # Enforce output schema -just-scrape crawl -p --rules # Crawl rules (include_paths, same_domain) -just-scrape crawl -p --no-sitemap # Skip sitemap discovery -just-scrape crawl -p --stealth # Anti-bot bypass +just-scrape scrape # Markdown (default) +just-scrape scrape -f html # Raw HTML +just-scrape scrape -f screenshot # Screenshot +just-scrape scrape -f branding # Extract branding info +just-scrape scrape --stealth # Anti-bot bypass (+4 credits) +just-scrape scrape --country # Geo-targeting ``` ### Examples ```bash -# Crawl a docs site and extract all code examples -just-scrape crawl https://docs.example.com -p "Extract all code snippets with their language" \ - --max-pages 20 --depth 3 - -# Crawl only blog pages, skip everything else -just-scrape crawl https://example.com -p "Extract article 
titles and summaries" \ - --rules '{"include_paths":["/blog/*"],"same_domain":true}' --max-pages 50 - -# Get raw markdown from all pages (no AI extraction, cheaper) -just-scrape crawl https://example.com --no-extraction --max-pages 10 -``` - -## Sitemap - -Get all URLs from a website's sitemap. [docs](https://docs.scrapegraphai.com/services/sitemap) - -### Usage - -```bash -just-scrape sitemap -``` +# Get markdown of a page +just-scrape scrape https://example.com -### Examples +# Get raw HTML +just-scrape scrape https://example.com -f html -```bash -# List all pages on a site -just-scrape sitemap https://example.com +# Scrape with anti-bot bypass and geo-targeting +just-scrape scrape https://store.example.com --stealth --country DE -# Pipe URLs to another tool -just-scrape sitemap https://example.com --json | jq -r '.urls[]' +# Extract branding info (logos, colors, fonts) +just-scrape scrape https://example.com -f branding ``` -## Scrape +## Markdownify -Get raw HTML content from a URL. [docs](https://docs.scrapegraphai.com/services/scrape) +Convert any webpage to clean markdown (convenience wrapper for `scrape --format markdown`). 
[docs](https://docs.scrapegraphai.com/api-reference/scrape) ### Usage ```bash -just-scrape scrape # Raw HTML -just-scrape scrape --stealth # Anti-bot bypass (+4 credits) -just-scrape scrape --branding # Extract branding (+2 credits) -just-scrape scrape --country-code # Geo-targeting +just-scrape markdownify # Convert to markdown +just-scrape markdownify --stealth # Anti-bot bypass (+4 credits) +just-scrape markdownify --headers # Custom headers ``` ### Examples ```bash -# Get raw HTML of a page -just-scrape scrape https://example.com +# Convert a blog post to markdown +just-scrape markdownify https://blog.example.com/my-article -# Scrape a geo-restricted page with anti-bot bypass -just-scrape scrape https://store.example.com --stealth --country-code DE +# Convert a JS-rendered page behind Cloudflare +just-scrape markdownify https://protected.example.com --stealth -# Extract branding info (logos, colors, fonts) -just-scrape scrape https://example.com --branding +# Pipe markdown to a file +just-scrape markdownify https://docs.example.com/api --json | jq -r '.markdown' > api-docs.md ``` -## Agentic Scraper +## Crawl -Browser automation with AI β€” login, click, navigate, fill forms. [docs](https://docs.scrapegraphai.com/services/agenticscraper) +Crawl multiple pages. The CLI starts the crawl and polls until completion. 
[docs](https://docs.scrapegraphai.com/api-reference/crawl) ### Usage ```bash -just-scrape agentic-scraper -s # Run browser steps -just-scrape agentic-scraper -s --ai-extraction -p -just-scrape agentic-scraper -s --schema -just-scrape agentic-scraper -s --use-session # Persist browser session +just-scrape crawl # Crawl with defaults +just-scrape crawl --max-pages # Max pages (default 50) +just-scrape crawl --max-depth # Crawl depth (default 2) +just-scrape crawl --max-links-per-page # Links per page (default 10) +just-scrape crawl --allow-external # Allow external domains +just-scrape crawl --stealth # Anti-bot bypass ``` ### Examples ```bash -# Log in and extract dashboard data -just-scrape agentic-scraper https://app.example.com/login \ - -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \ - --ai-extraction -p "Extract all dashboard metrics" - -# Navigate through a multi-step form -just-scrape agentic-scraper https://example.com/wizard \ - -s "Click Next,Select Premium plan,Fill name with John,Click Submit" - -# Persistent session across multiple runs -just-scrape agentic-scraper https://app.example.com \ - -s "Click Settings" --use-session -``` - -## Generate Schema - -Generate a JSON schema from a natural language description. 
- -### Usage - -```bash -just-scrape generate-schema # AI generates a schema -just-scrape generate-schema --existing-schema -``` +# Crawl a docs site +just-scrape crawl https://docs.example.com --max-pages 20 --max-depth 3 -### Examples - -```bash -# Generate a schema for product data -just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array" +# Crawl staying within domain +just-scrape crawl https://example.com --max-pages 50 -# Refine an existing schema -just-scrape generate-schema "Add an availability field" \ - --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}' +# Get crawl results as JSON +just-scrape crawl https://example.com --json --max-pages 10 ``` ## History -Browse request history for any service. Interactive by default β€” arrow keys to navigate, select to view details, "Load more" for infinite scroll. +Browse request history for any service. Interactive by default β€” arrow keys to navigate, select to view details, "Load more" for pagination. 
### Usage ```bash -just-scrape history # Interactive browser -just-scrape history # Fetch specific request -just-scrape history --page # Start from page (default 1) -just-scrape history --page-size # Results per page (default 10, max 100) -just-scrape history --json # Raw JSON (pipeable) +just-scrape history # Interactive browser +just-scrape history # Fetch specific request +just-scrape history --page # Start from page (default 1) +just-scrape history --page-size # Results per page (default 20, max 100) +just-scrape history --json # Raw JSON (pipeable) ``` -Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap` +Services: `scrape`, `extract`, `search`, `monitor`, `crawl` ### Examples ```bash -# Browse your smart-scraper history interactively -just-scrape history smartscraper +# Browse your extract history interactively +just-scrape history extract # Jump to a specific request by ID -just-scrape history smartscraper abc123-def456-7890 +just-scrape history extract abc123-def456-7890 # Export crawl history as JSON -just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}' +just-scrape history crawl --json --page-size 100 | jq '.[].status' ``` ## Credits @@ -356,18 +269,26 @@ Check your credit balance. ```bash just-scrape credits -just-scrape credits --json | jq '.remaining_credits' +just-scrape credits --json | jq '.remainingCredits' ``` -## Validate +--- -Validate your API key (health check). 
+## Migration from v0.2.x -```bash -just-scrape validate -``` +Commands have been renamed to match the v2 API: ---- +| Old command | New command | Notes | +|---|---|---| +| `smart-scraper` | `extract` | Renamed | +| `search-scraper` | `search` | Renamed | +| `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | +| `scrape` | `scrape` | Gains `--format` flag (markdown, html, screenshot, branding) | +| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external` | +| `agentic-scraper` | β€” | Removed from API | +| `generate-schema` | β€” | Removed from API | +| `sitemap` | β€” | Removed from API | +| `validate` | β€” | Removed from API | ## Contributing @@ -392,7 +313,7 @@ bun run dev --help | CLI Framework | **citty** (unjs) | | Prompts | **@clack/prompts** | | Styling | **chalk** v5 (ESM) | -| SDK | **scrapegraph-js** | +| SDK | **scrapegraph-js** v2 | | Env | **dotenv** | | Lint / Format | **Biome** | | Target | **Node.js 22+**, ESM-only | diff --git a/bun.lock b/bun.lock index 5a7bd89..1732297 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", 
"@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@1.0.0", "", {}, "sha512-eQn8/HRfJHjCoj2yia5yHWQTYUae/bYNhLEx00ZXF+GLKpgUJT0OCGUQM13WGSX5cgw9onz5EiaDJDbzcbeYtQ=="], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#4b86432", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-4b86432"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], diff --git a/package.json b/package.json index 55c9e7b..2d8cd18 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "just-scrape", - "version": "0.2.1", + "version": "0.3.0", "description": "ScrapeGraph AI CLI tool", "type": "module", "main": "dist/cli.mjs", @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/cli.ts b/src/cli.ts index 483a94c..255e93a 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -12,17 +12,13 @@ const main = defineCommand({ description: "ScrapeGraph AI CLI tool", }, subCommands: { - "smart-scraper": () => 
import("./commands/smart-scraper.js").then((m) => m.default), - "search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default), + extract: () => import("./commands/extract.js").then((m) => m.default), + search: () => import("./commands/search.js").then((m) => m.default), + scrape: () => import("./commands/scrape.js").then((m) => m.default), markdownify: () => import("./commands/markdownify.js").then((m) => m.default), crawl: () => import("./commands/crawl.js").then((m) => m.default), - sitemap: () => import("./commands/sitemap.js").then((m) => m.default), - scrape: () => import("./commands/scrape.js").then((m) => m.default), - "agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default), - "generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default), history: () => import("./commands/history.js").then((m) => m.default), credits: () => import("./commands/credits.js").then((m) => m.default), - validate: () => import("./commands/validate.js").then((m) => m.default), }, }); diff --git a/src/commands/agentic-scraper.ts b/src/commands/agentic-scraper.ts deleted file mode 100644 index 67e9b5b..0000000 --- a/src/commands/agentic-scraper.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "agentic-scraper", - description: "Browser automation with AI (login, click, navigate, fill forms)", - }, - args: { - url: { - type: "positional", - description: "Starting URL", - required: true, - }, - steps: { - type: "string", - alias: "s", - description: 'Comma-separated browser steps (e.g. 
"Click login,Fill email with x")', - }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt (used with --ai-extraction)", - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - "ai-extraction": { type: "boolean", description: "Enable AI extraction after steps" }, - "use-session": { type: "boolean", description: "Persist browser session across requests" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/agenticscraper"); - const key = await resolveApiKey(!!args.json); - - const steps = args.steps ? args.steps.split(",").map((s) => s.trim()) : []; - const params: scrapegraphai.AgenticScraperParams = { url: args.url, steps }; - if (args.prompt) params.user_prompt = args.prompt; - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args["ai-extraction"]) params.ai_extraction = true; - if (args["use-session"]) params.use_session = true; - - out.start("Running browser automation"); - const result = await scrapegraphai.agenticScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index d55101d..3b23357 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,8 +1,9 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; +const POLL_INTERVAL_MS = 3000; + export default defineCommand({ meta: { name: "crawl", @@ -14,49 +15,54 @@ export default defineCommand({ description: "Starting URL to crawl", required: true, }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt (required when extraction mode is 
on)", - }, - "no-extraction": { - type: "boolean", - description: "Return markdown only (2 credits/page instead of 10)", - }, - "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" }, - depth: { type: "string", description: "Crawl depth (default 1)" }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - rules: { type: "string", description: "Crawl rules as JSON object string" }, - "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" }, + "max-pages": { type: "string", description: "Maximum pages to crawl (default 50)" }, + "max-depth": { type: "string", description: "Crawl depth (default 2)" }, + "max-links-per-page": { type: "string", description: "Max links per page (default 10)" }, + "allow-external": { type: "boolean", description: "Allow crawling external domains" }, stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/smartcrawler"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/crawl"); + const sgai = await createClient(!!args.json); - const base: Record = { url: args.url }; - if (args["max-pages"]) base.max_pages = Number(args["max-pages"]); - if (args.depth) base.depth = Number(args.depth); - if (args.rules) base.rules = JSON.parse(args.rules); - if (args["no-sitemap"]) base.sitemap = false; - if (args.stealth) base.stealth = true; + const crawlOptions: Record = {}; + if (args["max-pages"]) crawlOptions.maxPages = Number(args["max-pages"]); + if (args["max-depth"]) crawlOptions.maxDepth = Number(args["max-depth"]); + if (args["max-links-per-page"]) + crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]); + if (args["allow-external"]) crawlOptions.allowExternal = true; + if 
(args.stealth) crawlOptions.fetchConfig = { stealth: true }; - if (args["no-extraction"]) { - base.extraction_mode = false; - } else { - if (args.prompt) base.prompt = args.prompt; - if (args.schema) base.schema = JSON.parse(args.schema); - } + out.start("Crawling"); + const t0 = performance.now(); + try { + const job = await sgai.crawl.start(args.url, crawlOptions as any); + const jobId = (job.data as { id: string }).id; - const params = base as scrapegraphai.CrawlParams; + if (!jobId) { + out.stop(Math.round(performance.now() - t0)); + out.result(job.data); + return; + } - out.start("Crawling"); - const result = await scrapegraphai.crawl(key, params, out.poll); - out.stop(result.elapsedMs); + // Poll until the crawl completes + while (true) { + await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); + const status = await sgai.crawl.status(jobId); + const statusData = status.data as { status: string; [key: string]: unknown }; + out.poll(statusData.status); - if (result.data) out.result(result.data); - else out.error(result.error); + if (statusData.status === "completed" || statusData.status === "failed" || statusData.status === "cancelled") { + out.stop(Math.round(performance.now() - t0)); + out.result(status.data); + return; + } + } + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? 
err.message : String(err)); + } }, }); diff --git a/src/commands/credits.ts b/src/commands/credits.ts index 0d7b75f..457e27d 100644 --- a/src/commands/credits.ts +++ b/src/commands/credits.ts @@ -1,6 +1,5 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -13,13 +12,17 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); + const sgai = await createClient(!!args.json); out.start("Fetching credits"); - const result = await scrapegraphai.getCredits(key); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); + const t0 = performance.now(); + try { + const result = await sgai.credits(); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? 
err.message : String(err)); + } }, }); diff --git a/src/commands/extract.ts b/src/commands/extract.ts new file mode 100644 index 0000000..bb0be8f --- /dev/null +++ b/src/commands/extract.ts @@ -0,0 +1,57 @@ +import { defineCommand } from "citty"; +import { createClient } from "../lib/client.js"; +import * as log from "../lib/log.js"; + +export default defineCommand({ + meta: { + name: "extract", + description: "Extract structured data from a URL using AI", + }, + args: { + url: { + type: "positional", + description: "Website URL to scrape", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt", + required: true, + }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + cookies: { type: "string", description: "Cookies as JSON object string" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/extract"); + const sgai = await createClient(!!args.json); + + const fetchConfig: Record = {}; + if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); + if (args.stealth) fetchConfig.stealth = true; + if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); + if (args.country) fetchConfig.country = args.country; + + const extractOptions: Record = { prompt: args.prompt }; + if (args.schema) extractOptions.schema = JSON.parse(args.schema); + if (Object.keys(fetchConfig).length > 0) extractOptions.fetchConfig = fetchConfig; + + 
out.start("Extracting"); + const t0 = performance.now(); + try { + const result = await sgai.extract(args.url, extractOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); diff --git a/src/commands/generate-schema.ts b/src/commands/generate-schema.ts deleted file mode 100644 index 8d77e57..0000000 --- a/src/commands/generate-schema.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "generate-schema", - description: "Generate a JSON schema from a natural language prompt", - }, - args: { - prompt: { - type: "positional", - description: "Describe the schema you need", - required: true, - }, - "existing-schema": { - type: "string", - description: "Existing schema to modify (as JSON string)", - }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.GenerateSchemaParams = { user_prompt: args.prompt }; - if (args["existing-schema"]) params.existing_schema = JSON.parse(args["existing-schema"]); - - out.start("Generating schema"); - const result = await scrapegraphai.generateSchema(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/history.ts b/src/commands/history.ts index 99ab59e..bf844b7 100644 --- a/src/commands/history.ts +++ b/src/commands/history.ts @@ -1,11 +1,10 @@ import * as p from "@clack/prompts"; import chalk from "chalk"; import { defineCommand } from "citty"; -import { HISTORY_SERVICES } from 
"scrapegraph-js"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; +const HISTORY_SERVICES = ["scrape", "extract", "search", "monitor", "crawl"] as const; const VALID = HISTORY_SERVICES.join(", "); const LOAD_MORE = "__load_more__"; @@ -49,98 +48,107 @@ export default defineCommand({ required: true, }, page: { type: "string", description: "Page number (default: 1)" }, - "page-size": { type: "string", description: "Results per page (default: 10, max: 100)" }, + "page-size": { type: "string", description: "Results per page (default: 20, max: 100)" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const quiet = !!args.json; const out = log.create(quiet); - const key = await resolveApiKey(quiet); - const service = args.service as scrapegraphai.HistoryParams["service"]; + const sgai = await createClient(quiet); + const service = args.service as (typeof HISTORY_SERVICES)[number]; const requestId = (args as { _: string[] })._.at(1); - const pageSize = args["page-size"] ? Number(args["page-size"]) : 10; + const limit = args["page-size"] ? Number(args["page-size"]) : 20; let page = args.page ? Number(args.page) : 1; const fetchPage = async (pg: number) => { - const r = await scrapegraphai.history(key, { service, page: pg, page_size: pageSize }); - if (r.status === "error") out.error(r.error); - const d = r.data as { requests: Record[]; next_key?: string }; - return { rows: d.requests ?? [], hasMore: !!d.next_key, ms: r.elapsedMs }; + const t0 = performance.now(); + const r = await sgai.history({ service, page: pg, limit }); + const ms = Math.round(performance.now() - t0); + const d = r.data as { data?: Record[]; requests?: Record[]; next_key?: string; total?: number }; + return { rows: d.data ?? d.requests ?? 
[], hasMore: !!d.next_key || (d.total != null && pg * limit < d.total), ms }; }; if (quiet || requestId) { - const { rows } = await fetchPage(page); - if (requestId) { - const match = rows.find((r) => getId(r) === requestId); - if (!match) out.error(`Request ${requestId} not found on page ${page}`); - out.result(match); - return; + try { + const { rows } = await fetchPage(page); + if (requestId) { + const match = rows.find((r) => getId(r) === requestId); + if (!match) out.error(`Request ${requestId} not found on page ${page}`); + out.result(match); + return; + } + out.result(rows); + } catch (err) { + out.error(err instanceof Error ? err.message : String(err)); } - out.result(rows); return; } out.start(`Fetching ${service} history`); - const first = await fetchPage(page); - out.stop(first.ms); - - if (first.rows.length === 0) { - p.log.warning("No history found."); - return; - } + try { + const first = await fetchPage(page); + out.stop(first.ms); - const allRows = [...first.rows]; - let hasMore = first.hasMore; - - while (true) { - const options = allRows.map((row) => ({ - value: getId(row), - label: label(row), - hint: hint(row), - })); - - if (hasMore) { - options.push({ - value: LOAD_MORE, - label: chalk.blue.bold("↓ Load more…"), - hint: `page ${page + 1}`, - }); + if (first.rows.length === 0) { + p.log.warning("No history found."); + return; } - const selected = await p.select({ - message: `${allRows.length} requests β€” select one to view`, - options, - maxItems: 15, - }); + const allRows = [...first.rows]; + let hasMore = first.hasMore; + + while (true) { + const options = allRows.map((row) => ({ + value: getId(row), + label: label(row), + hint: hint(row), + })); + + if (hasMore) { + options.push({ + value: LOAD_MORE, + label: chalk.blue.bold("↓ Load more…"), + hint: `page ${page + 1}`, + }); + } - if (p.isCancel(selected)) { - p.cancel("Cancelled"); - return; - } + const selected = await p.select({ + message: `${allRows.length} requests β€” select one to 
view`, + options, + maxItems: 15, + }); - if (selected === LOAD_MORE) { - page++; - const ls = p.spinner(); - ls.start(`Loading page ${page}`); - const next = await fetchPage(page); - ls.stop("Done"); + if (p.isCancel(selected)) { + p.cancel("Cancelled"); + return; + } - if (next.rows.length === 0) { - hasMore = false; - p.log.warning("No more results."); + if (selected === LOAD_MORE) { + page++; + const ls = p.spinner(); + ls.start(`Loading page ${page}`); + const next = await fetchPage(page); + ls.stop("Done"); + + if (next.rows.length === 0) { + hasMore = false; + p.log.warning("No more results."); + continue; + } + + allRows.push(...next.rows); + hasMore = next.hasMore; continue; } - allRows.push(...next.rows); - hasMore = next.hasMore; - continue; - } - - const match = allRows.find((r) => getId(r) === selected); - if (match) out.result(match); + const match = allRows.find((r) => getId(r) === selected); + if (match) out.result(match); - const back = await p.confirm({ message: "Back to list?" }); - if (p.isCancel(back) || !back) return; + const back = await p.confirm({ message: "Back to list?" }); + if (p.isCancel(back) || !back) return; + } + } catch (err) { + out.error(err instanceof Error ? 
err.message : String(err)); } }, }); diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index ccfc494..5aa9dbe 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -1,6 +1,5 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -20,21 +19,25 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/markdownify"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const sgai = await createClient(!!args.json); - const params: scrapegraphai.MarkdownifyParams = { - website_url: args.url, - }; + const fetchConfig: Record<string, unknown> = {}; + if (args.stealth) fetchConfig.stealth = true; + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); - if (args.stealth) params.stealth = true; - if (args.headers) params.headers = JSON.parse(args.headers); + const scrapeOptions: Record<string, unknown> = { format: "markdown" }; + if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Converting to markdown"); - const result = await scrapegraphai.markdownify(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); + const t0 = performance.now(); + try { + const result = await sgai.scrape(args.url, scrapeOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? 
err.message : String(err)); + } }, }); diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index b0517eb..8339f27 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -1,12 +1,11 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ meta: { name: "scrape", - description: "Get raw HTML content from a URL", + description: "Scrape content from a URL (markdown, html, screenshot, or branding)", }, args: { url: { @@ -14,27 +13,37 @@ export default defineCommand({ description: "Website URL to scrape", required: true, }, + format: { + type: "string", + alias: "f", + description: "Output format: markdown (default), html, screenshot, branding", + }, stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - branding: { type: "boolean", description: "Extract branding info (+2 credits)" }, - "country-code": { type: "string", description: "ISO country code for geo-targeting" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/scrape"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const sgai = await createClient(!!args.json); - const params: scrapegraphai.ScrapeParams = { website_url: args.url }; + const fetchConfig: Record<string, unknown> = {}; + if (args.stealth) fetchConfig.stealth = true; + if (args.country) fetchConfig.country = args.country; - if (args.stealth) params.stealth = true; - if (args.branding) params.branding = true; - if (args["country-code"]) params.country_code = args["country-code"]; + const scrapeOptions: Record<string, unknown> = {}; + if 
(args.format) scrapeOptions.format = args.format; + if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Scraping"); - const result = await scrapegraphai.scrape(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); + const t0 = performance.now(); + try { + const result = await sgai.scrape(args.url, scrapeOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? err.message : String(err)); + } }, }); diff --git a/src/commands/search-scraper.ts b/src/commands/search-scraper.ts deleted file mode 100644 index 041e32c..0000000 --- a/src/commands/search-scraper.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "search-scraper", - description: "Search the web and extract data with AI", - }, - args: { - prompt: { - type: "positional", - description: "Search query and extraction instructions", - required: true, - }, - "num-results": { - type: "string", - description: "Number of websites to scrape (3-20, default 3)", - }, - "no-extraction": { - type: "boolean", - description: "Return markdown only (2 credits/site instead of 10)", - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/searchscraper"); - const key = await 
resolveApiKey(!!args.json); - - const params: scrapegraphai.SearchScraperParams = { - user_prompt: args.prompt, - }; - - if (args["num-results"]) params.num_results = Number(args["num-results"]); - if (args["no-extraction"]) params.extraction_mode = false; - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args.stealth) params.stealth = true; - if (args.headers) params.headers = JSON.parse(args.headers); - - out.start("Searching"); - const result = await scrapegraphai.searchScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/search.ts b/src/commands/search.ts new file mode 100644 index 0000000..24b56c9 --- /dev/null +++ b/src/commands/search.ts @@ -0,0 +1,51 @@ +import { defineCommand } from "citty"; +import { createClient } from "../lib/client.js"; +import * as log from "../lib/log.js"; + +export default defineCommand({ + meta: { + name: "search", + description: "Search the web and extract data with AI", + }, + args: { + query: { + type: "positional", + description: "Search query", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt for search results", + }, + "num-results": { + type: "string", + description: "Number of websites to scrape (1-20, default 3)", + }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/search"); + const sgai = await createClient(!!args.json); + + const searchOptions: Record<string, unknown> = {}; + if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); + if (args.schema) searchOptions.schema = JSON.parse(args.schema); + if 
(args.prompt) searchOptions.prompt = args.prompt; + if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; + + out.start("Searching"); + const t0 = performance.now(); + try { + const result = await sgai.search(args.query, searchOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); diff --git a/src/commands/sitemap.ts b/src/commands/sitemap.ts deleted file mode 100644 index 2120b16..0000000 --- a/src/commands/sitemap.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "sitemap", - description: "Get all URLs from a website's sitemap", - }, - args: { - url: { - type: "positional", - description: "Website URL", - required: true, - }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/sitemap"); - const key = await resolveApiKey(!!args.json); - - out.start("Fetching sitemap"); - const result = await scrapegraphai.sitemap(key, { website_url: args.url }); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/smart-scraper.ts b/src/commands/smart-scraper.ts deleted file mode 100644 index be3d2a4..0000000 --- a/src/commands/smart-scraper.ts +++ /dev/null @@ -1,57 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "smart-scraper", - 
description: "Extract structured data from a URL using AI", - }, - args: { - url: { - type: "positional", - description: "Website URL to scrape", - required: true, - }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt", - required: true, - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, - pages: { type: "string", description: "Total pages to scrape (1-100)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - cookies: { type: "string", description: "Cookies as JSON object string" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - "plain-text": { type: "boolean", description: "Return plain text instead of JSON" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/smartscraper"); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.SmartScraperParams = { - website_url: args.url, - user_prompt: args.prompt, - }; - - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args.scrolls) params.number_of_scrolls = Number(args.scrolls); - if (args.pages) params.total_pages = Number(args.pages); - if (args.stealth) params.stealth = true; - if (args.cookies) params.cookies = JSON.parse(args.cookies); - if (args.headers) params.headers = JSON.parse(args.headers); - if (args["plain-text"]) params.plain_text = true; - - out.start("Scraping"); - const result = await scrapegraphai.smartScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/validate.ts b/src/commands/validate.ts deleted file mode 100644 index dd2c81d..0000000 --- a/src/commands/validate.ts +++ /dev/null 
@@ -1,25 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "validate", - description: "Validate your API key (health check)", - }, - args: { - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); - - out.start("Checking API health"); - const result = await scrapegraphai.checkHealth(key); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/lib/client.ts b/src/lib/client.ts new file mode 100644 index 0000000..a83df1c --- /dev/null +++ b/src/lib/client.ts @@ -0,0 +1,18 @@ +import { scrapegraphai } from "scrapegraph-js"; +import { resolveApiKey } from "./folders.js"; + +let cached: ReturnType<typeof scrapegraphai> | null = null; + +export async function createClient(quiet = false) { + const apiKey = await resolveApiKey(quiet); + + if (cached) return cached; + + const baseUrl = process.env.SGAI_API_URL || undefined; + const timeout = process.env.SGAI_TIMEOUT_S + ? 
Number(process.env.SGAI_TIMEOUT_S) * 1000 + : undefined; + + cached = scrapegraphai({ apiKey, baseUrl, timeout }); + return cached; +} diff --git a/src/utils/banner.ts b/src/utils/banner.ts index 66c6386..d2bbe9c 100644 --- a/src/utils/banner.ts +++ b/src/utils/banner.ts @@ -30,8 +30,8 @@ export function showBanner() { console.log(text); console.log(chalk.hex(BANNER_COLOR)(TAGLINE)); console.log(chalk.hex(BANNER_COLOR)(`v${getVersion()}`)); - if (process.env.JUST_SCRAPE_API_URL) { - console.log(chalk.yellow(`β†’ Custom API: ${process.env.JUST_SCRAPE_API_URL}`)); + if (process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL) { + console.log(chalk.yellow(`β†’ Custom API: ${process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL}`)); } console.log(); }