Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 101 additions & 180 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "just-scrape",
"version": "0.2.1",
"version": "0.3.0",
"description": "ScrapeGraph AI CLI tool",
"type": "module",
"main": "dist/cli.mjs",
Expand Down Expand Up @@ -28,7 +28,7 @@
"chalk": "^5.4.1",
"citty": "^0.1.6",
"dotenv": "^17.2.4",
"scrapegraph-js": "^1.0.0"
"scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration"
},
"devDependencies": {
"@biomejs/biome": "^1.9.4",
Expand Down
10 changes: 3 additions & 7 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,13 @@ const main = defineCommand({
description: "ScrapeGraph AI CLI tool",
},
subCommands: {
"smart-scraper": () => import("./commands/smart-scraper.js").then((m) => m.default),
"search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default),
extract: () => import("./commands/extract.js").then((m) => m.default),
search: () => import("./commands/search.js").then((m) => m.default),
scrape: () => import("./commands/scrape.js").then((m) => m.default),
markdownify: () => import("./commands/markdownify.js").then((m) => m.default),
crawl: () => import("./commands/crawl.js").then((m) => m.default),
sitemap: () => import("./commands/sitemap.js").then((m) => m.default),
scrape: () => import("./commands/scrape.js").then((m) => m.default),
"agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default),
"generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default),
history: () => import("./commands/history.js").then((m) => m.default),
credits: () => import("./commands/credits.js").then((m) => m.default),
validate: () => import("./commands/validate.js").then((m) => m.default),
},
});

Expand Down
51 changes: 0 additions & 51 deletions src/commands/agentic-scraper.ts

This file was deleted.

78 changes: 42 additions & 36 deletions src/commands/crawl.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import { createClient } from "../lib/client.js";
import * as log from "../lib/log.js";

const POLL_INTERVAL_MS = 3000;

export default defineCommand({
meta: {
name: "crawl",
Expand All @@ -14,49 +15,54 @@ export default defineCommand({
description: "Starting URL to crawl",
required: true,
},
prompt: {
type: "string",
alias: "p",
description: "Extraction prompt (required when extraction mode is on)",
},
"no-extraction": {
type: "boolean",
description: "Return markdown only (2 credits/page instead of 10)",
},
"max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
depth: { type: "string", description: "Crawl depth (default 1)" },
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
rules: { type: "string", description: "Crawl rules as JSON object string" },
"no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
"max-pages": { type: "string", description: "Maximum pages to crawl (default 50)" },
"max-depth": { type: "string", description: "Crawl depth (default 2)" },
"max-links-per-page": { type: "string", description: "Max links per page (default 10)" },
"allow-external": { type: "boolean", description: "Allow crawling external domains" },
stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
const out = log.create(!!args.json);
out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
const key = await resolveApiKey(!!args.json);
out.docs("https://docs.scrapegraphai.com/api-reference/crawl");
const sgai = await createClient(!!args.json);

const base: Record<string, unknown> = { url: args.url };
if (args["max-pages"]) base.max_pages = Number(args["max-pages"]);
if (args.depth) base.depth = Number(args.depth);
if (args.rules) base.rules = JSON.parse(args.rules);
if (args["no-sitemap"]) base.sitemap = false;
if (args.stealth) base.stealth = true;
const crawlOptions: Record<string, unknown> = {};
if (args["max-pages"]) crawlOptions.maxPages = Number(args["max-pages"]);
if (args["max-depth"]) crawlOptions.maxDepth = Number(args["max-depth"]);
if (args["max-links-per-page"])
crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]);
if (args["allow-external"]) crawlOptions.allowExternal = true;
if (args.stealth) crawlOptions.fetchConfig = { stealth: true };

if (args["no-extraction"]) {
base.extraction_mode = false;
} else {
if (args.prompt) base.prompt = args.prompt;
if (args.schema) base.schema = JSON.parse(args.schema);
}
out.start("Crawling");
const t0 = performance.now();
try {
const job = await sgai.crawl.start(args.url, crawlOptions as any);
const jobId = (job.data as { id: string }).id;

const params = base as scrapegraphai.CrawlParams;
if (!jobId) {
out.stop(Math.round(performance.now() - t0));
out.result(job.data);
return;
}

out.start("Crawling");
const result = await scrapegraphai.crawl(key, params, out.poll);
out.stop(result.elapsedMs);
// Poll until the crawl completes
while (true) {
await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
const status = await sgai.crawl.status(jobId);
const statusData = status.data as { status: string; [key: string]: unknown };
out.poll(statusData.status);

if (result.data) out.result(result.data);
else out.error(result.error);
if (statusData.status === "completed" || statusData.status === "failed" || statusData.status === "cancelled") {
out.stop(Math.round(performance.now() - t0));
out.result(status.data);
return;
}
}
} catch (err) {
out.stop(Math.round(performance.now() - t0));
out.error(err instanceof Error ? err.message : String(err));
}
},
});
19 changes: 11 additions & 8 deletions src/commands/credits.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import { createClient } from "../lib/client.js";
import * as log from "../lib/log.js";

export default defineCommand({
Expand All @@ -13,13 +12,17 @@ export default defineCommand({
},
run: async ({ args }) => {
const out = log.create(!!args.json);
const key = await resolveApiKey(!!args.json);
const sgai = await createClient(!!args.json);

out.start("Fetching credits");
const result = await scrapegraphai.getCredits(key);
out.stop(result.elapsedMs);

if (result.data) out.result(result.data);
else out.error(result.error);
const t0 = performance.now();
try {
const result = await sgai.credits();
out.stop(Math.round(performance.now() - t0));
out.result(result.data);
} catch (err) {
out.stop(Math.round(performance.now() - t0));
out.error(err instanceof Error ? err.message : String(err));
}
},
});
57 changes: 57 additions & 0 deletions src/commands/extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import { defineCommand } from "citty";
import { createClient } from "../lib/client.js";
import * as log from "../lib/log.js";

/**
 * Parses the raw string value of a JSON-valued CLI flag.
 *
 * @param flag - Flag name without the leading dashes, used only in the error message.
 * @param raw - The string supplied on the command line.
 * @returns The parsed JSON value (unvalidated, hence `unknown`).
 * @throws Error naming the offending flag when `raw` is not valid JSON.
 */
function parseJsonFlag(flag: string, raw: string): unknown {
  try {
    return JSON.parse(raw);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Invalid JSON for --${flag}: ${reason}`);
  }
}

/**
 * Converts the `--scrolls` flag to a number, enforcing the 0-100 range
 * advertised in the flag's help text.
 *
 * @param raw - The string supplied on the command line.
 * @returns The scroll count as an integer.
 * @throws Error when `raw` is blank, not an integer, or outside 0-100.
 */
function parseScrolls(raw: string): number {
  // Number("  ") is 0, so reject blank input explicitly before converting.
  const scrolls = raw.trim() === "" ? Number.NaN : Number(raw);
  if (!Number.isInteger(scrolls) || scrolls < 0 || scrolls > 100) {
    throw new Error(`Invalid --scrolls value "${raw}": expected an integer between 0 and 100`);
  }
  return scrolls;
}

/**
 * `extract` command: sends a URL and an extraction prompt to the client's
 * `extract` method and prints the `data` field of the response.
 *
 * Fetch-related flags (scrolls, stealth, cookies, headers, country) are grouped
 * under `fetchConfig`, which is only attached when at least one of them is set.
 * Option parsing failures and request failures are both reported via `out.error`.
 */
export default defineCommand({
  meta: {
    name: "extract",
    description: "Extract structured data from a URL using AI",
  },
  args: {
    url: {
      type: "positional",
      description: "Website URL to scrape",
      required: true,
    },
    prompt: {
      type: "string",
      alias: "p",
      description: "Extraction prompt",
      required: true,
    },
    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
    scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" },
    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
    cookies: { type: "string", description: "Cookies as JSON object string" },
    headers: { type: "string", description: "Custom headers as JSON object string" },
    country: { type: "string", description: "ISO country code for geo-targeting" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const out = log.create(!!args.json);
    out.docs("https://docs.scrapegraphai.com/api-reference/extract");
    const sgai = await createClient(!!args.json);

    // Build the request options up front so that malformed user input is
    // reported through `out.error` (like request failures below) instead of
    // escaping as an uncaught exception.
    const extractOptions: Record<string, unknown> = { prompt: args.prompt };
    try {
      const fetchConfig: Record<string, unknown> = {};
      if (args.scrolls) fetchConfig.scrolls = parseScrolls(args.scrolls);
      if (args.stealth) fetchConfig.stealth = true;
      if (args.cookies) fetchConfig.cookies = parseJsonFlag("cookies", args.cookies);
      if (args.headers) fetchConfig.headers = parseJsonFlag("headers", args.headers);
      if (args.country) fetchConfig.country = args.country;

      if (args.schema) extractOptions.schema = parseJsonFlag("schema", args.schema);
      // Omit fetchConfig entirely when no fetch-related flag was given.
      if (Object.keys(fetchConfig).length > 0) extractOptions.fetchConfig = fetchConfig;
    } catch (err) {
      // The spinner has not been started yet, so there is nothing to stop.
      out.error(err instanceof Error ? err.message : String(err));
      return;
    }

    out.start("Extracting");
    const t0 = performance.now();
    try {
      // NOTE(review): the SDK's option type is not visible from this file, so the
      // loosely typed record is still cast; replace `any` with the SDK type once known.
      const result = await sgai.extract(args.url, extractOptions as any);
      out.stop(Math.round(performance.now() - t0));
      out.result(result.data);
    } catch (err) {
      out.stop(Math.round(performance.now() - t0));
      out.error(err instanceof Error ? err.message : String(err));
    }
  },
});
37 changes: 0 additions & 37 deletions src/commands/generate-schema.ts

This file was deleted.

Loading
Loading