-
-
Notifications
You must be signed in to change notification settings - Fork 543
Add logo URL checker script #26152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+470
−1
Merged
Add logo URL checker script #26152
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
aaf4265
Add logo URL checker script
rursache 290f3e7
Add per-domain rate-limit backoff for 429 responses
rursache ad6e1c2
Lower default concurrency and add default delay
rursache 93f874a
Replace custom CSV parser with repo's csvtojson dependency
rursache e44ebfa
Replace magic-byte image detection with probe-image-size
rursache 52e4d2a
Add tests for check_logos script
rursache 457cc83
typed + moved to scripts/commands/db
BellezaEmporium 8338de8
Update import path for checkAll function
BellezaEmporium d7dc11d
fix test and move it to tests/commands/db
BellezaEmporium File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,6 @@ | ||
| /node_modules/ | ||
| /.artifacts/ | ||
| .DS_Store | ||
| /temp/ | ||
| /temp/ | ||
| /dead_logos*.json | ||
| /scripts/dead_logos*.json |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,362 @@ | ||
| #!/usr/bin/env node | ||
| /** | ||
| * Scan data/logos.csv for broken logo URLs and write dead entries to a JSON file. | ||
| * | ||
| * A URL is considered dead if: | ||
| * - The connection fails or times out | ||
| * - The HTTP response is not 2xx (429s are retried with exponential backoff) | ||
| * - The Content-Type header is not image/* | ||
| * | ||
| * Requires: Node.js 18+ (uses built-in fetch) | ||
| * | ||
| * Usage: | ||
| * # Full scan of data/logos.csv | ||
| * node scripts/check_logos.js | ||
| * | ||
| * # Re-check a previous result (overwrites the same file by default) | ||
| * node scripts/check_logos.js --recheck dead_logos.json --concurrency 10 --delay 500 | ||
| * | ||
| * # Keep re-checking until no more URLs recover | ||
| * node scripts/check_logos.js --recheck dead_logos.json --loop --concurrency 10 --delay 500 | ||
| */ | ||
|
|
||
| import { readFileSync, writeFileSync, mkdirSync } from 'node:fs' | ||
| import { parseArgs } from 'node:util' | ||
| import { dirname, resolve } from 'node:path' | ||
| import csv2json from 'csvtojson' | ||
| import probe from 'probe-image-size' | ||
|
|
||
// WARNING: disables TLS certificate validation for the ENTIRE process so
// logos behind expired/self-signed certificates can still be probed.
// NOTE(review): acceptable for an offline checker script, but this is a
// process-wide security relaxation — never copy into server code.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0'
|
|
||
// Tuning knobs for the scan.
const DEFAULT_CONCURRENCY = 10
const DEFAULT_TIMEOUT = 15_000 // ms per request
const DEFAULT_DELAY_MS = 200 // ms pause after each successful fetch, per worker
const MAX_RETRIES = 4 // max re-attempts for 429s and transient socket errors
const RETRY_BASE = 2.0 // seconds; doubles each retry + jitter

// Input CSV and default output path, resolved relative to this script.
// NOTE(review): `__dirname` assumes a CommonJS build; if this file is ever
// compiled as an ES module it needs `import.meta.url` instead — confirm build.
const CSV_PATH = resolve(__dirname, '../../../', 'data', 'logos.csv')
const DEFAULT_OUTPUT = resolve(__dirname, '../../../', 'dead_logos.json')
|
|
||
| // --------------------------------------------------------------------------- | ||
| // CSV loader (uses the same csvtojson library as the rest of the repo) | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| async function loadCsv(path: string) { | ||
| return csv2json({ trim: true, delimiter: ',', eol: '\r\n' }).fromFile(path) | ||
| } | ||
|
|
||
| function loadJson(path: string) { | ||
| return JSON.parse(readFileSync(path, 'utf-8')) | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Output helpers | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| function save(dead: any[], path: string) { | ||
| mkdirSync(dirname(path), { recursive: true }) | ||
| const sorted = [...dead].sort((a, b) => (a.channel || '').localeCompare(b.channel || '')) | ||
| writeFileSync(path, JSON.stringify(sorted, null, 2) + '\n', 'utf-8') | ||
| } | ||
|
|
||
| function printSummary(dead: any[]) { | ||
| const reasons: Record<string, number> = {} | ||
| for (const r of dead) { | ||
| const reason = r._reason | ||
| let key | ||
| if (reason.startsWith('HTTP ')) key = reason | ||
| else if (reason.includes('timeout')) key = 'timeout' | ||
| else if (['ECONNREFUSED', 'ENOTFOUND', 'ECONNRESET', 'UND_ERR'].some(kw => reason.includes(kw))) key = 'connection error' | ||
| else if (reason.startsWith('bad content-type')) key = 'bad content-type' | ||
| else key = 'other' | ||
| reasons[key] = (reasons[key] || 0) + 1 | ||
| } | ||
| console.log(' Reason breakdown:') | ||
| const sorted = Object.entries(reasons).sort((a, b) => b[1] - a[1]) | ||
| for (const [reason, count] of sorted) { | ||
| console.log(` ${String(count).padStart(6)} ${reason}`) | ||
| } | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Progress formatting | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| function fmtProgress(checked: number, total: number, dead: number, elapsed: number) { | ||
| const rate = elapsed > 0 ? checked / elapsed : 0 | ||
| const eta = rate > 0 && checked < total ? (total - checked) / rate : 0 | ||
| return `[${checked}/${total}] ${(checked / total * 100).toFixed(1)}% dead: ${dead} ${rate.toFixed(1)}/s eta: ${(eta / 60).toFixed(1)}min` | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Fetch with method fallback | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| async function fetchUrl(url: string, method: string, timeout: number) { | ||
| const controller = new AbortController() | ||
| const timer = setTimeout(() => controller.abort(), timeout) | ||
| try { | ||
| const resp = await fetch(url, { | ||
| method, | ||
| signal: controller.signal, | ||
| redirect: 'follow', | ||
| headers: { 'User-Agent': 'Mozilla/5.0 (compatible; logo-checker/1.0)' }, | ||
| }) | ||
| return { | ||
| status: resp.status, | ||
| contentType: resp.headers.get('content-type') || '', | ||
| retryAfter: resp.headers.get('retry-after'), | ||
|
rursache marked this conversation as resolved.
|
||
| } | ||
| } finally { | ||
| clearTimeout(timer) | ||
| } | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Per-domain rate-limit tracking | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| function getDomain(url: string) { | ||
| try { return new URL(url).hostname } catch { return '' } | ||
| } | ||
|
|
||
| // Maps domain → timestamp (ms) before which no requests should be sent | ||
| const domainBackoff = new Map<string, number>() | ||
|
|
||
| function setDomainBackoff(domain: string, delaySec: number) { | ||
| const until = Date.now() + delaySec * 1000 | ||
| const existing = domainBackoff.get(domain) || 0 | ||
| if (until > existing) domainBackoff.set(domain, until) | ||
| } | ||
|
|
||
| async function waitForDomain(domain: string) { | ||
| const until = domainBackoff.get(domain) | ||
| if (!until) return | ||
| const wait = until - Date.now() | ||
| if (wait > 0) await new Promise(r => setTimeout(r, wait)) | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Runner | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| async function checkAll(rows: any[], concurrency: number, timeout: number, delayMs: number, liveOutput: string) { | ||
| const total = rows.length | ||
| const dead: any[] = [] | ||
| const aliveUrls = new Set() | ||
| let checked = 0 | ||
| let pending = total | ||
| let dirty = false | ||
| const start = performance.now() | ||
| const delayS = delayMs / 1000 | ||
|
|
||
| const queue: any[] = [] | ||
| const waiters: ((item: any) => void)[] = [] | ||
|
|
||
| function enqueue(item: any) { | ||
| if (waiters.length > 0) { | ||
| waiters.shift()!(item) | ||
| } else { | ||
| queue.push(item) | ||
| } | ||
| } | ||
|
|
||
| function dequeue() { | ||
| if (queue.length > 0) return Promise.resolve(queue.shift()) | ||
| return new Promise(r => waiters.push(r)) | ||
| } | ||
|
|
||
| for (const row of rows) { | ||
| enqueue({ row, attempt: 0, method: 'HEAD' }) | ||
| } | ||
|
|
||
| function flushLive() { | ||
| if (!liveOutput) return | ||
| const remaining = rows.filter(r => !aliveUrls.has(r.url || '')) | ||
| save(remaining, liveOutput) | ||
| } | ||
|
|
||
| function markResolved(row: any, isDead: boolean, reason: string) { | ||
| checked++ | ||
| pending-- | ||
| if (checked % 500 === 0 || checked === total) { | ||
| const elapsed = (performance.now() - start) / 1000 | ||
| console.log(' ' + fmtProgress(checked, total, dead.length, elapsed)) | ||
| } | ||
| if (isDead) { | ||
| const entry = { ...row } | ||
| delete entry._reason | ||
| entry._reason = reason | ||
| dead.push(entry) | ||
| } else { | ||
| aliveUrls.add(row.url || '') | ||
| dirty = true | ||
| } | ||
| if (pending === 0) { | ||
| for (let i = 0; i < concurrency; i++) enqueue(null) | ||
| } | ||
| } | ||
|
|
||
| function requeueLater(item: any, delay: number) { | ||
| setTimeout(() => enqueue(item), delay * 1000) | ||
| } | ||
|
|
||
| const heartbeat = setInterval(() => { | ||
| const elapsed = (performance.now() - start) / 1000 | ||
| console.log(' ' + fmtProgress(checked, total, dead.length, elapsed)) | ||
| if (dirty) { | ||
| flushLive() | ||
| dirty = false | ||
| } | ||
| }, 15_000) | ||
|
|
||
| async function worker() { | ||
| while (true) { | ||
| const item = await dequeue() | ||
| if (item === null) return | ||
|
|
||
| const { row, attempt, method } = item | ||
| const url = (row.url || '').trim() | ||
|
|
||
| if (!url) { | ||
| markResolved(row, true, 'empty url') | ||
| continue | ||
| } | ||
|
|
||
| const domain = getDomain(url) | ||
| await waitForDomain(domain) | ||
|
|
||
| let result | ||
| try { | ||
| result = await fetchUrl(url, method, timeout) | ||
| if (delayS > 0) await new Promise(r => setTimeout(r, delayMs)) | ||
| } catch (err: Error | any) { | ||
| const msg = err.name === 'AbortError' ? 'timeout' : String(err.cause?.code || err.message || err) | ||
| if (msg.includes('EMFILE') && attempt < MAX_RETRIES) { | ||
| requeueLater({ row, attempt: attempt + 1, method }, 1.0 + Math.random()) | ||
| continue | ||
| } | ||
| markResolved(row, true, msg) | ||
| continue | ||
| } | ||
|
|
||
| const { status, contentType, retryAfter } = result | ||
|
|
||
| if (status === 429) { | ||
| const wait = retryAfter ? parseFloat(retryAfter) : RETRY_BASE * (2 ** attempt) | ||
| const cappedWait = Math.min(wait, 60) + Math.random() * 2 | ||
| setDomainBackoff(domain, cappedWait) | ||
| if (attempt >= MAX_RETRIES) { | ||
| markResolved(row, true, 'HTTP 429 (gave up after retries)') | ||
| } else { | ||
| requeueLater({ row, attempt: attempt + 1, method }, cappedWait) | ||
| } | ||
| continue | ||
| } | ||
|
|
||
| if (status === 405 && method === 'HEAD') { | ||
| enqueue({ row, attempt, method: 'GET' }) | ||
| continue | ||
| } | ||
|
|
||
| if (!(status >= 200 && status < 300)) { | ||
| markResolved(row, true, `HTTP ${status}`) | ||
| continue | ||
| } | ||
|
|
||
| if (contentType && !contentType.startsWith('image/')) { | ||
| if (contentType.includes('octet-stream')) { | ||
| const probeResult = await probe(url).catch(() => null) | ||
| if (probeResult) { | ||
| markResolved(row, false, 'ok') | ||
| } else { | ||
| markResolved(row, true, `bad content-type: ${contentType} (not an image)`) | ||
| } | ||
| continue | ||
| } | ||
| markResolved(row, true, `bad content-type: ${contentType}`) | ||
| continue | ||
| } | ||
|
|
||
| markResolved(row, false, 'ok') | ||
| } | ||
| } | ||
|
|
||
| await Promise.all(Array.from({ length: concurrency }, () => worker())) | ||
| clearInterval(heartbeat) | ||
|
|
||
| flushLive() | ||
| const elapsed = (performance.now() - start) / 1000 | ||
| console.log(` Finished in ${(elapsed / 60).toFixed(1)}min — ${dead.length}/${total} still dead`) | ||
| return dead | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // CLI | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| async function main() { | ||
| const { values } = parseArgs({ | ||
| options: { | ||
| concurrency: { type: 'string', default: String(DEFAULT_CONCURRENCY) }, | ||
| timeout: { type: 'string', default: String(DEFAULT_TIMEOUT / 1000) }, | ||
| delay: { type: 'string', default: String(DEFAULT_DELAY_MS) }, | ||
| output: { type: 'string' }, | ||
| recheck: { type: 'string' }, | ||
| loop: { type: 'boolean', default: false }, | ||
| }, | ||
| strict: true, | ||
| }) | ||
|
|
||
| const concurrency = parseInt(values.concurrency, 10) | ||
| const timeout = parseInt(values.timeout, 10) * 1000 | ||
| const delayMs = parseInt(values.delay, 10) | ||
|
|
||
| if (values.loop && !values.recheck) { | ||
| console.error('Error: --loop requires --recheck') | ||
| process.exit(1) | ||
| } | ||
|
|
||
| let rows | ||
| let output | ||
|
|
||
| if (values.recheck) { | ||
| const recheckPath = resolve(values.recheck) | ||
| rows = loadJson(recheckPath) | ||
| output = values.output ? resolve(values.output) : recheckPath | ||
| console.log(`Rechecking ${recheckPath} (${rows.length} entries)`) | ||
| } else { | ||
| rows = await loadCsv(CSV_PATH) | ||
| output = values.output ? resolve(values.output) : DEFAULT_OUTPUT | ||
| } | ||
|
|
||
| const liveOutput = values.recheck ? output : null | ||
|
|
||
| if (values.loop) { | ||
| for (let iteration = 1; iteration < 1000; iteration++) { | ||
| console.log(`\n--- Iteration ${iteration} (${rows.length} entries) ---`) | ||
| const dead = await checkAll(rows, concurrency, timeout, delayMs, liveOutput || `${output}.iter${iteration}.json`) | ||
| printSummary(dead) | ||
| if (!values.recheck) save(dead, output) | ||
| console.log(` Saved ${dead.length} entries to ${output}`) | ||
|
|
||
| const cleared = rows.length - dead.length | ||
| if (cleared === 0) { | ||
| console.log(`\nStabilized after ${iteration} iteration(s). ${dead.length} confirmed dead.`) | ||
| break | ||
| } | ||
| console.log(` Cleared ${cleared} URLs this pass — running again...`) | ||
| rows = dead | ||
| } | ||
| } else { | ||
| const dead = await checkAll(rows, concurrency, timeout, delayMs, liveOutput || `${output}.iter1.json`) | ||
| printSummary(dead) | ||
| if (!values.recheck) save(dead, output) | ||
| console.log(`\nWritten to ${output}`) | ||
| } | ||
| } | ||
|
|
||
// Exported for the test suite (tests/commands/db).
export { checkAll }

// Run main() only when executed directly, not when imported by tests.
// NOTE(review): `__filename` assumes a CommonJS build; an ESM build would
// need fileURLToPath(import.meta.url) — confirm against the repo's tsconfig.
const isDirectRun = process.argv[1] && resolve(process.argv[1]) === resolve(__filename)
if (isDirectRun) main().catch(console.error)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.