Skip to content

Commit 5e7bc96

Browse files
authored
Merge pull request #55 from d-zero-dev/claude/improve-crawl-abort-handling-q0Wwo
Implement graceful crawl abort via AbortController
2 parents 81c9f6f + f57eb1a commit 5e7bc96

5 files changed

Lines changed: 114 additions & 15 deletions

File tree

ARCHITECTURE.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,23 @@ URL を deal() で受け取り:
217217
- スクレイピングはインプロセス(`@d-zero/beholder`)で実行。各 URL ごとにブラウザを起動・終了
218218
- `push()` で発見した新 URL を動的にキューに追加
219219
- `onPush` コールバックで `withoutHashAndAuth` による重複排除
220+
- `signal` オプションで `AbortSignal` を渡し、中断時に新規ワーカーの起動を停止
221+
222+
### クロール中断メカニズム
223+
224+
```
225+
CLI シグナルハンドラ(SIGINT / SIGHUP 等)
226+
→ CrawlerOrchestrator.abort()
227+
→ Crawler.abort()
228+
→ AbortController.abort()
229+
→ deal() の signal オプション経由で新規ワーカー起動を停止
230+
→ 実行中のワーカーは正常完了まで継続
231+
→ 全ワーカー完了後 deal() が resolve → crawlEnd イベント emit
232+
```
233+
234+
- `Crawler` は内部に `AbortController` を保持し、`signal` getter で `AbortSignal` を公開
235+
- `CrawlerOrchestrator` のコンストラクタで `archive` の `error` イベントを監視し、アーカイブエラー発生時にも `Crawler.abort()` を呼び出す
236+
- CLI の `killed()` ハンドラでは `abort()` 後に `garbageCollect()`(ゾンビ Chromium プロセスの終了)→ `process.exit()` を実行
220237

221238
### 主要定数
222239

packages/@nitpicker/cli/src/commands/crawl.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,10 @@ type LogType = 'verbose' | 'normal' | 'silent';
139139
/**
140140
* Sets up signal handlers for graceful shutdown and starts event logging.
141141
*
142-
* Registers SIGINT/SIGBREAK/SIGHUP/SIGABRT handlers that kill zombie
143-
* Chromium processes before exiting, then delegates to {@link eventAssignments}
144-
* for progress output.
142+
* Registers SIGINT/SIGBREAK/SIGHUP/SIGABRT handlers that abort the
143+
* crawl via {@link CrawlerOrchestrator.abort}, then kill zombie Chromium
144+
* processes and exit. The abort signal propagates through the dealer's
145+
* AbortSignal mechanism so no new workers are launched.
145146
* @param trigger - Display label for the crawl (URL or stub file path)
146147
* @param orchestrator - The initialized CrawlerOrchestrator instance
147148
* @param config - The resolved archive configuration
@@ -155,6 +156,7 @@ function run(
155156
logType: LogType,
156157
) {
157158
const killed = () => {
159+
orchestrator.abort();
158160
orchestrator.garbageCollect();
159161
process.exit();
160162
};

packages/@nitpicker/crawler/src/crawler-orchestrator.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,14 +162,14 @@ export class CrawlerOrchestrator extends EventEmitter<CrawlEvent> {
162162
}
163163

164164
/**
165-
* Abort the current crawl and archive operations.
165+
* Abort the current crawl operation.
166166
*
167-
* Delegates to the archive's abort method, which stops all in-progress
168-
* database writes and cleans up temporary resources.
169-
* @returns The result of the archive abort operation.
167+
* Delegates to the crawler's AbortController so that the dealer stops
168+
* launching new workers. Currently running workers will finish, after
169+
* which `deal()` resolves and `crawlEnd` is emitted normally.
170170
*/
171171
abort() {
172-
return this.#archive.abort();
172+
this.#crawler.abort();
173173
}
174174

175175
/**

packages/@nitpicker/crawler/src/crawler/crawler.spec.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,75 @@ describe('Crawler', () => {
186186
});
187187
});
188188

189+
describe('abort()', () => {
190+
it('abort() 後に deal() の signal オプションに渡された AbortSignal が aborted になる', async () => {
191+
const { deal } = await import('@d-zero/dealer');
192+
const { default: Crawler } = await import('./crawler.js');
193+
194+
let receivedSignal: AbortSignal | undefined;
195+
196+
vi.mocked(deal).mockImplementation((_items, _factory, options) => {
197+
receivedSignal = options?.signal;
198+
return Promise.resolve();
199+
});
200+
201+
const crawler = new Crawler(defaultOptions);
202+
crawler.start(parseUrl('https://example.com/')!);
203+
204+
await vi.waitFor(() => {
205+
expect(receivedSignal).toBeDefined();
206+
});
207+
208+
expect(receivedSignal!.aborted).toBe(false);
209+
crawler.abort();
210+
expect(receivedSignal!.aborted).toBe(true);
211+
});
212+
213+
it('deal 正常完了時に crawlEnd イベントが emit される', async () => {
214+
const { deal } = await import('@d-zero/dealer');
215+
const { default: Crawler } = await import('./crawler.js');
216+
217+
vi.mocked(deal).mockImplementation((_items, _factory, options) => {
218+
// Simulate: abort is called, deal checks signal and resolves normally
219+
expect(options?.signal).toBeInstanceOf(AbortSignal);
220+
return Promise.resolve();
221+
});
222+
223+
const crawler = new Crawler(defaultOptions);
224+
let crawlEndEmitted = false;
225+
crawler.on('crawlEnd', () => {
226+
crawlEndEmitted = true;
227+
});
228+
229+
crawler.start(parseUrl('https://example.com/')!);
230+
231+
await vi.waitFor(() => {
232+
expect(crawlEndEmitted).toBe(true);
233+
});
234+
});
235+
236+
it('二重 abort でもエラーにならない', async () => {
237+
const { deal } = await import('@d-zero/dealer');
238+
const { default: Crawler } = await import('./crawler.js');
239+
240+
vi.mocked(deal).mockResolvedValue();
241+
242+
const crawler = new Crawler(defaultOptions);
243+
crawler.start(parseUrl('https://example.com/')!);
244+
245+
crawler.abort();
246+
expect(() => crawler.abort()).not.toThrow();
247+
expect(crawler.signal.aborted).toBe(true);
248+
});
249+
250+
it('signal getter が AbortSignal を返す', async () => {
251+
const { default: Crawler } = await import('./crawler.js');
252+
const crawler = new Crawler(defaultOptions);
253+
expect(crawler.signal).toBeInstanceOf(AbortSignal);
254+
expect(crawler.signal.aborted).toBe(false);
255+
});
256+
});
257+
189258
describe('worker-level error handling', () => {
190259
it('ワーカー内の例外が error イベントとして emit され処理が継続する', async () => {
191260
const { deal } = await import('@d-zero/dealer');

packages/@nitpicker/crawler/src/crawler/crawler.ts

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ export type { CrawlerOptions } from './types.js';
5151
* configurable parallelism up to {@link Crawler.MAX_PROCESS_LENGTH}.
5252
*/
5353
export default class Crawler extends EventEmitter<CrawlerEventTypes> {
54-
/** Flag set by `abort()` to signal in-progress tasks to exit early. */
55-
#aborted = false;
54+
/** Controller used to cancel the deal-based crawl via its AbortSignal. */
55+
readonly #abortController = new AbortController();
5656
/** Tracks discovered URLs, their scrape status, and deduplication. */
5757
readonly #linkList = new LinkList();
5858
/** Merged crawler configuration (user overrides + defaults). */
@@ -69,6 +69,16 @@ export default class Crawler extends EventEmitter<CrawlerEventTypes> {
6969
/** Maps hostnames to their scope URLs. Defines the crawl boundary for internal/external classification. */
7070
readonly #scope = new Map<string /* hostname */, ExURL[]>();
7171

72+
/**
73+
* The AbortSignal associated with this crawler's AbortController.
74+
*
75+
* Passed to `deal()` so that it stops launching new workers after abort.
76+
* Also available to the orchestrator for forwarding to other subsystems.
77+
*/
78+
get signal(): AbortSignal {
79+
return this.#abortController.signal;
80+
}
81+
7282
/**
7383
* Create a new Crawler instance.
7484
* @param options - Configuration options for crawling behavior. All fields have
@@ -113,12 +123,13 @@ export default class Crawler extends EventEmitter<CrawlerEventTypes> {
113123
/**
114124
* Abort the current crawl operation.
115125
*
116-
* Sets the aborted flag and immediately emits a `crawlEnd` event.
117-
* In-progress scrape tasks will check the flag and exit early.
126+
* Signals the AbortController so that the dealer stops launching new
127+
* workers. Currently running workers will finish, after which `deal()`
128+
* resolves and `crawlEnd` is emitted by the normal completion path in
129+
* {@link #runDeal}.
118130
*/
119131
abort() {
120-
this.#aborted = true;
121-
void this.emit('crawlEnd', {});
132+
this.#abortController.abort();
122133
}
123134

124135
/**
@@ -525,7 +536,6 @@ export default class Crawler extends EventEmitter<CrawlerEventTypes> {
525536
this.#linkList.progress(url);
526537

527538
return async () => {
528-
if (this.#aborted) return;
529539
const log = createTimedUpdate(update, this.#options.verbose);
530540

531541
try {
@@ -617,6 +627,7 @@ export default class Crawler extends EventEmitter<CrawlerEventTypes> {
617627
limit: concurrency,
618628
interval: this.#options.interval,
619629
verbose: this.#options.verbose || !process.stdout.isTTY,
630+
signal: this.#abortController.signal,
620631
header: (_progress, done, total, limit) => {
621632
return formatCrawlProgress({
622633
done,

0 commit comments

Comments
 (0)