diff --git a/publisher-scrape-bot/.gitignore b/publisher-scrape-bot/.gitignore new file mode 100644 index 00000000..51ad6b60 --- /dev/null +++ b/publisher-scrape-bot/.gitignore @@ -0,0 +1,4 @@ +.DS_Store +__pycache__/ +*.pyc +.venv/ diff --git a/publisher-scrape-bot/README.md b/publisher-scrape-bot/README.md new file mode 100644 index 00000000..ca0ab640 --- /dev/null +++ b/publisher-scrape-bot/README.md @@ -0,0 +1,103 @@ +# Publisher Scraper Bot + +This bot crawls configured publisher websites and creates Open Library records +using the Open Library Python client (`openlibrary-client` / `olclient`). + +## Repository Layout + +- `import_publisher_books.py`: crawl + parse + import entrypoint +- `publishers/base.py`: parser contract and parsed-book model +- `publishers/__init__.py`: parser registry +- `publishers/artanuji.py`: `artanuji` publisher scraper/parser +- `tests/`: unit tests for payload generation, parser extraction, and CLI guards + +## Supported Publishers + +- `artanuji` + +Additional publishers can be added by creating a new parser module in +`publishers/` and registering it in `publishers/__init__.py`. + +## What The Bot Does + +1. Visits publisher book pages by numeric ID. +2. Extracts metadata (title, author, ISBN, date, pages, category, description, + cover URL when available). +3. Converts extracted metadata into an Open Library create payload. +4. Calls `olclient` Python APIs to create the record on Open Library. +5. Optionally skips books that already exist in Open Library by ISBN. + +The crawler framework supports multiple publishers via the parser registry in +`publishers/__init__.py`. + +## Prerequisites + +- Python 3.10+ (standard library only for this repo) +- Open Library Python client installed (`openlibrary-client`) + +## Setup + +```bash +python3 -m venv .venv +source .venv/bin/activate +``` + +The bot requires `openlibrary-client` to upload/check books. 
+ +Install dependencies from this directory with `pip install -r requirements.txt` (or install `openlibrary-client` directly) +before running non-dry imports or using `--skip-existing-isbn`. + +Run tests: + +```bash +python3 -m unittest discover -s tests -p 'test_*.py' +``` + +## Usage + +Dry run (recommended first): + +```bash +python3 import_publisher_books.py \ + artanuji \ + --start-id 650 \ + --end-id 730 \ + --dry-run +``` + +Create records: + +```bash +python3 import_publisher_books.py \ + artanuji \ + --start-id 650 \ + --end-id 730 \ + --skip-existing-isbn +``` + +## Useful Flags + +- `--sleep-seconds`: request throttling delay +- `--request-timeout`: HTTP timeout +- `--max-books`: cap created records in one run +- `--skip-existing-isbn`: skip records already present in Open Library +- `--dry-run`: print create payloads without writing to Open Library + +## Adding A Publisher + +1. Add `publishers/<publisher>.py`. +2. Implement `parse()` with publisher-specific field extraction. +3. Register it in `publishers/__init__.py` under `PARSERS`. +4. Add parser tests in `tests/test_<publisher>.py`. +5. Run a small ID range with `--dry-run` to validate parsing/output. + +`publishers/<publisher>.py` should implement the `PublisherParser` protocol: + - `page_url(item_id: int) -> str` + - `parse(html: str, item_id: int) -> ParsedBook | None` + +## PR Checklist + +- Run `python3 -m py_compile import_publisher_books.py publishers/*.py`. +- Run `python3 -m unittest discover -s tests -p 'test_*.py'`. +- Run at least one dry-run import command for the target publisher. +- Confirm README examples and flags match the actual CLI behavior. 
diff --git a/publisher-scrape-bot/import_publisher_books.py b/publisher-scrape-bot/import_publisher_books.py new file mode 100755 index 00000000..0e7670ae --- /dev/null +++ b/publisher-scrape-bot/import_publisher_books.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path +from typing import TYPE_CHECKING, Any +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen + +ROOT_DIR = Path(__file__).resolve().parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + +from publishers import PARSERS + +if TYPE_CHECKING: + from publishers.base import ParsedBook + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Crawl publisher site and create books via openlibrary-client." + ) + parser.add_argument( + "publisher", + choices=sorted(PARSERS.keys()), + help="Publisher parser to use (example: artanuji)", + ) + parser.add_argument("--start-id", type=int, default=1, help="Start book id") + parser.add_argument( + "--end-id", type=int, default=5000, help="End book id (inclusive)" + ) + parser.add_argument( + "--sleep-seconds", + type=float, + default=0.4, + help="Delay between requests to avoid overloading publisher site", + ) + parser.add_argument( + "--request-timeout", + type=float, + default=20.0, + help="HTTP timeout in seconds", + ) + parser.add_argument( + "--skip-existing-isbn", + action="store_true", + help="Check Open Library by ISBN and skip if a book already exists", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Crawl and parse only; print create payloads without creating", + ) + parser.add_argument( + "--max-books", + type=int, + default=0, + help="Stop after creating this many books (0 = unlimited)", + ) + return parser.parse_args() + + +def validate_args(args: argparse.Namespace) -> str | None: + if args.start_id < 1: + return 
"--start-id must be >= 1" + if args.end_id < args.start_id: + return "--end-id must be >= --start-id" + if args.sleep_seconds < 0: + return "--sleep-seconds must be >= 0" + if args.request_timeout <= 0: + return "--request-timeout must be > 0" + if args.max_books < 0: + return "--max-books must be >= 0" + return None + + +def fetch_html(url: str, timeout: float) -> str | None: + req = Request( + url, + headers={ + "User-Agent": "OpenGeoLibraryBot/0.1 (+https://openlibrary.org)", + "Accept-Language": "en-US,en;q=0.9,ka;q=0.8", + }, + ) + try: + with urlopen(req, timeout=timeout) as response: + content_type = response.headers.get_content_charset() or "utf-8" + return response.read().decode(content_type, errors="replace") + except HTTPError as exc: + if exc.code in (403, 404): + return None + raise + except URLError: + return None + + +def load_ol_client() -> tuple[Any, Any]: + from olclient import common as ol_common + from olclient.openlibrary import OpenLibrary + + return OpenLibrary, ol_common + + +def parsed_book_to_ol_book(book: ParsedBook, ol_common: Any) -> Any: + identifiers = {} + if book.isbn_13: + identifiers["isbn_13"] = [book.isbn_13] + if book.isbn_10: + identifiers["isbn_10"] = [book.isbn_10] + + return ol_common.Book( + title=book.title, + authors=[ol_common.Author(name=book.author)], + publisher=book.publisher, + publish_date=book.publish_date, + identifiers=identifiers, + number_of_pages=book.number_of_pages, + description=book.description, + subject=book.subject, + cover=book.cover_url, + ) + + +def run_ol_create( + ol: Any, ol_common: Any, book: ParsedBook, dry_run: bool +) -> tuple[bool, str]: + payload_json = json.dumps(book.to_openlibrary_create_payload(), ensure_ascii=False) + if dry_run: + return True, f"DRY RUN: {payload_json}" + + try: + created = ol.create_book(parsed_book_to_ol_book(book, ol_common)) + created_olid = getattr(created, "olid", "") + if created_olid: + return True, f"created={created_olid}" + return True, "created" + except 
Exception as exc: + return False, str(exc) + + +def ol_book_exists_by_isbn(ol: Any, isbn: str) -> bool: + try: + edition = ol.Edition.get(isbn=isbn) + except Exception: + return False + return edition is not None + + +def process_book( + book: ParsedBook, + args: argparse.Namespace, + ol: Any, + ol_common: Any, +) -> tuple[str, str]: + payload = book.to_openlibrary_create_payload() + + identifiers = payload.get("identifiers", {}) + isbn = "" + if identifiers.get("isbn_13"): + isbn = identifiers["isbn_13"][0] + elif identifiers.get("isbn_10"): + isbn = identifiers["isbn_10"][0] + + if args.skip_existing_isbn and isbn and ol_book_exists_by_isbn(ol, isbn): + return "skipped", f"id={book.source_id} isbn={isbn} already exists" + + ok, output = run_ol_create(ol, ol_common, book, args.dry_run) + if ok: + return "created", f"id={book.source_id} {book.title} -> {output}" + return "failed", f"id={book.source_id} {book.title} -> {output}" + + +def main() -> int: + args = parse_args() + if arg_error := validate_args(args): + print(arg_error, file=sys.stderr) + return 2 + + parser = PARSERS[args.publisher] + ol = None + ol_common = None + + if not args.dry_run or args.skip_existing_isbn: + try: + OpenLibrary, ol_common = load_ol_client() + ol = OpenLibrary() + except Exception as exc: + print(f"Failed to initialize openlibrary-client: {exc}", file=sys.stderr) + return 2 + + created = 0 + skipped = 0 + failed = 0 + misses = 0 + + for item_id in range(args.start_id, args.end_id + 1): + url = parser.page_url(item_id) + html = fetch_html(url, timeout=args.request_timeout) + + if not html: + misses += 1 + continue + + parsed = parser.parse(html, item_id) + if not parsed: + misses += 1 + time.sleep(args.sleep_seconds) + continue + + misses = 0 + + status, detail = process_book(parsed, args, ol, ol_common) + if status == "created": + created += 1 + print(f"OK {detail}") + elif status == "skipped": + skipped += 1 + print(f"SKIP {detail}") + else: + failed += 1 + print(f"FAIL {detail}", 
file=sys.stderr) + + if args.max_books and created >= args.max_books: + print(f"Reached --max-books={args.max_books}; stopping.") + break + + time.sleep(args.sleep_seconds) + + print( + "Summary: " + f"created={created} skipped={skipped} failed={failed} " + f"range={args.start_id}-{args.end_id} misses={misses}" + ) + return 1 if failed > 0 else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/publisher-scrape-bot/publishers/__init__.py b/publisher-scrape-bot/publishers/__init__.py new file mode 100644 index 00000000..6a08c5e2 --- /dev/null +++ b/publisher-scrape-bot/publishers/__init__.py @@ -0,0 +1,7 @@ +"""Publisher parser registry.""" + +from publishers.artanuji import ArtanujiParser + +PARSERS = { + "artanuji": ArtanujiParser(), +} diff --git a/publisher-scrape-bot/publishers/artanuji.py b/publisher-scrape-bot/publishers/artanuji.py new file mode 100644 index 00000000..fe09d593 --- /dev/null +++ b/publisher-scrape-bot/publishers/artanuji.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +import html as ihtml +import re +from urllib.parse import urljoin + +from publishers.base import ParsedBook + + +class ArtanujiParser: + name = "artanuji" + base_url = "https://www.artanuji.ge/book_ge.php?id={id}" + + _STOP_DESCRIPTION_LABELS = ( + "ინგლისურიდან თარგმნა", + "რუსულიდან თარგმნა", + "გერმანულიდან თარგმნა", + "ფრანგულიდან თარგმნა", + "გაზიარება", + "ავტორის წიგნები", + "ამავე კატეგორიაში", + "ყიდვა", + ) + + def page_url(self, item_id: int) -> str: + return self.base_url.format(id=item_id) + + def parse(self, html: str, item_id: int) -> ParsedBook | None: + if "book_ge.php" not in html or "ISBN" not in html: + return None + + title = self._extract_title(html) + author = self._extract_author(html) + isbn_raw = self._extract_field(html, "ISBN") + publish_date = self._extract_field(html, "გამოცემის თარიღი") or "" + pages_raw = self._extract_field(html, "გვერდები") + subject = self._extract_field(html, "კატეგორია") + description = 
self._extract_description(html) + cover_url = self._extract_cover_url(html, item_id) + + if not title or not author or not isbn_raw: + return None + + isbn_10, isbn_13 = self._classify_isbn(isbn_raw) + if not (isbn_10 or isbn_13): + return None + + pages = None + if pages_raw: + page_match = re.search(r"\d+", pages_raw) + pages = int(page_match.group(0)) if page_match else None + + year_match = re.search(r"\b(1[89]\d{2}|20\d{2}|21\d{2})\b", publish_date) + publish_year = year_match.group(1) if year_match else publish_date.strip() + + return ParsedBook( + source=self.name, + source_id=item_id, + source_url=self.page_url(item_id), + title=title, + author=author, + publisher="არტანუჯი", + publish_date=publish_year, + isbn_13=isbn_13, + isbn_10=isbn_10, + number_of_pages=pages, + description=description, + subject=subject, + cover_url=cover_url, + ) + + def _extract_title(self, html: str) -> str | None: + match = re.search(r"]*>(.*?)", html, flags=re.IGNORECASE | re.DOTALL) + if not match: + return None + return self._clean_text(match.group(1)) + + def _extract_author(self, html: str) -> str | None: + # Most pages place author name in

under the title block. + match = re.search(r"]*>(.*?)

", html, flags=re.IGNORECASE | re.DOTALL) + if not match: + return None + return self._clean_text(match.group(1)) + + def _extract_field(self, html: str, label: str) -> str | None: + prefix = f"{label}:" + for line in self._to_text_lines(html): + if line.startswith(prefix): + return self._clean_text(line[len(prefix) :]) + return None + + def _extract_description(self, html: str) -> str | None: + text = self._to_text_lines(html) + if not text: + return None + + start_idx = None + for idx, line in enumerate(text): + if line.startswith("ყიდვა"): + start_idx = idx + 1 + break + if start_idx is None: + start_idx = 0 + + out = [] + for line in text[start_idx:]: + if any(line.startswith(stop) for stop in self._STOP_DESCRIPTION_LABELS): + break + if len(line) >= 35: + out.append(line) + if not out: + return None + return " ".join(out).strip() + + def _extract_cover_url(self, html: str, item_id: int) -> str | None: + og = re.search( + r']+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']', + html, + flags=re.IGNORECASE, + ) + if og: + return urljoin(self.page_url(item_id), self._clean_text(og.group(1))) + + # Fallback to the first image on the page. + image = re.search( + r']+src=["\']([^"\']+)["\']', + html, + flags=re.IGNORECASE, + ) + if image: + return urljoin(self.page_url(item_id), self._clean_text(image.group(1))) + return None + + @staticmethod + def _classify_isbn(raw: str) -> tuple[str | None, str | None]: + digits = re.sub(r"[^0-9Xx]", "", raw).upper() + if len(digits) == 13: + return None, digits + if len(digits) == 10: + return digits, None + # Some pages include both ISBN-10/13 separated by punctuation. 
+ isbn_13 = None + isbn_10 = None + for token in re.findall(r"[0-9Xx-]{10,20}", raw): + value = re.sub(r"[^0-9Xx]", "", token).upper() + if len(value) == 13 and not isbn_13: + isbn_13 = value + elif len(value) == 10 and not isbn_10: + isbn_10 = value + return isbn_10, isbn_13 + + @staticmethod + def _clean_text(value: str) -> str: + value = re.sub(r"<[^>]+>", " ", value) + value = ihtml.unescape(value) + value = re.sub(r"\s+", " ", value) + return value.strip() + + def _to_text_lines(self, html: str) -> list[str]: + html = re.sub( + r"<(script|style)\b[^>]*>.*?", + " ", + html, + flags=re.IGNORECASE | re.DOTALL, + ) + html = re.sub(r"", "\n", html, flags=re.IGNORECASE) + html = re.sub(r"", "\n", html, flags=re.IGNORECASE) + text = re.sub(r"<[^>]+>", " ", html) + text = ihtml.unescape(text) + lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()] + return [line for line in lines if line] diff --git a/publisher-scrape-bot/publishers/base.py b/publisher-scrape-bot/publishers/base.py new file mode 100644 index 00000000..61f69323 --- /dev/null +++ b/publisher-scrape-bot/publishers/base.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Protocol + + +@dataclass +class ParsedBook: + source: str + source_id: int + source_url: str + title: str + author: str + publisher: str + publish_date: str + isbn_13: str | None = None + isbn_10: str | None = None + number_of_pages: int | None = None + description: str | None = None + subject: str | None = None + cover_url: str | None = None + + def to_openlibrary_create_payload(self) -> dict: + identifiers = {} + if self.isbn_13: + identifiers["isbn_13"] = [self.isbn_13] + if self.isbn_10: + identifiers["isbn_10"] = [self.isbn_10] + + payload = { + "title": self.title, + "author": self.author, + "publisher": self.publisher, + "publish_date": self.publish_date, + "identifiers": identifiers, + } + if self.number_of_pages: + payload["number_of_pages"] = 
self.number_of_pages + if self.description: + payload["description"] = self.description + if self.subject: + payload["subject"] = self.subject + if self.cover_url: + payload["cover"] = self.cover_url + return payload + + +class PublisherParser(Protocol): + name: str + base_url: str + + def page_url(self, item_id: int) -> str: ... + + def parse(self, html: str, item_id: int) -> ParsedBook | None: ... diff --git a/publisher-scrape-bot/requirements.txt b/publisher-scrape-bot/requirements.txt new file mode 100644 index 00000000..e155810a --- /dev/null +++ b/publisher-scrape-bot/requirements.txt @@ -0,0 +1 @@ +openlibrary-client==0.0.30 diff --git a/publisher-scrape-bot/tests/test_artanuji.py b/publisher-scrape-bot/tests/test_artanuji.py new file mode 100644 index 00000000..b247be94 --- /dev/null +++ b/publisher-scrape-bot/tests/test_artanuji.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import sys +import unittest +from pathlib import Path + +BOT_DIR = Path(__file__).resolve().parents[1] +if str(BOT_DIR) not in sys.path: + sys.path.insert(0, str(BOT_DIR)) + +from publishers.artanuji import ArtanujiParser + + +class ArtanujiParserTest(unittest.TestCase): + def setUp(self) -> None: + self.parser = ArtanujiParser() + + def test_parse_extracts_expected_fields(self) -> None: + html = """ + + + + + +

ზღვის წიგნი

+

Nino Beridze

+
ISBN: 978-9941-11-222-3
+
გამოცემის თარიღი: 2021
+
გვერდები: 304
+
კატეგორია: პროზა
+

ყიდვა

+

ეს არის ტესტური აღწერა, რომელიც საკმარისად გრძელია ველის შესავსებად.

+

გაზიარება

+ book_ge.php + + + """ + + parsed = self.parser.parse(html, item_id=123) + + self.assertIsNotNone(parsed) + assert parsed is not None + self.assertEqual(parsed.title, "ზღვის წიგნი") + self.assertEqual(parsed.author, "Nino Beridze") + self.assertEqual(parsed.isbn_13, "9789941112223") + self.assertEqual(parsed.publish_date, "2021") + self.assertEqual(parsed.number_of_pages, 304) + self.assertEqual(parsed.subject, "პროზა") + self.assertTrue(parsed.description and "ტესტური აღწერა" in parsed.description) + self.assertEqual(parsed.cover_url, "https://www.artanuji.ge/images/book-1.jpg") + + def test_parse_returns_none_for_non_book_page(self) -> None: + html = "

Home

" + parsed = self.parser.parse(html, item_id=7) + self.assertIsNone(parsed) + + +if __name__ == "__main__": + unittest.main() diff --git a/publisher-scrape-bot/tests/test_base.py b/publisher-scrape-bot/tests/test_base.py new file mode 100644 index 00000000..fe42df59 --- /dev/null +++ b/publisher-scrape-bot/tests/test_base.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import sys +import unittest +from pathlib import Path + +BOT_DIR = Path(__file__).resolve().parents[1] +if str(BOT_DIR) not in sys.path: + sys.path.insert(0, str(BOT_DIR)) + +from publishers.base import ParsedBook + + +class ParsedBookPayloadTest(unittest.TestCase): + def test_payload_includes_optional_fields_when_present(self) -> None: + book = ParsedBook( + source="artanuji", + source_id=1, + source_url="https://example.org/book/1", + title="Example", + author="Jane Doe", + publisher="Example Publisher", + publish_date="2024", + isbn_13="9781234567897", + number_of_pages=288, + description="Desc", + subject="Fiction", + cover_url="https://example.org/cover.jpg", + ) + + payload = book.to_openlibrary_create_payload() + + self.assertEqual(payload["title"], "Example") + self.assertEqual(payload["identifiers"]["isbn_13"], ["9781234567897"]) + self.assertEqual(payload["number_of_pages"], 288) + self.assertEqual(payload["description"], "Desc") + self.assertEqual(payload["subject"], "Fiction") + self.assertEqual(payload["cover"], "https://example.org/cover.jpg") + + def test_payload_omits_optional_fields_when_absent(self) -> None: + book = ParsedBook( + source="artanuji", + source_id=2, + source_url="https://example.org/book/2", + title="Example 2", + author="John Smith", + publisher="Example Publisher", + publish_date="2023", + ) + + payload = book.to_openlibrary_create_payload() + + self.assertEqual(payload["identifiers"], {}) + self.assertNotIn("number_of_pages", payload) + self.assertNotIn("description", payload) + self.assertNotIn("subject", payload) + self.assertNotIn("cover", payload) + 
+ +if __name__ == "__main__": + unittest.main() diff --git a/publisher-scrape-bot/tests/test_import_script.py b/publisher-scrape-bot/tests/test_import_script.py new file mode 100644 index 00000000..3a1f5a78 --- /dev/null +++ b/publisher-scrape-bot/tests/test_import_script.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import argparse +import sys +import unittest +from pathlib import Path +from types import SimpleNamespace + +BOT_DIR = Path(__file__).resolve().parents[1] +if str(BOT_DIR) not in sys.path: + sys.path.insert(0, str(BOT_DIR)) + +from import_publisher_books import ( + parsed_book_to_ol_book, + validate_args, +) +from publishers.base import ParsedBook + + +class ImportScriptTest(unittest.TestCase): + def test_validate_args_accepts_valid_values(self) -> None: + args = argparse.Namespace( + start_id=1, + end_id=10, + sleep_seconds=0.1, + request_timeout=5.0, + max_books=0, + ) + self.assertIsNone(validate_args(args)) + + def test_validate_args_rejects_invalid_range(self) -> None: + args = argparse.Namespace( + start_id=12, + end_id=10, + sleep_seconds=0.1, + request_timeout=5.0, + max_books=0, + ) + self.assertEqual(validate_args(args), "--end-id must be >= --start-id") + + def test_parsed_book_to_ol_book_maps_fields(self) -> None: + captured = {} + + class FakeAuthor: + def __init__(self, name: str) -> None: + self.name = name + + class FakeBook: + def __init__(self, **kwargs) -> None: + captured.update(kwargs) + + fake_common = SimpleNamespace(Author=FakeAuthor, Book=FakeBook) + parsed = ParsedBook( + source="publisher", + source_id=5, + source_url="https://example.org/book/5", + title="Example Title", + author="Jane Doe", + publisher="Example Press", + publish_date="2025", + isbn_13="9781234567897", + number_of_pages=210, + description="Example description", + subject="Example subject", + cover_url="https://example.org/cover.jpg", + ) + + parsed_book_to_ol_book(parsed, fake_common) + + self.assertEqual(captured["title"], "Example Title") + 
self.assertEqual(captured["authors"][0].name, "Jane Doe") + self.assertEqual(captured["publisher"], "Example Press") + self.assertEqual(captured["publish_date"], "2025") + self.assertEqual(captured["identifiers"]["isbn_13"], ["9781234567897"]) + self.assertEqual(captured["number_of_pages"], 210) + self.assertEqual(captured["description"], "Example description") + self.assertEqual(captured["subject"], "Example subject") + self.assertEqual(captured["cover"], "https://example.org/cover.jpg") + + +if __name__ == "__main__": + unittest.main()