Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from unittest import TestCase
from unittest.mock import patch

import requests
from bs4 import BeautifulSoup
from welearn_database.data.models import WeLearnDocument

Expand Down Expand Up @@ -139,7 +140,6 @@ def test_run(self, mock_get_page):

@patch("welearn_datastack.plugins.scrapers.ird_le_mag.IRDLeMagCollector._get_page")
def test_run_request_exception(self, mock_get_page):
import requests

mock_get_page.side_effect = requests.exceptions.RequestException(
"Network error"
Expand Down
61 changes: 61 additions & 0 deletions tests/test_scraping_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from unittest import TestCase

from welearn_datastack.utils_.scraping_utils import (
add_space_after_closing_sign,
add_space_before_capital_letter,
clean_return_to_line,
format_cc_license,
remove_extra_whitespace,
remove_html_stuff,
)


class TestScrapingUtils(TestCase):
    """Unit tests for the plain-text cleaning helpers in scraping_utils."""

    def test_remove_extra_whitespace(self):
        # NOTE(review): the fixture may have lost its repeated spaces in a
        # copy/paste — as written the input is already clean; confirm
        # against the repository.
        result = remove_extra_whitespace("Lorem ipsum")
        self.assertEqual(result, "Lorem ipsum")

    def test_remove_html_stuff(self):
        # Tags are stripped and HTML entities such as &nbsp; are unescaped.
        result = remove_html_stuff("<p>Lorem&nbsp;ipsum</p>")
        self.assertEqual(result, "Lorem ipsum\n")

    def test_format_cc_license(self):
        # A compact CC code expands to the canonical license URL.
        result = format_cc_license("CC-BY-SA-4.0")
        self.assertEqual(result, "https://creativecommons.org/licenses/by-sa/4.0/")

    def test_clean_return_to_line(self):
        # NOTE(review): the original fixture used two adjacent string
        # literals ("Lorem." "Ipsum"), which concatenate to "Lorem.Ipsum";
        # a "\n" between the fragments was probably intended — confirm.
        result = clean_return_to_line("Lorem.Ipsum")
        self.assertEqual(result, "Lorem.Ipsum")

    def test_add_space_after_closing_sign_point(self):
        result = add_space_after_closing_sign("Lorem.Ipsum")
        self.assertEqual(result, "Lorem. Ipsum")

    def test_add_space_after_closing_sign_closing_quote(self):
        result = add_space_after_closing_sign("Lorem»Ipsum")
        self.assertEqual(result, "Lorem» Ipsum")

    def test_add_space_after_closing_sign_open_quote(self):
        # An opening guillemet directly before a word must stay untouched.
        result = add_space_after_closing_sign("«Lorem Ipsum»")
        self.assertEqual(result, "«Lorem Ipsum»")

    def test_add_space_before_capital_letter(self):
        result = add_space_before_capital_letter("LoremIpsum")
        self.assertEqual(result, "Lorem Ipsum")
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from welearn_datastack.modules.retrieve_data_from_files import retrieve_ids_from_csv
from welearn_datastack.utils_.database_utils import create_db_session
from welearn_datastack.utils_.path_utils import setup_local_path
from welearn_datastack.utils_.virtual_environement_utils import load_dotenv_local

log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
log_format: str = os.getenv(
Expand Down Expand Up @@ -134,4 +135,5 @@ def main() -> None:


if __name__ == "__main__":
    # Load the local .env file so environment-based settings are in place
    # before the pipeline entry point runs.
    load_dotenv_local()
    main()
Comment on lines 137 to 139
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from welearn_datastack.modules.retrieve_data_from_files import retrieve_ids_from_csv
from welearn_datastack.utils_.database_utils import create_db_session
from welearn_datastack.utils_.path_utils import setup_local_path
from welearn_datastack.utils_.virtual_environement_utils import load_dotenv_local

log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
log_format: str = os.getenv(
Expand Down Expand Up @@ -230,4 +231,5 @@ def main() -> None:


if __name__ == "__main__":
    # Load the local .env file so environment-based settings are in place
    # before the pipeline entry point runs.
    load_dotenv_local()
    main()
Comment on lines 233 to 235
42 changes: 39 additions & 3 deletions welearn_datastack/plugins/scrapers/ird_le_mag.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import datetime
import json
import logging
import os
import re
import time
from typing import List

import pydantic
Expand All @@ -23,6 +25,12 @@
get_http_code_from_exception,
get_new_https_session,
)
from welearn_datastack.utils_.scraping_utils import (
add_space_after_closing_sign,
add_space_before_capital_letter,
clean_return_to_line,
clean_text,
)

logger = logging.getLogger(__name__)

Expand All @@ -37,6 +45,8 @@ class IRDLeMagCollector(IPluginScrapeCollector):

def __init__(self):
    """Configure the inter-request delays (in seconds) from the environment."""
    super().__init__()
    # Pause observed between two consecutive page fetches.
    self.page_delay = int(os.getenv("PAGE_DELAY", "2"))
    # Longer pause applied once per batch of pages.
    self.batch_delay = int(os.getenv("BATCH_DELAY", "10"))

@staticmethod
def _get_page(url: str) -> str:
Expand Down Expand Up @@ -130,18 +140,44 @@ def _extract_description(soup: BeautifulSoup) -> str:
raise NoDescriptionFoundError from e
return desc

@staticmethod
def correct_text_syntax(content: str) -> str:
    """
    Normalize raw page text: strip HTML leftovers, drop stray line breaks,
    then re-insert the spaces the source markup omitted after closing signs
    and before capital letters.

    :param content: the content of the page as a string
    :return: the content of the page with the correct syntax
    """
    cleaned = clean_text(content)
    cleaned = clean_return_to_line(cleaned)
    cleaned = add_space_after_closing_sign(cleaned)
    return add_space_before_capital_letter(cleaned)

def run(self, documents: list[WeLearnDocument]) -> list[WrapperRetrieveDocument]:
logger.info("Running IRDLeMagCollector plugin")
ret: List[WrapperRetrieveDocument] = []
for document in documents:
for i, document in enumerate(documents):
if i > 0:
logger.info(
f"Waiting for {self.page_delay} seconds before scraping the next page to avoid being blocked by the server",
)
time.sleep(self.page_delay)
if i % 10 == 0:
logger.info(
f"Waiting for {self.batch_delay - self.page_delay} seconds before scraping the next batch of pages to avoid being blocked by the server",
)
time.sleep(self.batch_delay - self.page_delay)
try:
page = self._get_page(document.url)
soup = BeautifulSoup(page, "html.parser")
if not page:
raise NoContent
document.full_content = self._extract_content(page)
document.full_content = self.correct_text_syntax(
self._extract_content(page)
)
document.title = self._extract_title(soup)
document.description = self._extract_description(soup)
document.description = self.correct_text_syntax(
self._extract_description(soup)
)
document.details = {
"authors": self._extract_authors(soup),
"type": "article",
Expand Down
53 changes: 23 additions & 30 deletions welearn_datastack/utils_/scraping_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import re
import unicodedata
from html import unescape
from html.parser import HTMLParser

Expand Down Expand Up @@ -70,36 +71,6 @@ def format_cc_license(license: str) -> str:
)


def get_url_license_from_dc_format(soup: "BeautifulSoup") -> str:
    """
    Extract the license of the document from the DC.rights meta tag.

    :param soup: BeautifulSoup object of the document.
    :return: the license normalized to a canonical URL when it is a
        Creative Commons or OpenEdition Books license, otherwise the
        meta-tag content returned verbatim.
    """
    soup_license = soup.find("meta", {"name": "DC.rights"})
    # Named license_text so the builtin `license` is not shadowed.
    license_text = soup_license["content"]  # type: ignore

    # Both the English and the French spelling of the OpenEdition Books
    # license are recognized, independent of word order and capitalization.
    en_license_name = sorted("OpenEdition Books License".lower().split())
    fr_license_name = sorted("Licence OpenEdition Books".lower().split())
    other_known_licenses = [en_license_name, fr_license_name]

    license_split_n_sort = sorted(license_text.lower().split())

    if license_text.startswith("Creative Commons"):
        # It's a CC license, e.g.
        # "Creative Commons Attribution - CC BY 4.0" -> code "CC BY 4.0"
        full_cc_code = license_text.split(" - ")[-1].strip()
        rights_code = full_cc_code.split(" ")[1].strip()
        version = full_cc_code.split(" ")[2].strip()
        return (
            f"https://creativecommons.org/licenses/"
            f"{rights_code.lower()}/{version.lower()}/"
        )
    elif license_split_n_sort in other_known_licenses:
        return "https://www.openedition.org/12554"
    return license_text


def extract_property_from_html(
soup_find: Tag | NavigableString | None,
mandatory: bool = True,
Expand Down Expand Up @@ -156,6 +127,28 @@ def clean_text(content: str) -> str:
return remove_extra_whitespace(remove_html_stuff(content)).strip()


def add_space_after_closing_sign(string: str) -> str:
    """
    Add a space after a closing sign if there is not already one.

    A period sitting between two digits is treated as a decimal separator
    (e.g. "3.14", "4.0") and left untouched, so numbers and version strings
    in scraped text are not mangled.

    Args:
        string (str): the string to clean
    Returns:
        str: the cleaned string
    """
    closing_sign_pattern = re.compile(r"([.»\")\]}])(?=[^\s.,;:!?)»\]}])")

    def _insert_space(match: "re.Match[str]") -> str:
        sign = match.group(1)
        start = match.start(1)
        # Preserve decimal numbers: digit '.' digit is not sentence punctuation.
        if (
            sign == "."
            and start > 0
            and string[start - 1].isdigit()
            and string[match.end(1)].isdigit()  # lookahead guarantees this index exists
        ):
            return sign
        return sign + " "

    return closing_sign_pattern.sub(_insert_space, string)


def add_space_before_capital_letter(string: str) -> str:
    """
    Add a space before a capital letter if there is not already one.

    Handles both ASCII and French accented letters on either side of the
    boundary.

    Args:
        string (str): the string to clean
    Returns:
        str: the cleaned string
    """
    lower_then_upper = re.compile(
        r"([a-zàâäéèêëîïôöùûüÿç])([A-ZÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ])"
    )
    return lower_then_upper.sub(lambda m: f"{m.group(1)} {m.group(2)}", string)

Comment on lines +130 to +150

def get_url_without_hal_like_versionning(url: str) -> str:
"""
Get the URL without the versionning part
Expand Down
Loading