Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from unittest import TestCase
from unittest.mock import patch

import requests
from bs4 import BeautifulSoup
from welearn_database.data.models import WeLearnDocument

Expand Down Expand Up @@ -139,7 +140,6 @@ def test_run(self, mock_get_page):

@patch("welearn_datastack.plugins.scrapers.ird_le_mag.IRDLeMagCollector._get_page")
def test_run_request_exception(self, mock_get_page):
import requests

mock_get_page.side_effect = requests.exceptions.RequestException(
"Network error"
Expand Down
61 changes: 61 additions & 0 deletions tests/test_scraping_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from unittest import TestCase

from welearn_datastack.utils_.scraping_utils import (
add_space_after_closing_sign,
add_space_before_capital_letter,
clean_return_to_line,
format_cc_license,
remove_extra_whitespace,
remove_html_stuff,
)


class TestScrapingUtils(TestCase):
    """Unit tests for the plain-text cleaning helpers in scraping_utils."""

    def test_remove_extra_whitespace(self):
        # NOTE(review): the fixture may have lost its repeated spaces in a
        # copy/paste — as written the input is already clean; confirm
        # against the repository.
        result = remove_extra_whitespace("Lorem ipsum")
        self.assertEqual(result, "Lorem ipsum")

    def test_remove_html_stuff(self):
        # Tags are stripped and HTML entities such as &nbsp; are unescaped.
        result = remove_html_stuff("<p>Lorem&nbsp;ipsum</p>")
        self.assertEqual(result, "Lorem ipsum\n")

    def test_format_cc_license(self):
        # A compact CC code expands to the canonical license URL.
        result = format_cc_license("CC-BY-SA-4.0")
        self.assertEqual(result, "https://creativecommons.org/licenses/by-sa/4.0/")

    def test_clean_return_to_line(self):
        # NOTE(review): the original fixture used two adjacent string
        # literals ("Lorem." "Ipsum"), which concatenate to "Lorem.Ipsum";
        # a "\n" between the fragments was probably intended — confirm.
        result = clean_return_to_line("Lorem.Ipsum")
        self.assertEqual(result, "Lorem.Ipsum")

    def test_add_space_after_closing_sign_point(self):
        result = add_space_after_closing_sign("Lorem.Ipsum")
        self.assertEqual(result, "Lorem. Ipsum")

    def test_add_space_after_closing_sign_closing_quote(self):
        result = add_space_after_closing_sign("Lorem»Ipsum")
        self.assertEqual(result, "Lorem» Ipsum")

    def test_add_space_after_closing_sign_open_quote(self):
        # An opening guillemet directly before a word must stay untouched.
        result = add_space_after_closing_sign("«Lorem Ipsum»")
        self.assertEqual(result, "«Lorem Ipsum»")

    def test_add_space_before_capital_letter(self):
        result = add_space_before_capital_letter("LoremIpsum")
        self.assertEqual(result, "Lorem Ipsum")
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from welearn_datastack.modules.retrieve_data_from_files import retrieve_ids_from_csv
from welearn_datastack.utils_.database_utils import create_db_session
from welearn_datastack.utils_.path_utils import setup_local_path
from welearn_datastack.utils_.virtual_environement_utils import load_dotenv_local

log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
log_format: str = os.getenv(
Expand Down Expand Up @@ -134,4 +135,5 @@ def main() -> None:


if __name__ == "__main__":
    # Load the local .env file so environment-based settings are in place
    # before the pipeline entry point runs.
    load_dotenv_local()
    main()
Comment on lines 137 to 139
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from welearn_datastack.modules.retrieve_data_from_files import retrieve_ids_from_csv
from welearn_datastack.utils_.database_utils import create_db_session
from welearn_datastack.utils_.path_utils import setup_local_path
from welearn_datastack.utils_.virtual_environement_utils import load_dotenv_local

log_level: int = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO"))
log_format: str = os.getenv(
Expand Down Expand Up @@ -230,4 +231,5 @@ def main() -> None:


if __name__ == "__main__":
    # Load the local .env file so environment-based settings are in place
    # before the pipeline entry point runs.
    load_dotenv_local()
    main()
Comment on lines 233 to 235
42 changes: 39 additions & 3 deletions welearn_datastack/plugins/scrapers/ird_le_mag.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import datetime
import json
import logging
import os
import re
import time
from typing import List

import pydantic
Expand All @@ -23,6 +25,12 @@
get_http_code_from_exception,
get_new_https_session,
)
from welearn_datastack.utils_.scraping_utils import (
add_space_after_closing_sign,
add_space_before_capital_letter,
clean_return_to_line,
clean_text,
)

logger = logging.getLogger(__name__)

Expand All @@ -37,6 +45,8 @@ class IRDLeMagCollector(IPluginScrapeCollector):

def __init__(self):
    """Configure the inter-request delays (in seconds) from the environment."""
    super().__init__()
    # Pause observed between two consecutive page fetches.
    self.page_delay = int(os.getenv("PAGE_DELAY", "2"))
    # Longer pause applied once per batch of pages.
    self.batch_delay = int(os.getenv("BATCH_DELAY", "10"))

@staticmethod
def _get_page(url: str) -> str:
Expand Down Expand Up @@ -130,18 +140,44 @@ def _extract_description(soup: BeautifulSoup) -> str:
raise NoDescriptionFoundError from e
return desc

@staticmethod
def correct_text_syntax(content: str) -> str:
    """
    Normalize raw page text: strip HTML leftovers, drop stray line breaks,
    then re-insert the spaces the source markup omitted after closing signs
    and before capital letters.

    :param content: the content of the page as a string
    :return: the content of the page with the correct syntax
    """
    cleaned = clean_text(content)
    cleaned = clean_return_to_line(cleaned)
    cleaned = add_space_after_closing_sign(cleaned)
    return add_space_before_capital_letter(cleaned)

def run(self, documents: list[WeLearnDocument]) -> list[WrapperRetrieveDocument]:
logger.info("Running IRDLeMagCollector plugin")
ret: List[WrapperRetrieveDocument] = []
for document in documents:
for i, document in enumerate(documents):
if i > 0:
logger.info(
f"Waiting for {self.page_delay} seconds before scraping the next page to avoid being blocked by the server",
)
time.sleep(self.page_delay)
if i % 10 == 0:
logger.info(
f"Waiting for {self.batch_delay - self.page_delay} seconds before scraping the next batch of pages to avoid being blocked by the server",
)
time.sleep(self.batch_delay - self.page_delay)
try:
page = self._get_page(document.url)
soup = BeautifulSoup(page, "html.parser")
if not page:
raise NoContent
document.full_content = self._extract_content(page)
document.full_content = self.correct_text_syntax(
self._extract_content(page)
)
document.title = self._extract_title(soup)
document.description = self._extract_description(soup)
document.description = self.correct_text_syntax(
self._extract_description(soup)
)
document.details = {
"authors": self._extract_authors(soup),
"type": "article",
Expand Down
53 changes: 23 additions & 30 deletions welearn_datastack/utils_/scraping_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import re
import unicodedata
from html import unescape
from html.parser import HTMLParser

Expand Down Expand Up @@ -70,36 +71,6 @@ def format_cc_license(license: str) -> str:
)


def get_url_license_from_dc_format(soup: "BeautifulSoup") -> str:
    """
    Extract the license of the document from the DC.rights meta tag.

    :param soup: BeautifulSoup object of the document.
    :return: the license normalized to a canonical URL when it is a
        Creative Commons or OpenEdition Books license, otherwise the
        meta-tag content returned verbatim.
    """
    soup_license = soup.find("meta", {"name": "DC.rights"})
    # Named license_text so the builtin `license` is not shadowed.
    license_text = soup_license["content"]  # type: ignore

    # Both the English and the French spelling of the OpenEdition Books
    # license are recognized, independent of word order and capitalization.
    en_license_name = sorted("OpenEdition Books License".lower().split())
    fr_license_name = sorted("Licence OpenEdition Books".lower().split())
    other_known_licenses = [en_license_name, fr_license_name]

    license_split_n_sort = sorted(license_text.lower().split())

    if license_text.startswith("Creative Commons"):
        # It's a CC license, e.g.
        # "Creative Commons Attribution - CC BY 4.0" -> code "CC BY 4.0"
        full_cc_code = license_text.split(" - ")[-1].strip()
        rights_code = full_cc_code.split(" ")[1].strip()
        version = full_cc_code.split(" ")[2].strip()
        return (
            f"https://creativecommons.org/licenses/"
            f"{rights_code.lower()}/{version.lower()}/"
        )
    elif license_split_n_sort in other_known_licenses:
        return "https://www.openedition.org/12554"
    return license_text


def extract_property_from_html(
soup_find: Tag | NavigableString | None,
mandatory: bool = True,
Expand Down Expand Up @@ -156,6 +127,28 @@ def clean_text(content: str) -> str:
return remove_extra_whitespace(remove_html_stuff(content)).strip()


def add_space_after_closing_sign(string: str) -> str:
    """
    Add a space after a closing sign if there is not already one.

    A period sitting between two digits is treated as a decimal separator
    (e.g. "3.14", "4.0") and left untouched, so numbers and version strings
    in scraped text are not mangled.

    Args:
        string (str): the string to clean
    Returns:
        str: the cleaned string
    """
    closing_sign_pattern = re.compile(r"([.»\")\]}])(?=[^\s.,;:!?)»\]}])")

    def _insert_space(match: "re.Match[str]") -> str:
        sign = match.group(1)
        start = match.start(1)
        # Preserve decimal numbers: digit '.' digit is not sentence punctuation.
        if (
            sign == "."
            and start > 0
            and string[start - 1].isdigit()
            and string[match.end(1)].isdigit()  # lookahead guarantees this index exists
        ):
            return sign
        return sign + " "

    return closing_sign_pattern.sub(_insert_space, string)


def add_space_before_capital_letter(string: str) -> str:
    """
    Add a space before a capital letter if there is not already one.

    Handles both ASCII and French accented letters on either side of the
    boundary.

    Args:
        string (str): the string to clean
    Returns:
        str: the cleaned string
    """
    lower_then_upper = re.compile(
        r"([a-zàâäéèêëîïôöùûüÿç])([A-ZÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ])"
    )
    return lower_then_upper.sub(lambda m: f"{m.group(1)} {m.group(2)}", string)

Comment on lines +130 to +150

def get_url_without_hal_like_versionning(url: str) -> str:
"""
Get the URL without the versionning part
Expand Down
Loading