epflgraph · aitor-perez-epfl · Mar 9, 2026 · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -176,7 +176,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # Abstra
 # Abstra is an AI-powered process automation framework.

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -15,6 +15,7 @@ services:
       RABBITMQ_DEFAULT_PASS: ${AI_RABBITMQ_DEV_CELERY_PASS}
     volumes:
       - rabbitmq_data:/var/lib/rabbitmq
+      - ./rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
 
   # Redis with AOF persistence
   redis:
@@ -99,7 +100,7 @@ services:
     network_mode: "host"
     volumes:
       - ./nginx.graphai-https.conf:/etc/nginx/conf.d/default.conf:ro
-      - ./.certs:/etc/nginx/certs:ro
+      - /home/dockerhost/graphcert.cede-apps.ch:/etc/nginx/certs:ro
     restart: unless-stopped
 
 networks:

diff --git a/graphai/api/image/router.py b/graphai/api/image/router.py
@@ -129,12 +129,24 @@ async def extract_text(data: ExtractTextRequest):
     method = data.method
     force = data.force
     no_cache = data.no_cache
-    api_token = data.google_api_token
-    openai_token = data.openai_api_token
-    gemini_token = data.gemini_api_token
+    google_api_token = data.google_api_token
+    openai_api_token = data.openai_api_token
+    gemini_api_token = data.gemini_api_token
+    rcp_api_token = data.rcp_api_token
     model_type = data.model_type
     enable_tikz = data.enable_tikz
-    task_id = ocr_job(token, force, no_cache, method, api_token, openai_token, gemini_token, model_type, enable_tikz)
+    task_id = ocr_job(
+        token,
+        force=force,
+        no_cache=no_cache,
+        method=method,
+        google_api_token=google_api_token,
+        openai_api_token=openai_api_token,
+        gemini_api_token=gemini_api_token,
+        rcp_api_token=rcp_api_token,
+        model_type=model_type,
+        enable_tikz=enable_tikz,
+    )
     return {'task_id': task_id}
 
 

diff --git a/graphai/api/image/schemas.py b/graphai/api/image/schemas.py
@@ -166,10 +166,9 @@ class ExtractTextRequest(BaseModel):
         description="The token that identifies the requested file"
     )
 
-    method: Literal['google', 'tesseract', 'openai', 'gemini'] = Field(
+    method: Literal['tesseract', 'google', 'openai', 'gemini', 'rcp'] = Field(
         title="Method",
-        description="OCR method. Available methods are 'google' (default), 'openai', 'gemini',"
-                    "and 'tesseract' (not recommended)",
+        description="OCR method. Available methods are 'tesseract' (not recommended), 'google' (default), 'openai', 'gemini' and 'rcp'",
         default="google"
     )
 
@@ -187,36 +186,37 @@ class ExtractTextRequest(BaseModel):
 
     google_api_token: Union[str, None] = Field(
         title="Google API token",
-        description="Token that authenticates the user on the Google OCR API."
-                    "Without a valid token, Google OCR will fail. Not required for Tesseract, OpenAI, or Gemini.",
+        description="Token that authenticates the user on the Google OCR API. Without a valid token, Google OCR will fail. Only required for method 'google'.",
         default=None
     )
 
     openai_api_token: Union[str, None] = Field(
         title="OpenAI API token",
-        description="Token that authenticates the user on the OpenAI API."
-                    "Without a valid token, OpenAI OCR will fail. Not required for Tesseract, Google, or Gemini.",
+        description="Token that authenticates the user on the OpenAI API. Without a valid token, OpenAI OCR will fail. Only required for method 'openai'.",
         default=None
     )
 
     gemini_api_token: Union[str, None] = Field(
         title="Gemini API token",
-        description="Token that authenticates the user on the Gemini API."
-                    "Without a valid token, Gemini OCR will fail. Not required for Tesseract, Google, or OpenAI.",
+        description="Token that authenticates the user on the Gemini API. Without a valid token, Gemini OCR will fail. Only required for method 'gemini'.",
+        default=None
+    )
+
+    rcp_api_token: Union[str, None] = Field(
+        title="RCP API token",
+        description="Token that authenticates the user on the RCP platform. Without a valid token, RCP OCR will fail. Only required for method 'rcp'.",
         default=None
     )
 
     model_type: Union[str, None] = Field(
         title="Model type",
-        description="For OpenAI and Gemini options, allows the user to specify the model that they want to use. "
-                    "Do not specify this option unless you know exactly what you are doing.",
+        description="For LLM-based options, allows the user to specify the model that they want to use. Do not specify this option unless you know exactly what you are doing.",
         default=None
     )
 
     enable_tikz: bool = Field(
         title="Enable TikZ",
-        description="For PDF OCR, if True, attempts to extract any figures as valid TikZ. If False, "
-                    "replaces the figures with an alt text describing them instead.",
+        description="For PDF OCR, if True, attempts to extract any figures as valid TikZ. If False, replaces the figures with an alt text describing them instead.",
         default=False
     )
 

diff --git a/graphai/celery/image/jobs.py b/graphai/celery/image/jobs.py
@@ -11,6 +11,7 @@
     extract_slide_text_task,
     extract_slide_text_callback_task,
     convert_pdf_to_pages_task,
+    fanout_pdf_ocr_task,
     extract_multi_image_text_task,
     collect_multi_image_ocr_task
 )
@@ -24,39 +25,77 @@
 
 
 def retrieve_image_from_url_job(url, force=False, no_cache=False):
+    ##################
+    # Cache lookup
+    ##################
     if not force:
-        direct_lookup_task_id = direct_lookup_generic_job(cache_lookup_retrieve_image_from_url_task, url,
-                                                          False, DEFAULT_TIMEOUT)
+        direct_lookup_task_id = direct_lookup_generic_job(
+            cache_lookup_retrieve_image_from_url_task,
+            url,
+            False,
+            DEFAULT_TIMEOUT
+        )
         if direct_lookup_task_id is not None:
             return direct_lookup_task_id
 
+    ##################
+    # Retrieve image
+    ##################
     # First retrieve the file, and then do the database callback
-    task_list = [retrieve_image_from_url_task.s(url, None),
-                 retrieve_image_from_url_callback_task.s(url)]
-    if not no_cache:
-        task_list += get_slide_fingerprint_chain_list(None, None, ignore_fp_results=True)
-    else:
+    task_list = [
+        retrieve_image_from_url_task.s(url, None),
+        retrieve_image_from_url_callback_task.s(url),
+    ]
+
+    if no_cache:
         task_list += [add_token_status_to_single_image_results_callback_task.s()]
+    else:
+        task_list += get_slide_fingerprint_chain_list(
+            token=None,
+            origin_token=None,
+            ignore_fp_results=True,
+        )
+
     task = chain(task_list)
     task = task.apply_async(priority=2)
     return task.id
 
 
-def upload_image_from_file_job(contents, file_extension, origin, origin_info, force=False, no_cache=False):
+def upload_image_from_file_job(
+    contents,
+    file_extension,
+    origin,
+    origin_info,
+    force=False,
+    no_cache=False,
+):
     effective_url = create_origin_token_using_info(origin, origin_info)
+
     if not force:
-        direct_lookup_task_id = direct_lookup_generic_job(cache_lookup_retrieve_image_from_url_task, effective_url,
-                                                          False, DEFAULT_TIMEOUT)
+        direct_lookup_task_id = direct_lookup_generic_job(
+            cache_lookup_retrieve_image_from_url_task,
+            effective_url,
+            False,
+            DEFAULT_TIMEOUT,
+        )
+
         if direct_lookup_task_id is not None:
             return direct_lookup_task_id
+
     task_list = [
         upload_image_from_file_task.s(contents, file_extension),
-        retrieve_image_from_url_callback_task.s(effective_url)
+        retrieve_image_from_url_callback_task.s(effective_url),
     ]
-    if not no_cache:
-        task_list += get_slide_fingerprint_chain_list(None, None, ignore_fp_results=True)
-    else:
+
+    if no_cache:
         task_list += [add_token_status_to_single_image_results_callback_task.s()]
+    else:
+        task_list += get_slide_fingerprint_chain_list(
+            token=None,
+            origin_token=None,
+            ignore_fp_results=True
+        )
+
     task = chain(task_list)
     task = task.apply_async(priority=2)
     return task.id
@@ -85,45 +124,72 @@ def fingerprint_job(token, force):
     return task.id
 
 
-def ocr_job(token, force=False, no_cache=False, method='google',
-            api_token=None, openai_token=None, gemini_token=None,
-            model_type=None, enable_tikz=True):
+def ocr_job(
+    token,
+    force=False,
+    no_cache=False,
+    method='google',
+    google_api_token=None,
+    openai_api_token=None,
+    gemini_api_token=None,
+    rcp_api_token=None,
+    model_type=None,
+    enable_tikz=False,
+):
     ##################
     # OCR cache lookup
     ##################
     if not force and not no_cache:
-        direct_lookup_task_id = direct_lookup_generic_job(cache_lookup_extract_slide_text_task, token,
-                                                          False, DEFAULT_TIMEOUT, method)
+        direct_lookup_task_id = direct_lookup_generic_job(
+            cache_lookup_extract_slide_text_task,
+            token,
+            False,
+            DEFAULT_TIMEOUT,
+            method,
+        )
         if direct_lookup_task_id is not None:
             return direct_lookup_task_id
 
     #####################
     # OCR computation job
     #####################
-    if not is_pdf(token):
+    if is_pdf(token):
         task_list = [
-            extract_slide_text_task.s(token, method,
-                                      api_token, openai_token, gemini_token, model_type, enable_tikz)
+            convert_pdf_to_pages_task.s(token),
+            fanout_pdf_ocr_task.s(
+                method,
+                google_api_token,
+                openai_api_token,
+                gemini_api_token,
+                rcp_api_token,
+                model_type,
+                enable_tikz,
+            ),
         ]
     else:
-        n_parallel = 8
         task_list = [
-            convert_pdf_to_pages_task.s(token),
-            group(
-                extract_multi_image_text_task.s(i,
-                                                n_parallel,
-                                                method,
-                                                api_token,
-                                                openai_token,
-                                                gemini_token,
-                                                model_type,
-                                                enable_tikz)
-                for i in range(n_parallel)
-            ),
-            collect_multi_image_ocr_task.s()
+            extract_slide_text_task.s(
+                token,
+                method,
+                google_api_token,
+                openai_api_token,
+                gemini_api_token,
+                rcp_api_token,
+                model_type,
+                enable_tikz,
+            )
         ]
+
+    ##################
+    # OCR cache write
+    ##################
     if not no_cache:
         task_list.append(extract_slide_text_callback_task.s(token, force))
+
+    ##################
+    # Run task list
+    ##################
     task = chain(task_list)
     task = task.apply_async(priority=2)
+
     return task.id