Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
Expand Down
3 changes: 2 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ services:
RABBITMQ_DEFAULT_PASS: ${AI_RABBITMQ_DEV_CELERY_PASS}
volumes:
- rabbitmq_data:/var/lib/rabbitmq
- ./rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro

# Redis with AOF persistence
redis:
Expand Down Expand Up @@ -99,7 +100,7 @@ services:
network_mode: "host"
volumes:
- ./nginx.graphai-https.conf:/etc/nginx/conf.d/default.conf:ro
- ./.certs:/etc/nginx/certs:ro
- /home/dockerhost/graphcert.cede-apps.ch:/etc/nginx/certs:ro
restart: unless-stopped

networks:
Expand Down
20 changes: 16 additions & 4 deletions graphai/api/image/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,24 @@ async def extract_text(data: ExtractTextRequest):
method = data.method
force = data.force
no_cache = data.no_cache
api_token = data.google_api_token
openai_token = data.openai_api_token
gemini_token = data.gemini_api_token
google_api_token = data.google_api_token
openai_api_token = data.openai_api_token
gemini_api_token = data.gemini_api_token
rcp_api_token = data.rcp_api_token
model_type = data.model_type
enable_tikz = data.enable_tikz
task_id = ocr_job(token, force, no_cache, method, api_token, openai_token, gemini_token, model_type, enable_tikz)
task_id = ocr_job(
token,
force=force,
no_cache=no_cache,
method=method,
google_api_token=google_api_token,
openai_api_token=openai_api_token,
gemini_api_token=gemini_api_token,
rcp_api_token=rcp_api_token,
model_type=model_type,
enable_tikz=enable_tikz,
)
return {'task_id': task_id}


Expand Down
26 changes: 13 additions & 13 deletions graphai/api/image/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,9 @@ class ExtractTextRequest(BaseModel):
description="The token that identifies the requested file"
)

method: Literal['google', 'tesseract', 'openai', 'gemini'] = Field(
method: Literal['tesseract', 'google', 'openai', 'gemini', 'rcp'] = Field(
title="Method",
description="OCR method. Available methods are 'google' (default), 'openai', 'gemini',"
"and 'tesseract' (not recommended)",
description="OCR method. Available methods are 'tesseract' (not recommended), 'google' (default), 'openai', 'gemini' and 'rcp'",
default="google"
)

Expand All @@ -187,36 +186,37 @@ class ExtractTextRequest(BaseModel):

google_api_token: Union[str, None] = Field(
title="Google API token",
description="Token that authenticates the user on the Google OCR API."
"Without a valid token, Google OCR will fail. Not required for Tesseract, OpenAI, or Gemini.",
description="Token that authenticates the user on the Google OCR API. Without a valid token, Google OCR will fail. Only required for method 'google'.",
default=None
)

openai_api_token: Union[str, None] = Field(
title="OpenAI API token",
description="Token that authenticates the user on the OpenAI API."
"Without a valid token, OpenAI OCR will fail. Not required for Tesseract, Google, or Gemini.",
description="Token that authenticates the user on the OpenAI API. Without a valid token, OpenAI OCR will fail. Only required for method 'openai'.",
default=None
)

gemini_api_token: Union[str, None] = Field(
title="Gemini API token",
description="Token that authenticates the user on the Gemini API."
"Without a valid token, Gemini OCR will fail. Not required for Tesseract, Google, or OpenAI.",
description="Token that authenticates the user on the Gemini API. Without a valid token, Gemini OCR will fail. Only required for method 'gemini'.",
default=None
)

rcp_api_token: Union[str, None] = Field(
title="RCP API token",
description="Token that authenticates the user on the RCP platform. Without a valid token, RCP OCR will fail. Only required for method 'rcp'.",
default=None
)

model_type: Union[str, None] = Field(
title="Model type",
description="For OpenAI and Gemini options, allows the user to specify the model that they want to use. "
"Do not specify this option unless you know exactly what you are doing.",
description="For LLM-based options, allows the user to specify the model that they want to use. Do not specify this option unless you know exactly what you are doing.",
default=None
)

enable_tikz: bool = Field(
title="Enable TikZ",
description="For PDF OCR, if True, attempts to extract any figures as valid TikZ. If False, "
"replaces the figures with an alt text describing them instead.",
description="For PDF OCR, if True, attempts to extract any figures as valid TikZ. If False, replaces the figures with an alt text describing them instead.",
default=False
)

Expand Down
138 changes: 102 additions & 36 deletions graphai/celery/image/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
extract_slide_text_task,
extract_slide_text_callback_task,
convert_pdf_to_pages_task,
fanout_pdf_ocr_task,
extract_multi_image_text_task,
collect_multi_image_ocr_task
)
Expand All @@ -24,39 +25,77 @@


def retrieve_image_from_url_job(url, force=False, no_cache=False):
##################
# Cache lookup
##################
if not force:
direct_lookup_task_id = direct_lookup_generic_job(cache_lookup_retrieve_image_from_url_task, url,
False, DEFAULT_TIMEOUT)
direct_lookup_task_id = direct_lookup_generic_job(
cache_lookup_retrieve_image_from_url_task,
url,
False,
DEFAULT_TIMEOUT
)
if direct_lookup_task_id is not None:
return direct_lookup_task_id

##################
# Retrieve image
##################
# First retrieve the file, and then do the database callback
task_list = [retrieve_image_from_url_task.s(url, None),
retrieve_image_from_url_callback_task.s(url)]
if not no_cache:
task_list += get_slide_fingerprint_chain_list(None, None, ignore_fp_results=True)
else:
task_list = [
retrieve_image_from_url_task.s(url, None),
retrieve_image_from_url_callback_task.s(url),
]

if no_cache:
task_list += [add_token_status_to_single_image_results_callback_task.s()]
else:
task_list += get_slide_fingerprint_chain_list(
token=None,
origin_token=None,
ignore_fp_results=True,
)

task = chain(task_list)
task = task.apply_async(priority=2)
return task.id


def upload_image_from_file_job(contents, file_extension, origin, origin_info, force=False, no_cache=False):
def upload_image_from_file_job(
contents,
file_extension,
origin,
origin_info,
force=False,
no_cache=False,
):
effective_url = create_origin_token_using_info(origin, origin_info)

if not force:
direct_lookup_task_id = direct_lookup_generic_job(cache_lookup_retrieve_image_from_url_task, effective_url,
False, DEFAULT_TIMEOUT)
direct_lookup_task_id = direct_lookup_generic_job(
cache_lookup_retrieve_image_from_url_task,
effective_url,
False,
DEFAULT_TIMEOUT,
)

if direct_lookup_task_id is not None:
return direct_lookup_task_id

task_list = [
upload_image_from_file_task.s(contents, file_extension),
retrieve_image_from_url_callback_task.s(effective_url)
retrieve_image_from_url_callback_task.s(effective_url),
]
if not no_cache:
task_list += get_slide_fingerprint_chain_list(None, None, ignore_fp_results=True)
else:

if no_cache:
task_list += [add_token_status_to_single_image_results_callback_task.s()]
else:
task_list += get_slide_fingerprint_chain_list(
token=None,
origin_token=None,
ignore_fp_results=True
)

task = chain(task_list)
task = task.apply_async(priority=2)
return task.id
Expand Down Expand Up @@ -85,45 +124,72 @@ def fingerprint_job(token, force):
return task.id


def ocr_job(token, force=False, no_cache=False, method='google',
api_token=None, openai_token=None, gemini_token=None,
model_type=None, enable_tikz=True):
def ocr_job(
token,
force=False,
no_cache=False,
method='google',
google_api_token=None,
openai_api_token=None,
gemini_api_token=None,
rcp_api_token=None,
model_type=None,
enable_tikz=False,
):
##################
# OCR cache lookup
##################
if not force and not no_cache:
direct_lookup_task_id = direct_lookup_generic_job(cache_lookup_extract_slide_text_task, token,
False, DEFAULT_TIMEOUT, method)
direct_lookup_task_id = direct_lookup_generic_job(
cache_lookup_extract_slide_text_task,
token,
False,
DEFAULT_TIMEOUT,
method,
)
if direct_lookup_task_id is not None:
return direct_lookup_task_id

#####################
# OCR computation job
#####################
if not is_pdf(token):
if is_pdf(token):
task_list = [
extract_slide_text_task.s(token, method,
api_token, openai_token, gemini_token, model_type, enable_tikz)
convert_pdf_to_pages_task.s(token),
fanout_pdf_ocr_task.s(
method,
google_api_token,
openai_api_token,
gemini_api_token,
rcp_api_token,
model_type,
enable_tikz,
),
]
else:
n_parallel = 8
task_list = [
convert_pdf_to_pages_task.s(token),
group(
extract_multi_image_text_task.s(i,
n_parallel,
method,
api_token,
openai_token,
gemini_token,
model_type,
enable_tikz)
for i in range(n_parallel)
),
collect_multi_image_ocr_task.s()
extract_slide_text_task.s(
token,
method,
google_api_token,
openai_api_token,
gemini_api_token,
rcp_api_token,
model_type,
enable_tikz,
)
]

##################
# OCR cache write
##################
if not no_cache:
task_list.append(extract_slide_text_callback_task.s(token, force))

##################
# Run task list
##################
task = chain(task_list)
task = task.apply_async(priority=2)

return task.id
Loading
Loading