diff --git a/.gitignore b/.gitignore index dc55edb..5ad18c9 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,7 @@ htmlcov/ # IDEs .vscode/ .idea/ +*.ipynb # OS .DS_Store @@ -64,3 +65,6 @@ data/embedding_dataset_from_questions_all_chunks_no_unclassified.json # Git .specify .github/prompts + +# Logs +*.log \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore index 9466895..5918f22 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -29,3 +29,7 @@ /prd_documents /prd_dedup /annotations +/processed-james +/processed-sgp +/raw-prd-sgp +/esim_prd_synthetic_qa_with_subgroups diff --git a/data/raw-prd-sgp.dvc b/data/raw-prd-sgp.dvc new file mode 100644 index 0000000..5f3045d --- /dev/null +++ b/data/raw-prd-sgp.dvc @@ -0,0 +1,6 @@ +outs: +- md5: cac377080c3ef911d358961d6bfd19ac.dir + size: 32930661 + nfiles: 29 + hash: md5 + path: raw-prd-sgp diff --git a/data/working_groups_mapping.json.dvc b/data/working_groups_mapping.json.dvc index 8a5b4d9..cd3f2f9 100644 --- a/data/working_groups_mapping.json.dvc +++ b/data/working_groups_mapping.json.dvc @@ -1,5 +1,5 @@ outs: -- md5: b70592bdeac5c03634a60d09d0a8fbc7 - size: 9901 +- md5: 43d32045e3db97435255cb18015a4e1f + size: 10411 hash: md5 path: working_groups_mapping.json diff --git a/gsma_dataset_creation/cli.py b/gsma_dataset_creation/cli.py index 088937d..09110cf 100644 --- a/gsma_dataset_creation/cli.py +++ b/gsma_dataset_creation/cli.py @@ -61,9 +61,29 @@ def setup_logging(log_level: str) -> None: logger.add( lambda msg: typer.echo(msg, err=True), level=log_level.upper(), - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}", + format = ( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level:<8} | " + "{file}:{line} | " + "{message}" + ) ) + # File output + logger.add( + "GSMA-data-creation.log", + level=log_level.upper(), + format=( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level:<8} | " + "{file}:{line} | " + "{message}" + ), + rotation="20 MB", + retention="30 days", + compression="zip", + 
enqueue=True # Safe for async/multiprocessing + ) @app.command() def process( diff --git a/gsma_dataset_creation/qa_schema.py b/gsma_dataset_creation/qa_schema.py index 05ff3ca..202fa4f 100644 --- a/gsma_dataset_creation/qa_schema.py +++ b/gsma_dataset_creation/qa_schema.py @@ -37,10 +37,11 @@ def get_qa_response_schema(config: QAConfig) -> Dict[str, Any]: "question_type": { "type": "string", "enum": [ - "factual", - "comprehension", - "analytical", - "definition", + "multiple_choice", + "multiple_choice_explained", + "true_false", + "open_ended", + "open_ended_no_context" ], }, }, diff --git a/gsma_dataset_creation/question_generator.py b/gsma_dataset_creation/question_generator.py index 005c14d..1913248 100644 --- a/gsma_dataset_creation/question_generator.py +++ b/gsma_dataset_creation/question_generator.py @@ -12,6 +12,7 @@ from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional, cast +import traceback from loguru import logger from openai import ( @@ -148,6 +149,7 @@ async def _api_call() -> Any: ) except APIStatusError as e: # Check if it's a structured output compatibility issue + logger.error(traceback.format_exc()) error_msg = str(e.message) if hasattr(e, "message") else str(e) if e.status_code == 400 and any( keyword in error_msg.lower() @@ -169,6 +171,7 @@ async def _api_call() -> Any: # Re-raise other API status errors raise except Exception as e: + logger.error(traceback.format_exc()) # For unexpected exceptions, log details for debugging logger.debug( f"Unexpected exception type: {type(e).__name__}, message: {str(e)}" @@ -194,6 +197,7 @@ async def _api_call() -> Any: **extra_params, ) except Exception as fallback_error: + logger.error(traceback.format_exc()) logger.error(f"Fallback also failed: {fallback_error}") raise fallback_error raise diff --git a/gsma_dataset_creation/retry_utils.py b/gsma_dataset_creation/retry_utils.py index af75df7..9a8e498 100644 --- a/gsma_dataset_creation/retry_utils.py +++ 
b/gsma_dataset_creation/retry_utils.py @@ -129,6 +129,8 @@ async def wrapper(*args: Any, **kwargs: Any) -> Any: f"Failed {func.__name__} after {config.max_retries} retries: " f"{type(e).__name__}: {e}" ) + import traceback + logger.error(traceback.format_exc()) raise return wrapper diff --git a/gsma_dataset_creation/similarity/similarity_calculator.py b/gsma_dataset_creation/similarity/similarity_calculator.py index 23aa3d0..5d2ec57 100644 --- a/gsma_dataset_creation/similarity/similarity_calculator.py +++ b/gsma_dataset_creation/similarity/similarity_calculator.py @@ -5,6 +5,7 @@ using efficient approximate nearest neighbor search. """ +import os import json from datetime import datetime from pathlib import Path @@ -97,6 +98,16 @@ def build_faiss_index( norms = np.linalg.norm(embeddings, axis=1, keepdims=True) embeddings = embeddings / norms + # Minimum number of points needed per centroid for effective clustering + min_points_per_centroid = 39 + + # Adjust index type if the dataset is too small + if n_chunks < min_points_per_centroid: + logger.debug( + f"Dataset size {n_chunks} is sufficient for IVFFlat index" + ) + self.faiss_index_type = "Flat" + # Build index based on type if self.faiss_index_type == "Flat": # Flat index for exact search (slower but accurate) @@ -112,6 +123,10 @@ def build_faiss_index( if self.nlist > n_chunks // 10: self.nlist = max(1, n_chunks // 10) + # Ensure nlist is appropriate for dataset size + if self.nlist * min_points_per_centroid > n_chunks: + self.nlist = max(1, n_chunks // min_points_per_centroid) + logger.info(f"Building IVFFlat index with nlist={self.nlist}") # Create quantizer and index @@ -181,8 +196,15 @@ def compute_similarities( k_search = min( self.k + 1, len(self.chunk_metadata) ) # +1 to exclude self-similarity + + # In some environments, or when Faiss is built with MKL, + # multi-threaded OpenMP execution can cause segmentation faults, crashes or other unexpected failures during index.search().
+ # This change limits OpenMP for multithreading to a single thread + os.environ.setdefault("OMP_NUM_THREADS", "1") + scores, indices = self.faiss_index.search(query_embeddings, k_search) + # Process results all_similarities = [] similarities_computed = 0 diff --git a/gsma_dataset_creation/similarity_cli.py b/gsma_dataset_creation/similarity_cli.py index 0bc232e..3e41ee2 100644 --- a/gsma_dataset_creation/similarity_cli.py +++ b/gsma_dataset_creation/similarity_cli.py @@ -17,7 +17,22 @@ def setup_logging(log_level: str) -> None: logger.add( lambda msg: typer.echo(msg, err=True), level=log_level.upper(), - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}", + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {file}:{line} | {message}", + ) + # File output + logger.add( + "GSMA-data-creation.log", + level=log_level.upper(), + format=( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level:<8} | " + "{file}:{line} | " + "{message}" + ), + rotation="20 MB", + retention="30 days", + compression="zip", + enqueue=True # Safe for async/multiprocessing ) @@ -534,6 +549,8 @@ def ranker( raise typer.Exit(1) except Exception as e: logger.error(f"❌ Unexpected error: {e}") + import traceback + logger.error(traceback.format_exc()) raise typer.Exit(1) diff --git a/pipelines/annotation/dvc.lock b/pipelines/annotation/dvc.lock index 7330482..e205328 100644 --- a/pipelines/annotation/dvc.lock +++ b/pipelines/annotation/dvc.lock @@ -1,32 +1,33 @@ schema: '2.0' stages: add_subgroups: - cmd: uv run gsma add-subgroup-to-dataset --dataset-repo mantisnlp/gsma_prd_synthetic_qa - --working-groups data/working_groups_mapping.json --output data/gsma_prd_synthetic_qa_with_subgroups - --log-level INFO + cmd: uv run gsma add-subgroup-to-dataset --dataset-repo + jamesaidev001/esim_prd_synthetic_qa --working-groups + data/working_groups_mapping.json --output + data/esim_prd_synthetic_qa_with_subgroups --log-level INFO deps: - path: data/working_groups_mapping.json hash: md5 - md5: 
2adbc799e706c013db55d1fb3338878c - size: 10629 + md5: 43d32045e3db97435255cb18015a4e1f + size: 10411 - path: gsma_dataset_creation/cli.py hash: md5 - md5: 1bfad6c8fd2ce3455f7cdf9a1f1e0518 - size: 80990 + md5: e1d291f8c6f59a65f0a0914c283a3968 + size: 82670 - path: gsma_dataset_creation/subgroup_adder.py hash: md5 md5: 6d25cfda36dd581cf975bd4453a850eb size: 14752 outs: - - path: data/gsma_prd_synthetic_qa_with_subgroups + - path: data/esim_prd_synthetic_qa_with_subgroups hash: md5 - md5: b1d1eff79e4d53a1cdcab707c068f5fe.dir - size: 423824806 + md5: e4df70e8c255276e52c4e2af1347f112.dir + size: 21318642 nfiles: 4 upload_tsg_annotation: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_with_subgroups - --subgroup TSG --sample-size 100 --dataset-name-prefix gsma_annotation_tsg --logger-level - INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_with_subgroups --subgroup TSG --sample-size 100 + --dataset-name-prefix gsma_annotation_tsg --logger-level INFO deps: - path: data/gsma_prd_synthetic_with_subgroups hash: md5 @@ -42,9 +43,9 @@ stages: md5: 3afc719e94c38002e8a124fe99344648 size: 13916 upload_annotation@@FASG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup FASG --sample-size 1000 --dataset-name-prefix gsma_annotation_FASG - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup FASG --sample-size + 1000 --dataset-name-prefix gsma_annotation_FASG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -60,9 +61,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@PQTN: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup PQTN --sample-size 1000 --dataset-name-prefix gsma_annotation_PQTN - --logger-level INFO + cmd: uv run 
gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup PQTN --sample-size + 1000 --dataset-name-prefix gsma_annotation_PQTN --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -78,9 +79,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@TSG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup TSG --sample-size 1000 --dataset-name-prefix gsma_annotation_TSG - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup TSG --sample-size + 1000 --dataset-name-prefix gsma_annotation_TSG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -96,9 +97,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@SAM: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup SAM --sample-size 1000 --dataset-name-prefix gsma_annotation_SAM - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup SAM --sample-size + 1000 --dataset-name-prefix gsma_annotation_SAM --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -114,9 +115,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@eSim: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_with_subgroups - --subgroup eSim --sample-size 1000 --dataset-name-prefix gsma_annotation_eSim - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_with_subgroups --subgroup eSim --sample-size 1000 + --dataset-name-prefix gsma_annotation_eSim --logger-level INFO deps: - path: data/gsma_prd_synthetic_with_subgroups hash: md5 @@ -132,9 +133,9 @@ stages: 
md5: b0e96df363386ac9af0328636a6c3e3b size: 20248 upload_annotation@@IG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup IG --sample-size 1000 --dataset-name-prefix gsma_annotation_IG --logger-level - INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup IG --sample-size 1000 + --dataset-name-prefix gsma_annotation_IG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -150,9 +151,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@SG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup SG --sample-size 1000 --dataset-name-prefix gsma_annotation_SG --logger-level - INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup SG --sample-size 1000 + --dataset-name-prefix gsma_annotation_SG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -168,9 +169,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@NG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup NG --sample-size 1000 --dataset-name-prefix gsma_annotation_NG --logger-level - INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup NG --sample-size 1000 + --dataset-name-prefix gsma_annotation_NG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -186,9 +187,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@IDS: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup IDS --sample-size 1000 --dataset-name-prefix gsma_annotation_IDS - --logger-level INFO + cmd: uv run 
gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup IDS --sample-size + 1000 --dataset-name-prefix gsma_annotation_IDS --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -204,9 +205,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@RCS: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup RCS --sample-size 1000 --dataset-name-prefix gsma_annotation_RCS - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup RCS --sample-size + 1000 --dataset-name-prefix gsma_annotation_RCS --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -222,9 +223,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@OPG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup OPG --sample-size 1000 --dataset-name-prefix gsma_annotation_OPG - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup OPG --sample-size + 1000 --dataset-name-prefix gsma_annotation_OPG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -240,9 +241,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@WAS: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup WAS --sample-size 1000 --dataset-name-prefix gsma_annotation_WAS - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup WAS --sample-size + 1000 --dataset-name-prefix gsma_annotation_WAS --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -258,9 +259,9 @@ 
stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@SAS: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup SAS --sample-size 1000 --dataset-name-prefix gsma_annotation_SAS - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup SAS --sample-size + 1000 --dataset-name-prefix gsma_annotation_SAS --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -276,9 +277,10 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@CERTIFICATION_POLICY: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup CERTIFICATION_POLICY --sample-size 1000 --dataset-name-prefix gsma_annotation_CERTIFICATION_POLICY - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup CERTIFICATION_POLICY + --sample-size 1000 --dataset-name-prefix + gsma_annotation_CERTIFICATION_POLICY --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -294,9 +296,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@EID: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup EID --sample-size 1000 --dataset-name-prefix gsma_annotation_EID - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup EID --sample-size + 1000 --dataset-name-prefix gsma_annotation_EID --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -312,8 +314,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@COMPLIANCE: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path 
data/gsma_prd_synthetic_qa_with_subgroups - --subgroup COMPLIANCE --sample-size 1000 --dataset-name-prefix gsma_annotation_COMPLIANCE + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup COMPLIANCE + --sample-size 1000 --dataset-name-prefix gsma_annotation_COMPLIANCE --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -330,8 +333,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@REQUIREMENTS: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup REQUIREMENTS --sample-size 1000 --dataset-name-prefix gsma_annotation_REQUIREMENTS + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup REQUIREMENTS + --sample-size 1000 --dataset-name-prefix gsma_annotation_REQUIREMENTS --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -348,9 +352,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@ESA: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup ESA --sample-size 1000 --dataset-name-prefix gsma_annotation_ESA - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup ESA --sample-size + 1000 --dataset-name-prefix gsma_annotation_ESA --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -366,8 +370,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@TECHNICAL: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup TECHNICAL --sample-size 1000 --dataset-name-prefix gsma_annotation_TECHNICAL + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup TECHNICAL + 
--sample-size 1000 --dataset-name-prefix gsma_annotation_TECHNICAL --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -384,9 +389,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation@@TESTING: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup TESTING --sample-size 1000 --dataset-name-prefix gsma_annotation_TESTING - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup TESTING --sample-size + 1000 --dataset-name-prefix gsma_annotation_TESTING --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -402,9 +407,9 @@ stages: md5: f116942d041709137200bf5bd1b4fec8 size: 18809 upload_annotation_working_groups@@IG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup IG --sample-size 1000 --dataset-name-prefix gsma_annotation_IG --logger-level - INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup IG --sample-size 1000 + --dataset-name-prefix gsma_annotation_IG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -420,8 +425,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@@Oig_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Oig_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Oig_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Oig_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -438,9 +444,10 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 
upload_annotation_ng_subgroups@@Other: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Other --working-group NG --sample-size 1000 --dataset-name-prefix - gsma_annotation_ng_Other --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Other --working-group + NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Other + --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -456,9 +463,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_working_groups@@SG: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup SG --sample-size 1000 --dataset-name-prefix gsma_annotation_SG --logger-level - INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup SG --sample-size 1000 + --dataset-name-prefix gsma_annotation_SG --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -474,8 +481,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@@Security: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Security --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Security + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Security --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -492,9 +500,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_working_groups@@SAS: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup SAS 
--sample-size 1000 --dataset-name-prefix gsma_annotation_SAS - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup SAS --sample-size + 1000 --dataset-name-prefix gsma_annotation_SAS --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -510,8 +518,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@@Esa_scheme: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Esa_scheme --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Esa_scheme + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Esa_scheme --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -528,8 +537,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@@Certification_policy: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Certification_policy --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Certification_policy + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Certification_policy --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -546,8 +556,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@@Upg_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Upg_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + 
data/gsma_prd_synthetic_qa_with_subgroups --subgroup Upg_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Upg_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -564,8 +575,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@@Requirements: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Requirements --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Requirements + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Requirements --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -582,8 +594,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@@Testing: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Testing --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Testing + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Testing --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -600,8 +613,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@@Mc_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Mc_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Mc_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Mc_isig --logger-level INFO deps: - path: 
data/gsma_prd_synthetic_qa_with_subgroups @@ -618,8 +632,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@@Tg: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Tg --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Tg --working-group NG + --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -636,8 +651,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@@Compliance: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Compliance --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Compliance + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Compliance --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -654,8 +670,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@@Nrg_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Nrg_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Nrg_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Nrg_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -672,9 +689,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_working_groups@@PQTN: - cmd: uv run gsma argilla 
upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup PQTN --sample-size 1000 --dataset-name-prefix gsma_annotation_PQTN - --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup PQTN --sample-size + 1000 --dataset-name-prefix gsma_annotation_PQTN --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -690,26 +707,28 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@Compliance: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Compliance --working-group eSIM --sample-size 1000 --dataset-name-prefix - gsma_annotation_esim_Compliance --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Compliance + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Compliance --logger-level DEBUG deps: - - path: data/gsma_prd_synthetic_qa_with_subgroups + - path: data/esim_prd_synthetic_qa_with_subgroups hash: md5 - md5: a683e4a7744b9c6fbf18c5b4eceba3dc.dir - size: 423827710 - nfiles: 5 + md5: 90bab7cea26f10c19bec4eeb4e7cd8ca.dir + size: 21358490 + nfiles: 9 - path: gsma_dataset_creation/argilla_cli.py hash: md5 - md5: 77a9aaa9df4253afa88578dc676b4e27 - size: 17579 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py hash: md5 - md5: ca99922bcb3c06c428edc22d8e8ec7ad - size: 20868 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 upload_annotation_esim_subgroups@Esa_scheme: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Esa_scheme --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + 
data/gsma_prd_synthetic_qa_with_subgroups --subgroup Esa_scheme + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Esa_scheme --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -726,44 +745,47 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@Technical: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Technical --working-group eSIM --sample-size 1000 --dataset-name-prefix - gsma_annotation_esim_Technical --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Technical + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Technical --logger-level DEBUG deps: - - path: data/gsma_prd_synthetic_qa_with_subgroups + - path: data/esim_prd_synthetic_qa_with_subgroups hash: md5 - md5: 021a45de31d599951e8c55d21411cdd9.dir - size: 423876334 + md5: b0d1d9b50e059bbdedcf50dc972a095e.dir + size: 21350978 nfiles: 7 - path: gsma_dataset_creation/argilla_cli.py hash: md5 - md5: 77a9aaa9df4253afa88578dc676b4e27 - size: 17579 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py hash: md5 - md5: ca99922bcb3c06c428edc22d8e8ec7ad - size: 20868 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 upload_annotation_esim_subgroups@Testing: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Testing --working-group eSIM --sample-size 1000 --dataset-name-prefix - gsma_annotation_esim_Testing --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Testing + --working-group eSim --sample-size 1000 --dataset-name-prefix esim_Testing + --logger-level DEBUG deps: - - path: data/gsma_prd_synthetic_qa_with_subgroups + - 
path: data/esim_prd_synthetic_qa_with_subgroups hash: md5 - md5: 571a5ce098aad2fc70df866491e3aa38.dir - size: 423892630 + md5: bda3d5917c494486079b70900772e88b.dir + size: 21357146 nfiles: 8 - path: gsma_dataset_creation/argilla_cli.py hash: md5 - md5: 77a9aaa9df4253afa88578dc676b4e27 - size: 17579 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py hash: md5 - md5: ca99922bcb3c06c428edc22d8e8ec7ad - size: 20868 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 upload_annotation_esim_subgroups@Eid_scheme: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Eid_scheme --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Eid_scheme + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Eid_scheme --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -780,26 +802,28 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@Requirements: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Requirements --working-group eSIM --sample-size 1000 --dataset-name-prefix - gsma_annotation_esim_Requirements --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Requirements + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Requirements --logger-level DEBUG deps: - - path: data/gsma_prd_synthetic_qa_with_subgroups + - path: data/esim_prd_synthetic_qa_with_subgroups hash: md5 - md5: 15cf4b5c8b536bc22d68a5632f036423.dir - size: 423909206 - nfiles: 10 + md5: 5753034ec4f32916361da8666a21e06b.dir + size: 21326074 + nfiles: 6 - path: gsma_dataset_creation/argilla_cli.py 
hash: md5 - md5: 77a9aaa9df4253afa88578dc676b4e27 - size: 17579 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py hash: md5 - md5: ca99922bcb3c06c428edc22d8e8ec7ad - size: 20868 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 upload_annotation_esim_subgroups@Certification_policy: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Certification_policy --working-group eSIM --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Certification_policy + --working-group eSIM --sample-size 1000 --dataset-name-prefix gsma_annotation_esim_Certification_policy --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -816,27 +840,29 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_esim_subgroups@Security: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Security --working-group eSIM --sample-size 1000 --dataset-name-prefix - gsma_annotation_esim_Security --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Security + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Security --logger-level DEBUG deps: - - path: data/gsma_prd_synthetic_qa_with_subgroups + - path: data/esim_prd_synthetic_qa_with_subgroups hash: md5 - md5: 1ca6a5b2083300b20e253a9050aaedcc.dir - size: 423920998 - nfiles: 12 + md5: 19c5a9baedceb8e4debde5b306d03487.dir + size: 21324010 + nfiles: 5 - path: gsma_dataset_creation/argilla_cli.py hash: md5 - md5: 77a9aaa9df4253afa88578dc676b4e27 - size: 17579 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py hash: md5 - md5: 
ca99922bcb3c06c428edc22d8e8ec7ad - size: 20868 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 upload_annotation_ng_subgroups@Isag: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Isag --working-group NG --sample-size 1000 --dataset-name-prefix - gsma_annotation_ng_Isag --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Isag --working-group + NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Isag + --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -852,9 +878,10 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Other: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Other --working-group NG --sample-size 1000 --dataset-name-prefix - gsma_annotation_ng_Other --logger-level INFO + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Other --working-group + NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Other + --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups hash: md5 @@ -870,8 +897,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Tg: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Tg --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Tg --working-group NG + --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -888,8 +916,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 
upload_annotation_ng_subgroups@Oig_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Oig_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Oig_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Oig_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -906,8 +935,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Nrg_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Nrg_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Nrg_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Nrg_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -924,8 +954,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Upg_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Upg_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Upg_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Upg_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -942,8 +973,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Mc_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Mc_isig --working-group NG --sample-size 1000 
--dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Mc_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Mc_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -960,8 +992,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Ursp_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Ursp_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Ursp_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Ursp_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -978,8 +1011,9 @@ stages: md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 upload_annotation_ng_subgroups@Fnr_isig: - cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups - --subgroup Fnr_isig --working-group NG --sample-size 1000 --dataset-name-prefix + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/gsma_prd_synthetic_qa_with_subgroups --subgroup Fnr_isig + --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Fnr_isig --logger-level INFO deps: - path: data/gsma_prd_synthetic_qa_with_subgroups @@ -995,3 +1029,98 @@ stages: hash: md5 md5: ca99922bcb3c06c428edc22d8e8ec7ad size: 20868 + upload_annotation_esim_subgroups@Certification: + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Certification + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Certification --logger-level DEBUG + deps: + - path: data/esim_prd_synthetic_qa_with_subgroups + hash: md5 + md5: 0b6db7608974cd2d0444a389e326bd97.dir + 
size: 21361810 + nfiles: 12 + - path: gsma_dataset_creation/argilla_cli.py + hash: md5 + md5: 96da29c6bf660ee8add034e08fec92ab + size: 58419 + - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py + hash: md5 + md5: 71199696b1c81567d1da0a9da07391b1 + size: 22119 + upload_annotation_esim_subgroups@EID_Scheme: + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup EID_Scheme + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_EID_Scheme --logger-level DEBUG + deps: + - path: data/esim_prd_synthetic_qa_with_subgroups + hash: md5 + md5: 9888acfb189bda92e3a61dd70ca20af6.dir + size: 21358794 + nfiles: 10 + - path: gsma_dataset_creation/argilla_cli.py + hash: md5 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 + - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py + hash: md5 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 + upload_annotation_esim_subgroups@eSA_Scheme: + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup eSA_Scheme + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_eSA_Scheme --logger-level DEBUG + deps: + - path: data/esim_prd_synthetic_qa_with_subgroups + hash: md5 + md5: c67e63493fe27a7cf2d7e59cf28b00bf.dir + size: 21361506 + nfiles: 11 + - path: gsma_dataset_creation/argilla_cli.py + hash: md5 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 + - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py + hash: md5 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 + upload_annotation_esim_subgroups@Compliance_Policy: + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Compliance_Policy + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Compliance_Policy --logger-level DEBUG + deps: + - path: data/esim_prd_synthetic_qa_with_subgroups + hash: md5 + 
md5: 1f1f681e66dc2e549cb97eed3525ef36.dir + size: 21318946 + nfiles: 5 + - path: gsma_dataset_creation/argilla_cli.py + hash: md5 + md5: 96da29c6bf660ee8add034e08fec92ab + size: 58419 + - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py + hash: md5 + md5: 71199696b1c81567d1da0a9da07391b1 + size: 22119 + upload_annotation_esim_subgroups@Certification_Policy: + cmd: uv run gsma argilla upload-by-subgroup --dataset-path + data/esim_prd_synthetic_qa_with_subgroups --subgroup Certification_Policy + --working-group eSim --sample-size 1000 --dataset-name-prefix + esim_Certification_Policy --logger-level DEBUG + deps: + - path: data/esim_prd_synthetic_qa_with_subgroups + hash: md5 + md5: 20ebbdfeeb6d07d58d505189b73b43bf.dir + size: 21363226 + nfiles: 12 + - path: gsma_dataset_creation/argilla_cli.py + hash: md5 + md5: cf30c259ec2abc16adad41b33830495b + size: 57170 + - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py + hash: md5 + md5: 630c3037409fc170eeb6ba05357087aa + size: 22164 diff --git a/pipelines/annotation/dvc.yaml b/pipelines/annotation/dvc.yaml index 17cdddf..b780802 100644 --- a/pipelines/annotation/dvc.yaml +++ b/pipelines/annotation/dvc.yaml @@ -4,17 +4,17 @@ stages: wdir: ../.. 
cmd: >- uv run gsma add-subgroup-to-dataset - --dataset-repo mantisnlp/gsma_prd_synthetic_qa + --dataset-repo jamesaidev001/esim_prd_synthetic_qa --working-groups data/working_groups_mapping.json - --output data/gsma_prd_synthetic_qa_with_subgroups + --output data/esim_prd_synthetic_qa_with_subgroups --log-level INFO deps: - data/working_groups_mapping.json - gsma_dataset_creation/cli.py - gsma_dataset_creation/subgroup_adder.py outs: - - data/gsma_prd_synthetic_qa_with_subgroups - desc: "Add subgroup classification to HuggingFace dataset from Hub (mantisnlp/gsma_prd_synthetic_qa) based on working groups mapping" + - data/esim_prd_synthetic_qa_with_subgroups + desc: "Add subgroup classification to HuggingFace dataset from Hub (jamesaidev001/esim_prd_synthetic_qa) based on working groups mapping" # Step 2: Upload working groups without subgroups to Argilla for annotation # NOTE: Includes automatic retry with exponential backoff for SQLite lock errors. @@ -52,26 +52,26 @@ stages: # eSIM has multiple subgroups that all share one workspace upload_annotation_esim_subgroups: foreach: + - Certification_Policy + - Compliance + - EID_Scheme - Requirements + - Security - Technical - Testing - - Compliance - - Security - - Certification_policy - - Eid_scheme - - Esa_scheme + - eSA_Scheme do: wdir: ../.. 
cmd: >- uv run gsma argilla upload-by-subgroup - --dataset-path data/gsma_prd_synthetic_qa_with_subgroups + --dataset-path data/esim_prd_synthetic_qa_with_subgroups --subgroup ${item} - --working-group eSIM + --working-group eSim --sample-size 1000 - --dataset-name-prefix gsma_annotation_esim_${item} - --logger-level INFO + --dataset-name-prefix esim_${item} + --logger-level DEBUG deps: - - data/gsma_prd_synthetic_qa_with_subgroups + - data/esim_prd_synthetic_qa_with_subgroups - gsma_dataset_creation/argilla_cli.py - gsma_dataset_creation/validation/argilla_subgroup_uploader.py desc: "Upload 1000 Q&A pairs from eSIM/${item} subgroup to Argilla for quality annotation (eSIM workspace)" diff --git a/pyproject.toml b/pyproject.toml index 53d62c5..cc52fd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "gsma-dataset-creation" -version = "0.1.0" +version = "0.1.1" description = "GSMA document processing pipeline" readme = "README.md" requires-python = ">=3.11"