diff --git a/.gitignore b/.gitignore
index dc55edb..5ad18c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,7 @@ htmlcov/
# IDEs
.vscode/
.idea/
+*.ipynb
# OS
.DS_Store
@@ -64,3 +65,6 @@ data/embedding_dataset_from_questions_all_chunks_no_unclassified.json
# Git
.specify
.github/prompts
+
+# Logs
+*.log
\ No newline at end of file
diff --git a/data/.gitignore b/data/.gitignore
index 9466895..5918f22 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -29,3 +29,7 @@
/prd_documents
/prd_dedup
/annotations
+/processed-james
+/processed-sgp
+/raw-prd-sgp
+/esim_prd_synthetic_qa_with_subgroups
diff --git a/data/raw-prd-sgp.dvc b/data/raw-prd-sgp.dvc
new file mode 100644
index 0000000..5f3045d
--- /dev/null
+++ b/data/raw-prd-sgp.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: cac377080c3ef911d358961d6bfd19ac.dir
+ size: 32930661
+ nfiles: 29
+ hash: md5
+ path: raw-prd-sgp
diff --git a/data/working_groups_mapping.json.dvc b/data/working_groups_mapping.json.dvc
index 8a5b4d9..cd3f2f9 100644
--- a/data/working_groups_mapping.json.dvc
+++ b/data/working_groups_mapping.json.dvc
@@ -1,5 +1,5 @@
outs:
-- md5: b70592bdeac5c03634a60d09d0a8fbc7
- size: 9901
+- md5: 43d32045e3db97435255cb18015a4e1f
+ size: 10411
hash: md5
path: working_groups_mapping.json
diff --git a/gsma_dataset_creation/cli.py b/gsma_dataset_creation/cli.py
index 088937d..09110cf 100644
--- a/gsma_dataset_creation/cli.py
+++ b/gsma_dataset_creation/cli.py
@@ -61,9 +61,29 @@ def setup_logging(log_level: str) -> None:
logger.add(
lambda msg: typer.echo(msg, err=True),
level=log_level.upper(),
- format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
+ format = (
+ "{time:YYYY-MM-DD HH:mm:ss} | "
+ "{level:<8} | "
+ "{file}:{line} | "
+ "{message}"
+ )
)
+ # File output
+ logger.add(
+ "GSMA-data-creation.log",
+ level=log_level.upper(),
+ format=(
+ "{time:YYYY-MM-DD HH:mm:ss} | "
+ "{level:<8} | "
+ "{file}:{line} | "
+ "{message}"
+ ),
+ rotation="20 MB",
+ retention="30 days",
+ compression="zip",
+ enqueue=True # Safe for async/multiprocessing
+ )
@app.command()
def process(
diff --git a/gsma_dataset_creation/qa_schema.py b/gsma_dataset_creation/qa_schema.py
index 05ff3ca..202fa4f 100644
--- a/gsma_dataset_creation/qa_schema.py
+++ b/gsma_dataset_creation/qa_schema.py
@@ -37,10 +37,11 @@ def get_qa_response_schema(config: QAConfig) -> Dict[str, Any]:
"question_type": {
"type": "string",
"enum": [
- "factual",
- "comprehension",
- "analytical",
- "definition",
+ "multiple_choice",
+ "multiple_choice_explained",
+ "true_false",
+ "open_ended",
+ "open_ended_no_context"
],
},
},
diff --git a/gsma_dataset_creation/question_generator.py b/gsma_dataset_creation/question_generator.py
index 005c14d..1913248 100644
--- a/gsma_dataset_creation/question_generator.py
+++ b/gsma_dataset_creation/question_generator.py
@@ -12,6 +12,7 @@
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, cast
+import traceback
from loguru import logger
from openai import (
@@ -148,6 +149,7 @@ async def _api_call() -> Any:
)
except APIStatusError as e:
# Check if it's a structured output compatibility issue
+ logger.error(traceback.format_exc())
error_msg = str(e.message) if hasattr(e, "message") else str(e)
if e.status_code == 400 and any(
keyword in error_msg.lower()
@@ -169,6 +171,7 @@ async def _api_call() -> Any:
# Re-raise other API status errors
raise
except Exception as e:
+ logger.error(traceback.format_exc())
# For unexpected exceptions, log details for debugging
logger.debug(
f"Unexpected exception type: {type(e).__name__}, message: {str(e)}"
@@ -194,6 +197,7 @@ async def _api_call() -> Any:
**extra_params,
)
except Exception as fallback_error:
+ logger.error(traceback.format_exc())
logger.error(f"Fallback also failed: {fallback_error}")
raise fallback_error
raise
diff --git a/gsma_dataset_creation/retry_utils.py b/gsma_dataset_creation/retry_utils.py
index af75df7..9a8e498 100644
--- a/gsma_dataset_creation/retry_utils.py
+++ b/gsma_dataset_creation/retry_utils.py
@@ -129,6 +129,8 @@ async def wrapper(*args: Any, **kwargs: Any) -> Any:
f"Failed {func.__name__} after {config.max_retries} retries: "
f"{type(e).__name__}: {e}"
)
+ import traceback
+ logger.error(traceback.format_exc())
raise
return wrapper
diff --git a/gsma_dataset_creation/similarity/similarity_calculator.py b/gsma_dataset_creation/similarity/similarity_calculator.py
index 23aa3d0..5d2ec57 100644
--- a/gsma_dataset_creation/similarity/similarity_calculator.py
+++ b/gsma_dataset_creation/similarity/similarity_calculator.py
@@ -5,6 +5,7 @@
using efficient approximate nearest neighbor search.
"""
+import os
import json
from datetime import datetime
from pathlib import Path
@@ -97,6 +98,16 @@ def build_faiss_index(
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / norms
+ # Minimum number of points needed per centroid for effective clustering
+ min_points_per_centroid = 39
+
+ # Fall back to a Flat index if the dataset is too small
+ if n_chunks < min_points_per_centroid:
+ logger.debug(
+ f"Dataset size {n_chunks} is sufficient for IVFFlat index"
+ )
+ self.faiss_index_type = "Flat"
+
# Build index based on type
if self.faiss_index_type == "Flat":
# Flat index for exact search (slower but accurate)
@@ -112,6 +123,10 @@ def build_faiss_index(
if self.nlist > n_chunks // 10:
self.nlist = max(1, n_chunks // 10)
+ # Cap nlist so each centroid averages at least min_points_per_centroid points
+ if self.nlist * min_points_per_centroid > n_chunks:
+ self.nlist = max(1, n_chunks // min_points_per_centroid)
+
logger.info(f"Building IVFFlat index with nlist={self.nlist}")
# Create quantizer and index
@@ -181,8 +196,15 @@ def compute_similarities(
k_search = min(
self.k + 1, len(self.chunk_metadata)
) # +1 to exclude self-similarity
+
+ # In some environments, or when Faiss is built with MKL, multi-threaded
+ # OpenMP can cause segmentation faults, crashes or other failures during index.search().
+ # Default OpenMP to a single thread unless OMP_NUM_THREADS is already set.
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
+
scores, indices = self.faiss_index.search(query_embeddings, k_search)
+
# Process results
all_similarities = []
similarities_computed = 0
diff --git a/gsma_dataset_creation/similarity_cli.py b/gsma_dataset_creation/similarity_cli.py
index 0bc232e..3e41ee2 100644
--- a/gsma_dataset_creation/similarity_cli.py
+++ b/gsma_dataset_creation/similarity_cli.py
@@ -17,7 +17,22 @@ def setup_logging(log_level: str) -> None:
logger.add(
lambda msg: typer.echo(msg, err=True),
level=log_level.upper(),
- format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {file}:{line} | {message}",
+ )
+ # File output
+ logger.add(
+ "GSMA-data-creation.log",
+ level=log_level.upper(),
+ format=(
+ "{time:YYYY-MM-DD HH:mm:ss} | "
+ "{level:<8} | "
+ "{file}:{line} | "
+ "{message}"
+ ),
+ rotation="20 MB",
+ retention="30 days",
+ compression="zip",
+ enqueue=True # Safe for async/multiprocessing
)
@@ -534,6 +549,8 @@ def ranker(
raise typer.Exit(1)
except Exception as e:
logger.error(f"❌ Unexpected error: {e}")
+ import traceback
+ logger.error(traceback.format_exc())
raise typer.Exit(1)
diff --git a/pipelines/annotation/dvc.lock b/pipelines/annotation/dvc.lock
index 7330482..e205328 100644
--- a/pipelines/annotation/dvc.lock
+++ b/pipelines/annotation/dvc.lock
@@ -1,32 +1,33 @@
schema: '2.0'
stages:
add_subgroups:
- cmd: uv run gsma add-subgroup-to-dataset --dataset-repo mantisnlp/gsma_prd_synthetic_qa
- --working-groups data/working_groups_mapping.json --output data/gsma_prd_synthetic_qa_with_subgroups
- --log-level INFO
+ cmd: uv run gsma add-subgroup-to-dataset --dataset-repo
+ jamesaidev001/esim_prd_synthetic_qa --working-groups
+ data/working_groups_mapping.json --output
+ data/esim_prd_synthetic_qa_with_subgroups --log-level INFO
deps:
- path: data/working_groups_mapping.json
hash: md5
- md5: 2adbc799e706c013db55d1fb3338878c
- size: 10629
+ md5: 43d32045e3db97435255cb18015a4e1f
+ size: 10411
- path: gsma_dataset_creation/cli.py
hash: md5
- md5: 1bfad6c8fd2ce3455f7cdf9a1f1e0518
- size: 80990
+ md5: e1d291f8c6f59a65f0a0914c283a3968
+ size: 82670
- path: gsma_dataset_creation/subgroup_adder.py
hash: md5
md5: 6d25cfda36dd581cf975bd4453a850eb
size: 14752
outs:
- - path: data/gsma_prd_synthetic_qa_with_subgroups
+ - path: data/esim_prd_synthetic_qa_with_subgroups
hash: md5
- md5: b1d1eff79e4d53a1cdcab707c068f5fe.dir
- size: 423824806
+ md5: e4df70e8c255276e52c4e2af1347f112.dir
+ size: 21318642
nfiles: 4
upload_tsg_annotation:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_with_subgroups
- --subgroup TSG --sample-size 100 --dataset-name-prefix gsma_annotation_tsg --logger-level
- INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_with_subgroups --subgroup TSG --sample-size 100
+ --dataset-name-prefix gsma_annotation_tsg --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_with_subgroups
hash: md5
@@ -42,9 +43,9 @@ stages:
md5: 3afc719e94c38002e8a124fe99344648
size: 13916
upload_annotation@@FASG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup FASG --sample-size 1000 --dataset-name-prefix gsma_annotation_FASG
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup FASG --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_FASG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -60,9 +61,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@PQTN:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup PQTN --sample-size 1000 --dataset-name-prefix gsma_annotation_PQTN
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup PQTN --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_PQTN --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -78,9 +79,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@TSG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup TSG --sample-size 1000 --dataset-name-prefix gsma_annotation_TSG
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup TSG --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_TSG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -96,9 +97,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@SAM:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup SAM --sample-size 1000 --dataset-name-prefix gsma_annotation_SAM
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup SAM --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_SAM --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -114,9 +115,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@eSim:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_with_subgroups
- --subgroup eSim --sample-size 1000 --dataset-name-prefix gsma_annotation_eSim
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_with_subgroups --subgroup eSim --sample-size 1000
+ --dataset-name-prefix gsma_annotation_eSim --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_with_subgroups
hash: md5
@@ -132,9 +133,9 @@ stages:
md5: b0e96df363386ac9af0328636a6c3e3b
size: 20248
upload_annotation@@IG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup IG --sample-size 1000 --dataset-name-prefix gsma_annotation_IG --logger-level
- INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup IG --sample-size 1000
+ --dataset-name-prefix gsma_annotation_IG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -150,9 +151,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@SG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup SG --sample-size 1000 --dataset-name-prefix gsma_annotation_SG --logger-level
- INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup SG --sample-size 1000
+ --dataset-name-prefix gsma_annotation_SG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -168,9 +169,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@NG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup NG --sample-size 1000 --dataset-name-prefix gsma_annotation_NG --logger-level
- INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup NG --sample-size 1000
+ --dataset-name-prefix gsma_annotation_NG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -186,9 +187,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@IDS:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup IDS --sample-size 1000 --dataset-name-prefix gsma_annotation_IDS
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup IDS --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_IDS --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -204,9 +205,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@RCS:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup RCS --sample-size 1000 --dataset-name-prefix gsma_annotation_RCS
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup RCS --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_RCS --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -222,9 +223,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@OPG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup OPG --sample-size 1000 --dataset-name-prefix gsma_annotation_OPG
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup OPG --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_OPG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -240,9 +241,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@WAS:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup WAS --sample-size 1000 --dataset-name-prefix gsma_annotation_WAS
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup WAS --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_WAS --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -258,9 +259,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@SAS:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup SAS --sample-size 1000 --dataset-name-prefix gsma_annotation_SAS
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup SAS --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_SAS --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -276,9 +277,10 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@CERTIFICATION_POLICY:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup CERTIFICATION_POLICY --sample-size 1000 --dataset-name-prefix gsma_annotation_CERTIFICATION_POLICY
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup CERTIFICATION_POLICY
+ --sample-size 1000 --dataset-name-prefix
+ gsma_annotation_CERTIFICATION_POLICY --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -294,9 +296,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@EID:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup EID --sample-size 1000 --dataset-name-prefix gsma_annotation_EID
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup EID --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_EID --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -312,8 +314,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@COMPLIANCE:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup COMPLIANCE --sample-size 1000 --dataset-name-prefix gsma_annotation_COMPLIANCE
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup COMPLIANCE
+ --sample-size 1000 --dataset-name-prefix gsma_annotation_COMPLIANCE
--logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -330,8 +333,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@REQUIREMENTS:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup REQUIREMENTS --sample-size 1000 --dataset-name-prefix gsma_annotation_REQUIREMENTS
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup REQUIREMENTS
+ --sample-size 1000 --dataset-name-prefix gsma_annotation_REQUIREMENTS
--logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -348,9 +352,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@ESA:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup ESA --sample-size 1000 --dataset-name-prefix gsma_annotation_ESA
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup ESA --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_ESA --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -366,8 +370,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@TECHNICAL:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup TECHNICAL --sample-size 1000 --dataset-name-prefix gsma_annotation_TECHNICAL
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup TECHNICAL
+ --sample-size 1000 --dataset-name-prefix gsma_annotation_TECHNICAL
--logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -384,9 +389,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation@@TESTING:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup TESTING --sample-size 1000 --dataset-name-prefix gsma_annotation_TESTING
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup TESTING --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_TESTING --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -402,9 +407,9 @@ stages:
md5: f116942d041709137200bf5bd1b4fec8
size: 18809
upload_annotation_working_groups@@IG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup IG --sample-size 1000 --dataset-name-prefix gsma_annotation_IG --logger-level
- INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup IG --sample-size 1000
+ --dataset-name-prefix gsma_annotation_IG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -420,8 +425,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@@Oig_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Oig_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Oig_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Oig_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -438,9 +444,10 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@@Other:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Other --working-group NG --sample-size 1000 --dataset-name-prefix
- gsma_annotation_ng_Other --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Other --working-group
+ NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Other
+ --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -456,9 +463,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_working_groups@@SG:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup SG --sample-size 1000 --dataset-name-prefix gsma_annotation_SG --logger-level
- INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup SG --sample-size 1000
+ --dataset-name-prefix gsma_annotation_SG --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -474,8 +481,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@@Security:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Security --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Security
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Security --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -492,9 +500,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_working_groups@@SAS:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup SAS --sample-size 1000 --dataset-name-prefix gsma_annotation_SAS
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup SAS --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_SAS --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -510,8 +518,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@@Esa_scheme:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Esa_scheme --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Esa_scheme
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Esa_scheme --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -528,8 +537,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@@Certification_policy:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Certification_policy --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Certification_policy
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Certification_policy --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -546,8 +556,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@@Upg_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Upg_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Upg_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Upg_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -564,8 +575,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@@Requirements:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Requirements --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Requirements
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Requirements --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -582,8 +594,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@@Testing:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Testing --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Testing
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Testing --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -600,8 +613,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@@Mc_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Mc_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Mc_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Mc_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -618,8 +632,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@@Tg:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Tg --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Tg --working-group NG
+ --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg
--logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -636,8 +651,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@@Compliance:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Compliance --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Compliance
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Compliance --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -654,8 +670,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@@Nrg_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Nrg_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Nrg_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Nrg_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -672,9 +689,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_working_groups@@PQTN:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup PQTN --sample-size 1000 --dataset-name-prefix gsma_annotation_PQTN
- --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup PQTN --sample-size
+ 1000 --dataset-name-prefix gsma_annotation_PQTN --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -690,26 +707,28 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@Compliance:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Compliance --working-group eSIM --sample-size 1000 --dataset-name-prefix
- gsma_annotation_esim_Compliance --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Compliance
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Compliance --logger-level DEBUG
deps:
- - path: data/gsma_prd_synthetic_qa_with_subgroups
+ - path: data/esim_prd_synthetic_qa_with_subgroups
hash: md5
- md5: a683e4a7744b9c6fbf18c5b4eceba3dc.dir
- size: 423827710
- nfiles: 5
+ md5: 90bab7cea26f10c19bec4eeb4e7cd8ca.dir
+ size: 21358490
+ nfiles: 9
- path: gsma_dataset_creation/argilla_cli.py
hash: md5
- md5: 77a9aaa9df4253afa88578dc676b4e27
- size: 17579
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
- path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
hash: md5
- md5: ca99922bcb3c06c428edc22d8e8ec7ad
- size: 20868
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
upload_annotation_esim_subgroups@Esa_scheme:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Esa_scheme --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Esa_scheme
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Esa_scheme --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -726,44 +745,47 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@Technical:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Technical --working-group eSIM --sample-size 1000 --dataset-name-prefix
- gsma_annotation_esim_Technical --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Technical
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Technical --logger-level DEBUG
deps:
- - path: data/gsma_prd_synthetic_qa_with_subgroups
+ - path: data/esim_prd_synthetic_qa_with_subgroups
hash: md5
- md5: 021a45de31d599951e8c55d21411cdd9.dir
- size: 423876334
+ md5: b0d1d9b50e059bbdedcf50dc972a095e.dir
+ size: 21350978
nfiles: 7
- path: gsma_dataset_creation/argilla_cli.py
hash: md5
- md5: 77a9aaa9df4253afa88578dc676b4e27
- size: 17579
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
- path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
hash: md5
- md5: ca99922bcb3c06c428edc22d8e8ec7ad
- size: 20868
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
upload_annotation_esim_subgroups@Testing:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Testing --working-group eSIM --sample-size 1000 --dataset-name-prefix
- gsma_annotation_esim_Testing --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Testing
+ --working-group eSim --sample-size 1000 --dataset-name-prefix esim_Testing
+ --logger-level DEBUG
deps:
- - path: data/gsma_prd_synthetic_qa_with_subgroups
+ - path: data/esim_prd_synthetic_qa_with_subgroups
hash: md5
- md5: 571a5ce098aad2fc70df866491e3aa38.dir
- size: 423892630
+ md5: bda3d5917c494486079b70900772e88b.dir
+ size: 21357146
nfiles: 8
- path: gsma_dataset_creation/argilla_cli.py
hash: md5
- md5: 77a9aaa9df4253afa88578dc676b4e27
- size: 17579
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
- path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
hash: md5
- md5: ca99922bcb3c06c428edc22d8e8ec7ad
- size: 20868
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
upload_annotation_esim_subgroups@Eid_scheme:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Eid_scheme --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Eid_scheme
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Eid_scheme --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -780,26 +802,28 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@Requirements:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Requirements --working-group eSIM --sample-size 1000 --dataset-name-prefix
- gsma_annotation_esim_Requirements --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Requirements
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Requirements --logger-level DEBUG
deps:
- - path: data/gsma_prd_synthetic_qa_with_subgroups
+ - path: data/esim_prd_synthetic_qa_with_subgroups
hash: md5
- md5: 15cf4b5c8b536bc22d68a5632f036423.dir
- size: 423909206
- nfiles: 10
+ md5: 5753034ec4f32916361da8666a21e06b.dir
+ size: 21326074
+ nfiles: 6
- path: gsma_dataset_creation/argilla_cli.py
hash: md5
- md5: 77a9aaa9df4253afa88578dc676b4e27
- size: 17579
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
- path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
hash: md5
- md5: ca99922bcb3c06c428edc22d8e8ec7ad
- size: 20868
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
upload_annotation_esim_subgroups@Certification_policy:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Certification_policy --working-group eSIM --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Certification_policy
+ --working-group eSIM --sample-size 1000 --dataset-name-prefix
gsma_annotation_esim_Certification_policy --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -816,27 +840,29 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_esim_subgroups@Security:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Security --working-group eSIM --sample-size 1000 --dataset-name-prefix
- gsma_annotation_esim_Security --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Security
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Security --logger-level DEBUG
deps:
- - path: data/gsma_prd_synthetic_qa_with_subgroups
+ - path: data/esim_prd_synthetic_qa_with_subgroups
hash: md5
- md5: 1ca6a5b2083300b20e253a9050aaedcc.dir
- size: 423920998
- nfiles: 12
+ md5: 19c5a9baedceb8e4debde5b306d03487.dir
+ size: 21324010
+ nfiles: 5
- path: gsma_dataset_creation/argilla_cli.py
hash: md5
- md5: 77a9aaa9df4253afa88578dc676b4e27
- size: 17579
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
- path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
hash: md5
- md5: ca99922bcb3c06c428edc22d8e8ec7ad
- size: 20868
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
upload_annotation_ng_subgroups@Isag:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Isag --working-group NG --sample-size 1000 --dataset-name-prefix
- gsma_annotation_ng_Isag --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Isag --working-group
+ NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Isag
+ --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -852,9 +878,10 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Other:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Other --working-group NG --sample-size 1000 --dataset-name-prefix
- gsma_annotation_ng_Other --logger-level INFO
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Other --working-group
+ NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Other
+ --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
hash: md5
@@ -870,8 +897,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Tg:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Tg --working-group NG --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Tg --working-group NG
+ --sample-size 1000 --dataset-name-prefix gsma_annotation_ng_Tg
--logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -888,8 +916,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Oig_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Oig_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Oig_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Oig_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -906,8 +935,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Nrg_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Nrg_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Nrg_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Nrg_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -924,8 +954,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Upg_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Upg_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Upg_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Upg_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -942,8 +973,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Mc_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Mc_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Mc_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Mc_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -960,8 +992,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Ursp_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Ursp_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Ursp_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Ursp_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -978,8 +1011,9 @@ stages:
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
upload_annotation_ng_subgroups@Fnr_isig:
- cmd: uv run gsma argilla upload-by-subgroup --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
- --subgroup Fnr_isig --working-group NG --sample-size 1000 --dataset-name-prefix
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/gsma_prd_synthetic_qa_with_subgroups --subgroup Fnr_isig
+ --working-group NG --sample-size 1000 --dataset-name-prefix
gsma_annotation_ng_Fnr_isig --logger-level INFO
deps:
- path: data/gsma_prd_synthetic_qa_with_subgroups
@@ -995,3 +1029,98 @@ stages:
hash: md5
md5: ca99922bcb3c06c428edc22d8e8ec7ad
size: 20868
+ upload_annotation_esim_subgroups@Certification:
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Certification
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Certification --logger-level DEBUG
+ deps:
+ - path: data/esim_prd_synthetic_qa_with_subgroups
+ hash: md5
+ md5: 0b6db7608974cd2d0444a389e326bd97.dir
+ size: 21361810
+ nfiles: 12
+ - path: gsma_dataset_creation/argilla_cli.py
+ hash: md5
+ md5: 96da29c6bf660ee8add034e08fec92ab
+ size: 58419
+ - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
+ hash: md5
+ md5: 71199696b1c81567d1da0a9da07391b1
+ size: 22119
+ upload_annotation_esim_subgroups@EID_Scheme:
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup EID_Scheme
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_EID_Scheme --logger-level DEBUG
+ deps:
+ - path: data/esim_prd_synthetic_qa_with_subgroups
+ hash: md5
+ md5: 9888acfb189bda92e3a61dd70ca20af6.dir
+ size: 21358794
+ nfiles: 10
+ - path: gsma_dataset_creation/argilla_cli.py
+ hash: md5
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
+ - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
+ hash: md5
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
+ upload_annotation_esim_subgroups@eSA_Scheme:
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup eSA_Scheme
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_eSA_Scheme --logger-level DEBUG
+ deps:
+ - path: data/esim_prd_synthetic_qa_with_subgroups
+ hash: md5
+ md5: c67e63493fe27a7cf2d7e59cf28b00bf.dir
+ size: 21361506
+ nfiles: 11
+ - path: gsma_dataset_creation/argilla_cli.py
+ hash: md5
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
+ - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
+ hash: md5
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
+ upload_annotation_esim_subgroups@Compliance_Policy:
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Compliance_Policy
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Compliance_Policy --logger-level DEBUG
+ deps:
+ - path: data/esim_prd_synthetic_qa_with_subgroups
+ hash: md5
+ md5: 1f1f681e66dc2e549cb97eed3525ef36.dir
+ size: 21318946
+ nfiles: 5
+ - path: gsma_dataset_creation/argilla_cli.py
+ hash: md5
+ md5: 96da29c6bf660ee8add034e08fec92ab
+ size: 58419
+ - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
+ hash: md5
+ md5: 71199696b1c81567d1da0a9da07391b1
+ size: 22119
+ upload_annotation_esim_subgroups@Certification_Policy:
+ cmd: uv run gsma argilla upload-by-subgroup --dataset-path
+ data/esim_prd_synthetic_qa_with_subgroups --subgroup Certification_Policy
+ --working-group eSim --sample-size 1000 --dataset-name-prefix
+ esim_Certification_Policy --logger-level DEBUG
+ deps:
+ - path: data/esim_prd_synthetic_qa_with_subgroups
+ hash: md5
+ md5: 20ebbdfeeb6d07d58d505189b73b43bf.dir
+ size: 21363226
+ nfiles: 12
+ - path: gsma_dataset_creation/argilla_cli.py
+ hash: md5
+ md5: cf30c259ec2abc16adad41b33830495b
+ size: 57170
+ - path: gsma_dataset_creation/validation/argilla_subgroup_uploader.py
+ hash: md5
+ md5: 630c3037409fc170eeb6ba05357087aa
+ size: 22164
diff --git a/pipelines/annotation/dvc.yaml b/pipelines/annotation/dvc.yaml
index 17cdddf..b780802 100644
--- a/pipelines/annotation/dvc.yaml
+++ b/pipelines/annotation/dvc.yaml
@@ -4,17 +4,17 @@ stages:
wdir: ../..
cmd: >-
uv run gsma add-subgroup-to-dataset
- --dataset-repo mantisnlp/gsma_prd_synthetic_qa
+ --dataset-repo jamesaidev001/esim_prd_synthetic_qa
--working-groups data/working_groups_mapping.json
- --output data/gsma_prd_synthetic_qa_with_subgroups
+ --output data/esim_prd_synthetic_qa_with_subgroups
--log-level INFO
deps:
- data/working_groups_mapping.json
- gsma_dataset_creation/cli.py
- gsma_dataset_creation/subgroup_adder.py
outs:
- - data/gsma_prd_synthetic_qa_with_subgroups
- desc: "Add subgroup classification to HuggingFace dataset from Hub (mantisnlp/gsma_prd_synthetic_qa) based on working groups mapping"
+ - data/esim_prd_synthetic_qa_with_subgroups
+ desc: "Add subgroup classification to HuggingFace dataset from Hub (jamesaidev001/esim_prd_synthetic_qa) based on working groups mapping"
# Step 2: Upload working groups without subgroups to Argilla for annotation
# NOTE: Includes automatic retry with exponential backoff for SQLite lock errors.
@@ -52,26 +52,26 @@ stages:
# eSIM has multiple subgroups that all share one workspace
upload_annotation_esim_subgroups:
foreach:
+ - Certification_Policy
+ - Compliance
+ - EID_Scheme
- Requirements
+ - Security
- Technical
- Testing
- - Compliance
- - Security
- - Certification_policy
- - Eid_scheme
- - Esa_scheme
+ - eSA_Scheme
do:
wdir: ../..
cmd: >-
uv run gsma argilla upload-by-subgroup
- --dataset-path data/gsma_prd_synthetic_qa_with_subgroups
+ --dataset-path data/esim_prd_synthetic_qa_with_subgroups
--subgroup ${item}
- --working-group eSIM
+ --working-group eSim
--sample-size 1000
- --dataset-name-prefix gsma_annotation_esim_${item}
- --logger-level INFO
+ --dataset-name-prefix esim_${item}
+ --logger-level DEBUG
deps:
- - data/gsma_prd_synthetic_qa_with_subgroups
+ - data/esim_prd_synthetic_qa_with_subgroups
- gsma_dataset_creation/argilla_cli.py
- gsma_dataset_creation/validation/argilla_subgroup_uploader.py
desc: "Upload 1000 Q&A pairs from eSIM/${item} subgroup to Argilla for quality annotation (eSIM workspace)"
diff --git a/pyproject.toml b/pyproject.toml
index 53d62c5..cc52fd7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "gsma-dataset-creation"
-version = "0.1.0"
+version = "0.1.1"
description = "GSMA document processing pipeline"
readme = "README.md"
requires-python = ">=3.11"