Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ htmlcov/
# IDEs
.vscode/
.idea/
*.ipynb

# OS
.DS_Store
Expand All @@ -64,3 +65,6 @@ data/embedding_dataset_from_questions_all_chunks_no_unclassified.json
# Git
.specify
.github/prompts

# Logs
*.log
4 changes: 4 additions & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@
/prd_documents
/prd_dedup
/annotations
/processed-james
/processed-sgp
/raw-prd-sgp
/esim_prd_synthetic_qa_with_subgroups
6 changes: 6 additions & 0 deletions data/raw-prd-sgp.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: cac377080c3ef911d358961d6bfd19ac.dir
size: 32930661
nfiles: 29
hash: md5
path: raw-prd-sgp
4 changes: 2 additions & 2 deletions data/working_groups_mapping.json.dvc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
outs:
- md5: b70592bdeac5c03634a60d09d0a8fbc7
size: 9901
- md5: 43d32045e3db97435255cb18015a4e1f
size: 10411
hash: md5
path: working_groups_mapping.json
22 changes: 21 additions & 1 deletion gsma_dataset_creation/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,29 @@ def setup_logging(log_level: str) -> None:
logger.add(
lambda msg: typer.echo(msg, err=True),
level=log_level.upper(),
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | {message}",
format = (
"<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
"<level>{level:<8}</level> | "
"<cyan>{file}:{line}</cyan> | "
"{message}"
)
)

# File output
logger.add(
"GSMA-data-creation.log",
level=log_level.upper(),
format=(
"<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
"<level>{level:<8}</level> | "
"<cyan>{file}:{line}</cyan> | "
"{message}"
),
rotation="20 MB",
retention="30 days",
compression="zip",
enqueue=True # Safe for async/multiprocessing
)

@app.command()
def process(
Expand Down
9 changes: 5 additions & 4 deletions gsma_dataset_creation/qa_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,11 @@ def get_qa_response_schema(config: QAConfig) -> Dict[str, Any]:
"question_type": {
"type": "string",
"enum": [
"factual",
"comprehension",
"analytical",
"definition",
"multiple_choice",
"multiple_choice_explained",
"true_false",
"open_ended",
"open_ended_no_context"
],
},
},
Expand Down
4 changes: 4 additions & 0 deletions gsma_dataset_creation/question_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, cast
import traceback

from loguru import logger
from openai import (
Expand Down Expand Up @@ -148,6 +149,7 @@ async def _api_call() -> Any:
)
except APIStatusError as e:
# Check if it's a structured output compatibility issue
logger.error(traceback.format_exc())
error_msg = str(e.message) if hasattr(e, "message") else str(e)
if e.status_code == 400 and any(
keyword in error_msg.lower()
Expand All @@ -169,6 +171,7 @@ async def _api_call() -> Any:
# Re-raise other API status errors
raise
except Exception as e:
logger.error(traceback.format_exc())
# For unexpected exceptions, log details for debugging
logger.debug(
f"Unexpected exception type: {type(e).__name__}, message: {str(e)}"
Expand All @@ -194,6 +197,7 @@ async def _api_call() -> Any:
**extra_params,
)
except Exception as fallback_error:
logger.error(traceback.format_exc())
logger.error(f"Fallback also failed: {fallback_error}")
raise fallback_error
raise
Expand Down
2 changes: 2 additions & 0 deletions gsma_dataset_creation/retry_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ async def wrapper(*args: Any, **kwargs: Any) -> Any:
f"Failed {func.__name__} after {config.max_retries} retries: "
f"{type(e).__name__}: {e}"
)
import traceback
logger.error(traceback.format_exc())
raise

return wrapper
Expand Down
22 changes: 22 additions & 0 deletions gsma_dataset_creation/similarity/similarity_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using efficient approximate nearest neighbor search.
"""

import os
import json
from datetime import datetime
from pathlib import Path
Expand Down Expand Up @@ -97,6 +98,16 @@ def build_faiss_index(
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / norms

# Minimum number of points needed per centroid for effective clustering
min_points_per_centroid = 39

# Fall back to a Flat index if the dataset is too small
if n_chunks < min_points_per_centroid:
logger.debug(
f"Dataset size {n_chunks} is sufficient for IVFFlat index"
)
self.faiss_index_type = "Flat"

# Build index based on type
if self.faiss_index_type == "Flat":
# Flat index for exact search (slower but accurate)
Expand All @@ -112,6 +123,10 @@ def build_faiss_index(
if self.nlist > n_chunks // 10:
self.nlist = max(1, n_chunks // 10)

# Cap nlist so that each centroid has at least min_points_per_centroid points
if self.nlist * min_points_per_centroid > n_chunks:
self.nlist = max(1, n_chunks // min_points_per_centroid)

logger.info(f"Building IVFFlat index with nlist={self.nlist}")

# Create quantizer and index
Expand Down Expand Up @@ -181,8 +196,15 @@ def compute_similarities(
k_search = min(
self.k + 1, len(self.chunk_metadata)
) # +1 to exclude self-similarity

# In some environments, or when Faiss is built with MKL, multithreaded
# OpenMP can cause segmentation faults, crashes, or other odd failures
# during index.search(). Default OMP_NUM_THREADS to "1" to limit OpenMP to a
# single thread; a value already set in the environment is left unchanged.
os.environ.setdefault("OMP_NUM_THREADS", "1")

scores, indices = self.faiss_index.search(query_embeddings, k_search)


# Process results
all_similarities = []
similarities_computed = 0
Expand Down
19 changes: 18 additions & 1 deletion gsma_dataset_creation/similarity_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,22 @@ def setup_logging(log_level: str) -> None:
logger.add(
lambda msg: typer.echo(msg, err=True),
level=log_level.upper(),
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | {message}",
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{file}:{line}</cyan> | {message}",
)
# File output
logger.add(
"GSMA-data-creation.log",
level=log_level.upper(),
format=(
"<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
"<level>{level:<8}</level> | "
"<cyan>{file}:{line}</cyan> | "
"{message}"
),
rotation="20 MB",
retention="30 days",
compression="zip",
enqueue=True # Safe for async/multiprocessing
)


Expand Down Expand Up @@ -534,6 +549,8 @@ def ranker(
raise typer.Exit(1)
except Exception as e:
logger.error(f"❌ Unexpected error: {e}")
import traceback
logger.error(traceback.format_exc())
raise typer.Exit(1)


Expand Down
Loading
Loading