From c1bb21501bff96c3983e8d7283d4efd5b91c92e0 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 19 Jan 2026 17:20:57 +0100 Subject: [PATCH 1/4] Normalize version fields to strings in import scripts Add normalize_version_fields function to convert version fields (which can be int, float, or str) to string type for consistency. Integrate version normalization into all import scripts: - bioconda: normalize package.version - bioconductor: normalize Version - biotools: normalize version and nested version fields - galaxytool: normalize Suite_version, conda package version, and workflow versions --- bioconda-import/bioconda_importer.py | 5 +++ bioconductor-import/import.py | 5 +++ biotools-import/import.py | 7 ++++ common/metadata.py | 51 ++++++++++++++++++++++++++ galaxytool-import/galaxytool-import.py | 12 ++++++ 5 files changed, 80 insertions(+) create mode 100644 common/metadata.py diff --git a/bioconda-import/bioconda_importer.py b/bioconda-import/bioconda_importer.py index 0c3b132..2c38c3f 100644 --- a/bioconda-import/bioconda_importer.py +++ b/bioconda-import/bioconda_importer.py @@ -1,11 +1,15 @@ #!/usr/bin/env python import os +import sys import yaml import argparse from pathlib import Path import jinja2 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + def clean(content_path): import_directory = os.path.join(content_path, "imports", "bioconda") os.makedirs(import_directory, exist_ok=True) @@ -56,6 +60,7 @@ def merge(conda, content_path): biotools_data_path = os.path.join(content_path, 'data') for name, data in conda.items(): try: + data = normalize_version_fields(data, ["package.version"]) package_name = data['package']['name'] import_file_path = os.path.join(bioconda_import_path, f"bioconda_{package_name}.yaml") with open(import_file_path, "w") as out: diff --git a/bioconductor-import/import.py b/bioconductor-import/import.py index 851549d..e9c7ba7 100644 --- a/bioconductor-import/import.py +++ b/bioconductor-import/import.py @@ -2,10 +2,14 @@ import glob import json import os +import sys import requests import logging import yaml +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger() @@ -86,6 +90,7 @@ def retrieve(version, filters=None): path = os.path.join("imports", "bioconductor", f"{package_name}.bioconductor.json") try: + pack = normalize_version_fields(pack, ["Version"]) with open(path, "w") as write_file: json.dump(pack, write_file, sort_keys=True, indent=4, separators=(",", ": ")) logger.info(f"Saved {idx}/{total_packs} - {package_name}") diff --git a/biotools-import/import.py b/biotools-import/import.py index edf3d4f..b282073 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -7,6 +7,9 @@ import requests from boltons.iterutils import remap +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + BIOTOOLS_DOMAIN = "https://bio.tools" SSL_VERIFY = True @@ -49,6 +52,10 @@ def retrieve(filters=None): with open(os.path.join(directory, tpe_id + ".biotools.json"), "w") as write_file: drop_false = lambda path, key, value: bool(value) tool_cleaned = remap(tool, visit=drop_false) + tool_cleaned = normalize_version_fields( + tool_cleaned, ["version", "version[].version"] + ) + json.dump( tool_cleaned, write_file, sort_keys=True, indent=4, separators=(",", ": ") ) diff --git a/common/metadata.py b/common/metadata.py new file mode 100644 index 0000000..ea7b151 --- /dev/null +++ b/common/metadata.py @@ -0,0 +1,51 @@ +def normalize_version_to_string(value): + if value is None or isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, list): + return [normalize_version_to_string(v) for v in value] + if isinstance(value, dict): + return {k: normalize_version_to_string(v) for k, v in value.items()} + return value + + +def normalize_version_fields(data, field_paths): + if not isinstance(data, dict): + raise TypeError(f"Expected dict, got {type(data).__name__}") + + for field_path in field_paths: + try: + if "[" in field_path: + if "[]." not in field_path: + list_key = field_path.rstrip("[]") + if list_key in data and isinstance(data[list_key], list): + data[list_key] = normalize_version_to_string(data[list_key]) + else: + list_key, item_path = field_path.split("[].", 1) + if list_key in data and isinstance(data[list_key], list): + for item in data[list_key]: + if isinstance(item, dict) and item_path in item: + item[item_path] = normalize_version_to_string( + item[item_path] + ) + elif "." in field_path: + keys = field_path.split(".") + current = data + for key in keys[:-1]: + if not isinstance(current, dict) or key not in current: + break + current = current[key] + else: + final_key = keys[-1] + if isinstance(current, dict) and final_key in current: + current[final_key] = normalize_version_to_string( + current[final_key] + ) + else: + if field_path in data: + data[field_path] = normalize_version_to_string(data[field_path]) + except Exception: + continue + + return data diff --git a/galaxytool-import/galaxytool-import.py b/galaxytool-import/galaxytool-import.py index 6ebc3b2..df92bdf 100644 --- a/galaxytool-import/galaxytool-import.py +++ b/galaxytool-import/galaxytool-import.py @@ -1,10 +1,14 @@ import glob import json import os +import sys import requests from boltons.iterutils import remap +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + GALAXY_ALL_TOOLS_METADATA = "https://raw.githubusercontent.com/galaxyproject/galaxy_codex/refs/heads/main/communities/all/resources/tools.json" GALAXY_ALL_WORKFLOWS_METADATA = "https://raw.githubusercontent.com/galaxyproject/galaxy_codex/refs/heads/main/communities/all/resources/workflows.json" @@ -73,6 +77,14 @@ def retrieve(): # store tool json in galaxy import folder galaxy_tool_id = galaxy_tool_id.lower() tool_cleaned = {k.replace(" ", "_"): v for k, v in tool.items()} + tool_cleaned = normalize_version_fields( + tool_cleaned, + [ + "Suite_version", + "Latest_suite_conda_package_version", + "Related_Workflows[].latest_version", + ], + ) save_path = os.path.join(galaxy_directory, f"{galaxy_tool_id}.galaxy.json") with open(save_path, "w") as write_file: json.dump( From 386ca09c6a6ee310f73015ea39f4169f6b78a752 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 2 Feb 2026 13:43:16 +0100 Subject: [PATCH 2/4] Refactor version normalization to target only the main version field --- biotools-import/import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biotools-import/import.py b/biotools-import/import.py index b282073..0d2d2d9 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -53,7 +53,7 @@ def retrieve(filters=None): drop_false = lambda path, key, value: bool(value) tool_cleaned = remap(tool, visit=drop_false) tool_cleaned = normalize_version_fields( - tool_cleaned, ["version", "version[].version"] + tool_cleaned, ["version"] ) json.dump( From a1825cd3b6f5050a91a75dda487d620ec34b56be Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 2 Feb 2026 13:43:23 +0100 Subject: [PATCH 3/4] Enhance version normalization functions with detailed docstrings and improved error handling --- common/metadata.py | 58 ++++++++++++++++++++++++++++- research-software-ecosystem-content | 1 + 2 files changed, 57 insertions(+), 2 deletions(-) create mode 160000 research-software-ecosystem-content diff --git a/common/metadata.py b/common/metadata.py index ea7b151..6d92642 100644 --- a/common/metadata.py +++ b/common/metadata.py @@ -1,4 +1,32 @@ +import logging + + def normalize_version_to_string(value): + """ + Recursively convert version values to strings. + + This function processes version data by converting numeric types to strings + while preserving None and boolean values. It recursively processes nested + structures (lists and dicts). + + Args: + value: The value to normalize. Can be any type. + + Returns: + - None and bool values are returned unchanged + - int and float values are converted to strings + - Lists are processed recursively, returning a new list with normalized values + - Dicts are processed recursively, returning a new dict with normalized values + - Other types are returned unchanged + + Examples: + >>> normalize_version_to_string(1) + '1' + >>> normalize_version_to_string([1, 2, 3]) + ['1', '2', '3'] + >>> normalize_version_to_string({'version': 1.5}) + {'version': '1.5'} + """ if value is None or isinstance(value, bool): return value if isinstance(value, (int, float)): @@ -11,6 +39,31 @@ def normalize_version_to_string(value): def normalize_version_fields(data, field_paths): + """ + Normalize version fields to strings in a data dictionary. + + This function takes a dictionary and a collection of field paths, then normalizes + the version values at those paths to strings using normalize_version_to_string. + + Args: + data (dict): The dictionary to process. + field_paths (iterable): An iterable of field path strings. Supports: + - Simple fields: "version" + - Nested fields: "tool.version" + - List fields: "versions[]" + - List item nested fields: "versions[].version" + + Returns: + dict: The modified data dictionary with normalized version fields. + + Raises: + TypeError: If data is not a dictionary. + + Examples: + >>> data = {"version": 1, "versions": [{"version": 2}]} + >>> normalize_version_fields(data, ["version", "versions[].version"]) + {'version': '1', 'versions': [{'version': '2'}]} + """ if not isinstance(data, dict): raise TypeError(f"Expected dict, got {type(data).__name__}") @@ -18,7 +71,7 @@ def normalize_version_fields(data, field_paths): try: if "[" in field_path: if "[]." not in field_path: - list_key = field_path.rstrip("[]") + list_key = field_path[:-2] if field_path.endswith("[]") else field_path if list_key in data and isinstance(data[list_key], list): data[list_key] = normalize_version_to_string(data[list_key]) else: @@ -45,7 +98,8 @@ def normalize_version_fields(data, field_paths): else: if field_path in data: data[field_path] = normalize_version_to_string(data[field_path]) - except Exception: + except (KeyError, TypeError, IndexError, AttributeError) as e: + logging.debug(f"Skipping field path '{field_path}': {e}") continue return data diff --git a/research-software-ecosystem-content b/research-software-ecosystem-content new file mode 160000 index 0000000..f1355a7 --- /dev/null +++ b/research-software-ecosystem-content @@ -0,0 +1 @@ +Subproject commit f1355a7c14310cbf3eeed9661335fff8f6a57d91 From 14e7448206442cc863fb40369848fc54367c2ce4 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 2 Feb 2026 13:44:29 +0100 Subject: [PATCH 4/4] Remove obsolete subproject content directory --- research-software-ecosystem-content | 1 - 1 file changed, 1 deletion(-) delete mode 160000 research-software-ecosystem-content diff --git a/research-software-ecosystem-content b/research-software-ecosystem-content deleted file mode 160000 index f1355a7..0000000 --- a/research-software-ecosystem-content +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f1355a7c14310cbf3eeed9661335fff8f6a57d91